In [2]:
import pandas as pd
df = pd.read_csv("Maternal Health Risk Data Set.csv")
print(df.head())


   Age  SystolicBP  DiastolicBP    BS  BodyTemp  HeartRate  RiskLevel
0   25         130           80  15.0      98.0         86  high risk
1   35         140           90  13.0      98.0         70  high risk
2   29          90           70   8.0     100.0         80  high risk
3   30         140           85   7.0      98.0         70  high risk
4   35         120           60   6.1      98.0         76   low risk


In [None]:


import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Load dataset (path for Kaggle environment)
df = pd.read_csv("E:/fortransferee/mlproject6-p/Maternal Health Risk Data Set.csv")
# Encode target (RiskLevel)
label_encoder = LabelEncoder()
df['RiskLevel'] = label_encoder.fit_transform(df['RiskLevel'])

# Features & target
X = df.drop(columns=['RiskLevel'])
y = df['RiskLevel']

# Detect categorical & numerical columns
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include='number').columns.tolist()

# Preprocessing
numeric_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, num_cols),
    ('cat', categorical_pipeline, cat_cols)
])

# ==========================
# 🔹 Candidate Models (with Tuned Versions)
# ==========================
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Neural Network": MLPClassifier(max_iter=500),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0)
}

# Hyperparameter grids for tuning
rf_param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5, 10]
}

xgb_param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [3, 5, 7]
}

# ==========================
# 🔹 Train/Test Split
# ==========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ==========================
# 🔹 Evaluate Models with or without Tuning
# ==========================
best_model = None
best_acc = 0

print("Evaluating Models...\n")

# Evaluate standard models
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('classifier', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")
    if acc > best_acc:
        best_acc = acc
        best_model = name

# Evaluate tuned models
print("\nPerforming Hyperparameter Tuning...")

# Tuned Random Forest
rf_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])
rf_grid = GridSearchCV(rf_pipeline, rf_param_grid, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1)
rf_grid.fit(X_train, y_train)
rf_pred = rf_grid.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)
print(f"Tuned Random Forest Accuracy: {rf_acc:.4f}")
print("Best Params:", rf_grid.best_params_)
if rf_acc > best_acc:
    best_acc = rf_acc
    best_model = "Tuned Random Forest"

# Tuned XGBoost
xgb_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42))
])
xgb_grid = GridSearchCV(xgb_pipeline, xgb_param_grid, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1)
xgb_grid.fit(X_train, y_train)
xgb_pred = xgb_grid.predict(X_test)
xgb_acc = accuracy_score(y_test, xgb_pred)
print(f"Tuned XGBoost Accuracy: {xgb_acc:.4f}")
print("Best Params:", xgb_grid.best_params_)
if xgb_acc > best_acc:
    best_acc = xgb_acc
    best_model = "Tuned XGBoost"

print("\n✅ Overall Best Model:", best_model, "with Accuracy:", best_acc)

# Show label encoding mapping for clarity
print("\nLabel Encoding of RiskLevel:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

Evaluating Models...

Logistic Regression Accuracy: 0.6207
Decision Tree Accuracy: 0.8177
Gradient Boosting Accuracy: 0.7685
Support Vector Machine Accuracy: 0.7241
K-Nearest Neighbors Accuracy: 0.6700



Stochastic Optimizer: Maximum iterations (500) reached and the optimization hasn't converged yet.


X does not have valid feature names, but LGBMClassifier was fitted with feature names



Neural Network Accuracy: 0.6847
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 121
[LightGBM] [Info] Number of data points in the train set: 811, number of used features: 6
[LightGBM] [Info] Start training from score -1.318371
[LightGBM] [Info] Start training from score -0.914443
[LightGBM] [Info] Start training from score -1.103557
LightGBM Accuracy: 0.8424
CatBoost Accuracy: 0.8571

Performing Hyperparameter Tuning...
Tuned Random Forest Accuracy: 0.8571
Best Params: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 300}



Parameters: { "use_label_encoder" } are not used.




Tuned XGBoost Accuracy: 0.8571
Best Params: {'classifier__learning_rate': 0.2, 'classifier__max_depth': 7, 'classifier__n_estimators': 300}

✅ Overall Best Model: CatBoost with Accuracy: 0.8571428571428571

Label Encoding of RiskLevel: {'high risk': np.int64(0), 'low risk': np.int64(1), 'mid risk': np.int64(2)}


In [None]:

import pickle

# Build pipeline with preprocessing + fitted CatBoost model
catboost_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', models["CatBoost"])
])

# Fit pipeline on training data
catboost_pipeline.fit(X_train, y_train)

# Save as pickle file
pickle_filename = "maternal_health_catboost_best.pkl"
with open(pickle_filename, "wb") as f:
    pickle.dump(catboost_pipeline, f)

print(f"✅ CatBoost model saved as {pickle_filename}")


✅ CatBoost model saved as maternal_health_catboost_best.pkl


In [None]:
import pandas as pd

try:
    df = pd.read_csv("Maternal Health Risk Data Set.csv")
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Error: 'Maternal Health Risk Data Set.csv' not found. Please ensure the file is in the same directory.")
    exit()

high_risk_samples = df[df['RiskLevel'] == 'mid risk']

# Get the count of these samples
number_of_high_risk_samples = len(high_risk_samples)

print(f"\nNumber of 'high risk' samples in the dataset: {number_of_high_risk_samples}")


Dataset loaded successfully!

Number of 'high risk' samples in the dataset: 336


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import pickle


# Load dataset
df = pd.read_csv("Maternal Health Risk Data Set.csv")
# Encode target (RiskLevel)
label_encoder = LabelEncoder()
df['RiskLevel'] = label_encoder.fit_transform(df['RiskLevel'])

# Features & target
X = df.drop(columns=['RiskLevel'])
y = df['RiskLevel']

# Detect categorical & numerical columns
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include='number').columns.tolist()

# Preprocessing pipelines
numeric_pipeline = Pipeline([
    ('scaler', StandardScaler())
])
categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, num_cols),
    ('cat', categorical_pipeline, cat_cols)
])


models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Neural Network": MLPClassifier(max_iter=500),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0)
}

# Hyperparameter grids for tuning
rf_param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5, 10]
}
xgb_param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [3, 5, 7]
}

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

best_model_name = None
best_acc = 0
best_model_object = None

print("Evaluating Models...\n")

# Evaluate standard models
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('classifier', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")
    if acc > best_acc:
        best_acc = acc
        best_model_name = name
        best_model_object = pipeline

# Evaluate tuned models
print("\nPerforming Hyperparameter Tuning...")

# Tuned Random Forest
rf_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])
rf_grid = GridSearchCV(rf_pipeline, rf_param_grid, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1)
rf_grid.fit(X_train, y_train)
rf_pred = rf_grid.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)
print(f"Tuned Random Forest Accuracy: {rf_acc:.4f}")
if rf_acc > best_acc:
    best_acc = rf_acc
    best_model_name = "Tuned Random Forest"
    best_model_object = rf_grid

# Tuned XGBoost
xgb_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42))
])
xgb_grid = GridSearchCV(xgb_pipeline, xgb_param_grid, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1)
xgb_grid.fit(X_train, y_train)
xgb_pred = xgb_grid.predict(X_test)
xgb_acc = accuracy_score(y_test, xgb_pred)
print(f"Tuned XGBoost Accuracy: {xgb_acc:.4f}")
if xgb_acc > best_acc:
    best_acc = xgb_acc
    best_model_name = "Tuned XGBoost"
    best_model_object = xgb_grid

print(f"\nOverall Best Model: {best_model_name} with Accuracy: {best_acc:.4f}")


if best_model_object:
    pickle_filename = "best_model.pkl"
    with open(pickle_filename, 'wb') as file:
        pickle.dump(best_model_object, file)
    print(f" Best model saved to {pickle_filename}")
else:
    print(" No best model found to save.")


print("\n--- Making Predictions with the Best Model ---")

# Define manual input data examples
data_examples = [
    {
        "name": "Low Risk Example",
        'Age': [25.0],
        'SystolicBP': [120.0],
        'DiastolicBP': [80.0],
        'BS': [6.0],
        'BodyTemp': [98.6],
        'HeartRate': [80.0]
    },
    {
        "name": "Mid Risk Example",
        'Age': [35.0],
        'SystolicBP': [130.0],
        'DiastolicBP': [85.0],
        'BS': [8.0],
        'BodyTemp': [99.5],
        'HeartRate': [95.0]
    },
    {
        "name": "High Risk Example",
        'Age': [40.0],
        'SystolicBP': [145.0],
        'DiastolicBP': [95.0],
        'BS': [12.0],
        'BodyTemp': [100.2],
        'HeartRate': [110.0]
    }
]

# Map the numerical prediction back to the risk level string
risk_map = {0: 'high risk', 1: 'low risk', 2: 'mid risk'}

# Predict for each example using the best model
for example in data_examples:
    print(f"\n--- Prediction for: {example['name']} ---")
    
    # Create a DataFrame from the manual input
    input_df = pd.DataFrame(example)

    # Use the best model pipeline to make a prediction
    prediction_array = best_model_object.predict(input_df)
    
    # Extract the single integer value from the prediction array
    prediction = int(prediction_array[0])

    # Interpret and print the prediction
    predicted_risk = risk_map.get(prediction, "Unknown")
    print(f"Input Data:\n{input_df.to_string(index=False)}")
    print(f"Predicted Risk Level: {predicted_risk}")


Evaluating Models...

Logistic Regression Accuracy: 0.6207
Decision Tree Accuracy: 0.8424
Gradient Boosting Accuracy: 0.7685
Support Vector Machine Accuracy: 0.7241
K-Nearest Neighbors Accuracy: 0.6700
Neural Network Accuracy: 0.7044
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000460 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 121
[LightGBM] [Info] Number of data points in the train set: 811, number of used features: 6
[LightGBM] [Info] Start training from score -1.318371
[LightGBM] [Info] Start training from score -0.914443
[LightGBM] [Info] Start training from score -1.103557
LightGBM Accuracy: 0.8424




CatBoost Accuracy: 0.8571

Performing Hyperparameter Tuning...
Tuned Random Forest Accuracy: 0.8571


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Tuned XGBoost Accuracy: 0.8571

✅ Overall Best Model: CatBoost with Accuracy: 0.8571
✅ Best model saved to best_model.pkl

--- Making Predictions with the Best Model ---

--- Prediction for: Low Risk Example ---
Input Data:
            name  Age  SystolicBP  DiastolicBP  BS  BodyTemp  HeartRate
Low Risk Example 25.0       120.0         80.0 6.0      98.6       80.0
Predicted Risk Level: low risk

--- Prediction for: Mid Risk Example ---
Input Data:
            name  Age  SystolicBP  DiastolicBP  BS  BodyTemp  HeartRate
Mid Risk Example 35.0       130.0         85.0 8.0      99.5       95.0
Predicted Risk Level: high risk

--- Prediction for: High Risk Example ---
Input Data:
             name  Age  SystolicBP  DiastolicBP   BS  BodyTemp  HeartRate
High Risk Example 40.0       145.0         95.0 12.0     100.2      110.0
Predicted Risk Level: high risk


  prediction = int(prediction_array[0])
  prediction = int(prediction_array[0])
  prediction = int(prediction_array[0])


In [None]:
import pandas as pd
import pickle

# ==========================
# 📌 Load the Best Model
# ==========================
# Ensure your 'best_model.pkl' file is in the same directory.
try:
    with open('best_model.pkl', 'rb') as file:
        model_pipeline = pickle.load(file)
    print(" Model loaded successfully from 'best_model.pkl'!")
except FileNotFoundError:
    print(" Error: 'best_model.pkl' not found.")
    print("Please ensure the file is in the same directory and try again.")
    # Exit the script if the model file is not found
    exit()

# ==========================
# 📌 Define Manual Input Data Examples
# ==========================
# A list of dictionaries, each representing a different patient profile.
# The features are: Age, SystolicBP, DiastolicBP, BS, BodyTemp, HeartRate
data_examples = [
    {
        "name": "Low Risk Example",
        'Age': [25.0],
        'SystolicBP': [120.0],
        'DiastolicBP': [80.0],
        'BS': [6.0],
        'BodyTemp': [98.6],
        'HeartRate': [80.0]
    },
    {
        "name": "Mid Risk Example",
        'Age': [35.0],
        'SystolicBP': [130.0],
        'DiastolicBP': [85.0],
        'BS': [8.0],
        'BodyTemp': [99.5],
        'HeartRate': [95.0]
    },
    {
        "name": "High Risk Example",
        'Age': [35.0],
        'SystolicBP': [140.0],
        'DiastolicBP': [100.0],
        'BS': [9.0],
        'BodyTemp': [98.0],
        'HeartRate': [66.0]
    }
]

# Map the numerical prediction back to the risk level string.
risk_map = {0: 'high risk', 1: 'low risk', 2: 'mid risk'}

for example in data_examples:
    print(f"\n--- Making a prediction for: {example['name']} ---")
    
    # Create a DataFrame from the manual input.
    input_df = pd.DataFrame({
        'Age': example['Age'],
        'SystolicBP': example['SystolicBP'],
        'DiastolicBP': example['DiastolicBP'],
        'BS': example['BS'],
        'BodyTemp': example['BodyTemp'],
        'HeartRate': example['HeartRate']
    })

    print(f"Input Data:\n{input_df.to_string(index=False)}")

    # Use the loaded model pipeline to make a prediction.
    prediction_array = model_pipeline.predict(input_df)
    
    # Extract the single integer value from the prediction array.
    prediction = int(prediction_array[0])

    # Interpret the prediction using the risk map.
    predicted_risk = risk_map.get(prediction, "Unknown")
    print(f"Predicted Risk Level: {predicted_risk}")


✅ Model loaded successfully from 'best_model.pkl'!

--- Making a prediction for: Low Risk Example ---
Input Data:
 Age  SystolicBP  DiastolicBP  BS  BodyTemp  HeartRate
25.0       120.0         80.0 6.0      98.6       80.0
Predicted Risk Level: low risk

--- Making a prediction for: Mid Risk Example ---
Input Data:
 Age  SystolicBP  DiastolicBP  BS  BodyTemp  HeartRate
35.0       130.0         85.0 8.0      99.5       95.0
Predicted Risk Level: high risk

--- Making a prediction for: High Risk Example ---
Input Data:
 Age  SystolicBP  DiastolicBP  BS  BodyTemp  HeartRate
35.0       140.0        100.0 9.0      98.0       66.0
Predicted Risk Level: high risk


  prediction = int(prediction_array[0])
  prediction = int(prediction_array[0])
  prediction = int(prediction_array[0])
