In [2]:
pip install ydata_profiling



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ydata_profiling as yp
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error

In [4]:
dataset = pd.read_csv("/content/post_natal_dataset.csv")

In [5]:
profile =yp.ProfileReport(dataset, title="EDA Report", explorative =True)
profile.to_file("eda_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/11 [00:00<?, ?it/s][A
100%|██████████| 11/11 [00:07<00:00,  1.55it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
if "Timestamp" in dataset.columns:
    dataset = dataset.drop(columns=["Timestamp"])

In [7]:
dataset.head()

Unnamed: 0,Age,Feeling sad or Tearful,Irritable towards baby & partner,Trouble sleeping at night,Problems concentrating or making decision,Overeating or loss of appetite,Feeling anxious,Feeling of guilt,Problems of bonding with baby,Suicide attempt
0,30-35,Sometimes,Sometimes,No,Maybe,Yes,Yes,Sometimes,Yes,Yes
1,30-35,Maybe,Yes,Two or more days a week,Maybe,Yes,Yes,No,No,Yes
2,35-40,Yes,No,Yes,Maybe,Maybe,Sometimes,No,Yes,No
3,45-50,Maybe,Maybe,No,Sometimes,Yes,Maybe,Maybe,Sometimes,No
4,40-45,Yes,Maybe,No,No,Yes,Sometimes,Maybe,Maybe,Yes


In [8]:
print(dataset.isna().sum())

Age                                          0
Feeling sad or Tearful                       0
Irritable towards baby & partner             0
Trouble sleeping at night                    0
Problems concentrating or making decision    0
Overeating or loss of appetite               0
Feeling anxious                              0
Feeling of guilt                             0
Problems of bonding with baby                0
Suicide attempt                              0
dtype: int64


In [9]:
duplicates_count = dataset.duplicated().sum()
print(f"Number of duplicate rows: {duplicates_count}")
if duplicates_count > 0:
    dataset = dataset.drop_duplicates()
    print(f"Duplicates removed. New shape: {dataset.shape}")
else:
    print("No duplicate rows found.")

Number of duplicate rows: 62
Duplicates removed. New shape: (8938, 10)


In [10]:
dataset_encoded = dataset.copy()
for col in dataset.columns:
    if dataset_encoded[col].dtype == "object":
        dataset_encoded[col] = dataset_encoded[col].astype(str).str.strip().str.lower()
        mapping = {"yes": 2, "no": 0, "Two or more days a week": 2.5, "sometimes": 1.5, "maybe": 1, "always": 3}
        dataset_encoded[col] = dataset_encoded[col].map(mapping).fillna(dataset_encoded[col])
        dataset_encoded[col] = pd.to_numeric(dataset_encoded[col], errors="coerce").fillna(0)

In [11]:
symptom_cols = [c for c in dataset_encoded.columns if c.lower() not in ["age", "depression_label", "depression_score"]]
print("Columns used for depression score calculation:", symptom_cols)
dataset_encoded["depression_score"] = dataset_encoded[symptom_cols].sum(axis=1)

Columns used for depression score calculation: ['Feeling sad or Tearful', 'Irritable towards baby & partner', 'Trouble sleeping at night', 'Problems concentrating or making decision', 'Overeating or loss of appetite', 'Feeling anxious', 'Feeling of guilt', 'Problems of bonding with baby', 'Suicide attempt']


In [12]:
threshold = 7
dataset_encoded["depression_label"] = (dataset_encoded["depression_score"] >= threshold).astype(int)

In [13]:
print("\nLabel distribution:")
print(dataset_encoded["depression_label"].value_counts())


Label distribution:
depression_label
1    7561
0    1377
Name: count, dtype: int64


In [14]:
X = dataset_encoded.drop(columns=["depression_label", "depression_score"])
y = dataset_encoded["depression_label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Data split into training and testing sets.")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

Data split into training and testing sets.
X_train shape: (7150, 10)
X_test shape: (1788, 10)
y_train shape: (7150,)
y_test shape: (1788,)


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVC": SVC()
}

In [16]:
for name, model in models.items():
    print(f"Training {name} with engineered features...")
    model.fit(X_train, y_train) # Use X_train and y_train with engineered features
    models[name] = model
    print(f"{name} trained.")

Training Logistic Regression with engineered features...
Logistic Regression trained.
Training Decision Tree with engineered features...
Decision Tree trained.
Training Random Forest with engineered features...
Random Forest trained.
Training SVC with engineered features...
SVC trained.


In [17]:
from sklearn.metrics import mean_squared_error

results_engineered = {} # Store results with engineered features separately
for name, model in models.items():
    y_pred = model.predict(X_test) # Use X_test with engineered features
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    results_engineered[name] = {"accuracy": accuracy, "precision": precision, "recall": recall, "f1_score": f1, "mse": mse}
    print(f"\n{name} Performance (Engineered Features):")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-score: {f1:.4f}")
    print(f"  MSE: {mse:.4f}")


Logistic Regression Performance (Engineered Features):
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1-score: 1.0000
  MSE: 0.0000

Decision Tree Performance (Engineered Features):
  Accuracy: 0.9390
  Precision: 0.9686
  Recall: 0.9590
  F1-score: 0.9638
  MSE: 0.0610

Random Forest Performance (Engineered Features):
  Accuracy: 0.9536
  Precision: 0.9643
  Recall: 0.9815
  F1-score: 0.9728
  MSE: 0.0464

SVC Performance (Engineered Features):
  Accuracy: 0.9944
  Precision: 0.9934
  Recall: 1.0000
  F1-score: 0.9967
  MSE: 0.0056


In [18]:
from sklearn.ensemble import StackingClassifier

# Define base models
estimators = [
    ('lr', LogisticRegression(max_iter=1000)),
    ('dt', DecisionTreeClassifier()),
    ('rf', RandomForestClassifier())
]

# Define the stacking classifier
stacking_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

print("Stacking Classifier created.")

Stacking Classifier created.


In [19]:
from sklearn.ensemble import VotingClassifier

# Define base models for the Voting Classifier
# Using a different combination than the Stacking Classifier
estimators_voting = [
    ('lr', LogisticRegression(max_iter=1000)),
    ('svc', SVC(probability=True)) # SVC needs probability=True for soft voting
]

# Instantiate a VotingClassifier with soft voting
voting_model = VotingClassifier(estimators=estimators_voting, voting='soft')
print("Voting Classifier created.")

Voting Classifier created.


In [20]:
print("Training Stacking Classifier with engineered features...")
stacking_model.fit(X_train, y_train) # Use X_train and y_train with engineered features
print("Stacking Classifier trained.")

print("Training Voting Classifier with engineered features...")
voting_model.fit(X_train, y_train) # Use X_train and y_train with engineered features
print("Voting Classifier trained.")

Training Stacking Classifier with engineered features...
Stacking Classifier trained.
Training Voting Classifier with engineered features...
Voting Classifier trained.


In [21]:
y_pred_stacking = stacking_model.predict(X_test) # Use X_test with engineered features

stacking_accuracy = accuracy_score(y_test, y_pred_stacking)
stacking_precision = precision_score(y_test, y_pred_stacking)
stacking_recall = recall_score(y_test, y_pred_stacking)
stacking_f1 = f1_score(y_test, y_pred_stacking)
stacking_mse = mean_squared_error(y_test, y_pred_stacking)

print("\nStacking Classifier Performance (Engineered Features):")
print(f"  Accuracy: {stacking_accuracy:.4f}")
print(f"  Precision: {stacking_precision:.4f}")
print(f"  Recall: {stacking_recall:.4f}")
print(f"  F1-score: {stacking_f1:.4f}")
print(f"  MSE: {stacking_mse:.4f}")

y_pred_voting = voting_model.predict(X_test) # Use X_test with engineered features

voting_accuracy = accuracy_score(y_test, y_pred_voting)
voting_precision = precision_score(y_test, y_pred_voting)
voting_recall = recall_score(y_test, y_pred_voting)
voting_f1 = f1_score(y_test, y_pred_voting)
voting_mse = mean_squared_error(y_test, y_pred_voting)

print("\nVoting Classifier Performance (Engineered Features):")
print(f"  Accuracy: {voting_accuracy:.4f}")
print(f"  Precision: {voting_precision:.4f}")
print(f"  Recall: {voting_recall:.4f}")
print(f"  F1-score: {voting_f1:.4f}")
print(f"  MSE: {voting_mse:.4f}")

results_engineered["Stacking Classifier"] = {"accuracy": stacking_accuracy, "precision": stacking_precision, "recall": stacking_recall, "f1_score": stacking_f1, "mse": stacking_mse}
results_engineered["Voting Classifier"] = {"accuracy": voting_accuracy, "precision": voting_precision, "recall": voting_recall, "f1_score": voting_f1, "mse": voting_mse}


Stacking Classifier Performance (Engineered Features):
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1-score: 1.0000
  MSE: 0.0000

Voting Classifier Performance (Engineered Features):
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1-score: 1.0000
  MSE: 0.0000


In [22]:
print("Comparison of Best Individual Model (Logistic Regression) and Best Hybrid Model (Stacking Classifier):")

print("\nLogistic Regression Performance (Engineered Features):")
print(f"  Accuracy: {results_engineered['Logistic Regression']['accuracy']:.4f}")
print(f"  Precision: {results_engineered['Logistic Regression']['precision']:.4f}")
print(f"  Recall: {results_engineered['Logistic Regression']['recall']:.4f}")
print(f"  F1-score: {results_engineered['Logistic Regression']['f1_score']:.4f}")
print(f"  MSE: {results_engineered['Logistic Regression']['mse']:.4f}")

print("\nStacking Classifier Performance (Engineered Features):")
print(f"  Accuracy: {results_engineered['Stacking Classifier']['accuracy']:.4f}")
print(f"  Precision: {results_engineered['Stacking Classifier']['precision']:.4f}")
print(f"  Recall: {results_engineered['Stacking Classifier']['recall']:.4f}")
print(f"  F1-score: {results_engineered['Stacking Classifier']['f1_score']:.4f}")
print(f"  MSE: {results_engineered['Stacking Classifier']['mse']:.4f}")

Comparison of Best Individual Model (Logistic Regression) and Best Hybrid Model (Stacking Classifier):

Logistic Regression Performance (Engineered Features):
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1-score: 1.0000
  MSE: 0.0000

Stacking Classifier Performance (Engineered Features):
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1-score: 1.0000
  MSE: 0.0000


In [23]:
results_df = pd.DataFrame.from_dict(results_engineered, orient="index")
print("\nComparison Table of Model Performance (Engineered Features):")
display(results_df)


Comparison Table of Model Performance (Engineered Features):


Unnamed: 0,accuracy,precision,recall,f1_score,mse
Logistic Regression,1.0,1.0,1.0,1.0,0.0
Decision Tree,0.939038,0.968625,0.959022,0.963799,0.060962
Random Forest,0.953579,0.964286,0.981494,0.972814,0.046421
SVC,0.994407,0.993434,1.0,0.996706,0.005593
Stacking Classifier,1.0,1.0,1.0,1.0,0.0
Voting Classifier,1.0,1.0,1.0,1.0,0.0


In [24]:
import joblib

# Save the Stacking Classifier model
joblib.dump(stacking_model, 'stacking_model.pkl')

print("Stacking Classifier model saved as 'stacking_model.pkl'")

# Provide code to download the file
from google.colab import files
files.download('stacking_model.pkl')


Stacking Classifier model saved as 'stacking_model.pkl'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>