In [None]:
import pandas as pd
import numpy as np
import torch
import warnings
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import torch.nn.functional as F

# Suppress specific warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Load dataset
df = pd.read_csv('postpartum_depression_text_data_v2.csv')

# Selecting text feature and target variable
text_column = 'Message'
target_column = 'Postpartum_Depression_Score'

# Handle missing values
df[text_column] = df[text_column].fillna('')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df[text_column], df[target_column], test_size=0.2, random_state=42)

# Load DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Force GPU usage if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Custom dataset class
class PPDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=32):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return {
            "input_ids": encoding['input_ids'].squeeze().to(device),
            "attention_mask": encoding['attention_mask'].squeeze().to(device),
            "labels": torch.tensor(label, dtype=torch.long).to(device)
        }

# Create dataset objects
train_dataset = PPDataset(X_train, y_train, tokenizer)
test_dataset = PPDataset(X_test, y_test, tokenizer)

# DataLoader
def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch]).to(device)
    attention_mask = torch.stack([item['attention_mask'] for item in batch]).to(device)
    labels = torch.stack([item['labels'] for item in batch]).to(device)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

# Load DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
model.to(device)

# Optimizer & Loss Function
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Training Loop with Progress Bar
for epoch in range(2):  # Change to more epochs if needed
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in progress_bar:
        optimizer.zero_grad()
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=total_loss / len(train_loader))
    print(f"Epoch {epoch+1} completed. Average Loss: {total_loss / len(train_loader)}")

# Save trained model
torch.save(model.state_dict(), "distilbert_ppd_model.pth")
print("BERT model saved as distilbert_ppd_model.pth")

# Evaluation with Progress Bar
model.eval()
y_preds, y_trues = [], []
progress_bar = tqdm(test_loader, desc="Evaluating")
with torch.no_grad():
    for batch in progress_bar:
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        logits = outputs.logits
        y_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        y_trues.extend(batch['labels'].cpu().numpy())

print("Accuracy:", accuracy_score(y_trues, y_preds))
print("Classification Report:\n", classification_report(y_trues, y_preds))

# Prediction Function
def predict_text(text):
    encoding = tokenizer(text, padding='max_length', truncation=True, max_length=32, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = F.softmax(outputs.logits, dim=1)
    return "High Risk" if torch.argmax(probs).item() == 1 else "Low Risk"

# Example Prediction
sample_text = "I am scared"
print(f"Prediction for '{sample_text}': {predict_text(sample_text)}")


Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 100/100 [02:51<00:00,  1.72s/it, loss=0.112]


Epoch 1 completed. Average Loss: 0.11191637723706663


Epoch 2: 100%|██████████| 100/100 [02:47<00:00,  1.67s/it, loss=0.0022] 


Epoch 2 completed. Average Loss: 0.0021976129850372673
BERT model saved as distilbert_ppd_model.pth


Evaluating: 100%|██████████| 25/25 [00:05<00:00,  4.17it/s]

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        98
           1       1.00      1.00      1.00       102

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Prediction for 'I feel exhausted and cry every night.': High Risk





In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib  # For saving the trained model

# Load structured dataset (Replace 'your_dataset.csv' with actual file path)
df = pd.read_csv('postpartum_depression_data_expanded.csv')

# Selecting structured features (including new features)
all_possible_features = ['Age', 'BMI_Before_Pregnancy', 'Gestational_Age_at_Delivery',
                         'Pregnancy_Complications', 'Employment_Status', 'Household_Income',
                         'Family_History_of_Depression', 'Sleep_Disruptions_per_Night',
                         'Support_from_Partner', 'Mode_of_Delivery', 'Diet_Quality', 'Postpartum_Pain_Severity']

# Only select columns that exist in the dataset
structured_features = [col for col in all_possible_features if col in df.columns]
print("Using structured features:", structured_features)

# Target variable
target = 'Postpartum_Depression_Score'
if target not in df.columns:
    raise KeyError(f"Target variable '{target}' not found in dataset")

# Encode categorical variables
label_encoders = {}
for col in df[structured_features].select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for future decoding

# Split data
X_train, X_test, y_train, y_test = train_test_split(df[structured_features], df[target], test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train XGBoost model
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=5, random_state=42)
xgb_model.fit(X_train, y_train)

# Save trained structured model
joblib.dump(xgb_model, "structured_xgboost.pkl")
print("Structured model saved as structured_xgboost.pkl")

# Evaluate model
y_pred = xgb_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Using structured features: ['Age', 'BMI_Before_Pregnancy', 'Pregnancy_Complications', 'Employment_Status', 'Household_Income', 'Family_History_of_Depression', 'Sleep_Disruptions_per_Night', 'Support_from_Partner', 'Mode_of_Delivery', 'Diet_Quality', 'Postpartum_Pain_Severity']
Structured model saved as structured_xgboost.pkl
Accuracy: 0.8915
Classification Report:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93      1410
           1       0.99      0.64      0.78       590

    accuracy                           0.89      2000
   macro avg       0.93      0.82      0.85      2000
weighted avg       0.91      0.89      0.88      2000



In [None]:
sample_text = "Life is getting better"
print(f"Prediction for '{sample_text}': {predict_text(sample_text)}")


Prediction for 'Life is getting better': Low Risk


In [17]:
import pandas as pd
import numpy as np
import torch
import joblib
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch.nn.functional as F

# Load trained structured model (XGBoost)
structured_model = joblib.load("structured_xgboost.pkl")

# Check expected number of features
expected_features = structured_model.n_features_in_
print(f"Structured model expects {expected_features} features.")

# Load trained NLP model (BERT)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
model.load_state_dict(torch.load("distilbert_ppd_model.pth", map_location=device))
model.to(device)
model.eval()

# Load BERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Prediction function for NLP model
def get_nlp_prediction(text):
    encoding = tokenizer(text, padding='max_length', truncation=True, max_length=32, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = F.softmax(outputs.logits, dim=1)
    return probs[:, 1].item()  # Probability of postpartum depression

# Prediction function for structured model
def get_structured_prediction(features):
    if len(features) != expected_features:
        raise ValueError(f"Feature shape mismatch. Expected {expected_features}, got {len(features)}")
    prob = structured_model.predict_proba([features])[:, 1]  # Probability of postpartum depression
    return prob[0]

# Function to combine both models' predictions
def combined_prediction(structured_features, text, w_nlp=0.6, w_struct=0.4):
    prob_nlp = get_nlp_prediction(text)
    prob_structured = get_structured_prediction(structured_features)
    final_score = (w_nlp * prob_nlp) + (w_struct * prob_structured)
    return 1 if final_score > 0.5 else 0, final_score

# Example Usage
sample_text = "I feel exhausted and cry every night."

# Structured features (Ensure it matches expected feature count)
# [Age, BMI Before Pregnancy, Gestational Age at Delivery, Pregnancy Complications, 
#  Employment Status, Household Income, Family History of Depression, Sleep Disruptions per Night, 
#  Support from Partner, Mode of Delivery, Diet Quality, Postpartum Pain Severity]
sample_structured_features = [25, 24.5, 0, 1, 2, 0, 3, 1, 1, 4, 2]  # Adjusted order to match dataset

prediction, score = combined_prediction(sample_structured_features, sample_text)
print("Final Prediction:", "High Risk" if prediction == 1 else "Low Risk", "| Score:", score)


Structured model expects 11 features.


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Final Prediction: High Risk | Score: 0.9849624395370484


In [14]:
xgb_model.feature_names = structured_features  # Store feature names before saving
joblib.dump(xgb_model, "structured_xgboost.pkl")


['structured_xgboost.pkl']

In [15]:
# Load trained structured model (XGBoost)
structured_model = joblib.load("structured_xgboost.pkl")

# Check expected number of features
expected_features = getattr(structured_model, "n_features_in_", len(sample_structured_features))  # Fallback if missing
print(f"Structured model expects {expected_features} features.")

# Print feature names safely (if available)
if hasattr(structured_model, "feature_names"):
    print("Structured model feature names:", structured_model.feature_names)


Structured model expects 11 features.
Structured model feature names: ['Age', 'BMI_Before_Pregnancy', 'Pregnancy_Complications', 'Employment_Status', 'Household_Income', 'Family_History_of_Depression', 'Sleep_Disruptions_per_Night', 'Support_from_Partner', 'Mode_of_Delivery', 'Diet_Quality', 'Postpartum_Pain_Severity']
