In [1]:
pip install seaborn

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Mental Health Sentiment Analysis using BERT
# Complete training pipeline for detecting anxiety, depression, and stress

import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, random_split
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Set device: use the GPU when CUDA is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Seed PyTorch's global RNG so the randomly initialised classifier head and the
# shuffled training DataLoader are repeatable between runs (the train/val/test
# split further down already passes random_state=42).
torch.manual_seed(42)

# 1. Load and explore the dataset
data_path = r"C:\Users\NAMAN\Documents\GitHub\Prototype-\Combined Data.csv"  # Update this path
df = pd.read_csv(data_path)

print("Dataset Info:")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line after the summary.
df.info()
print("\nClass distribution:")
print(df['status'].value_counts())  # Your label column is 'status'

# 2. Data preprocessing and cleaning
def clean_text(text):
    """Normalise one raw statement.

    Missing values (None / NaN) become an empty string; anything else is
    converted to str, lower-cased, and has every run of whitespace collapsed
    to a single space (leading/trailing whitespace is dropped).
    """
    # Guard clause: nothing to clean for missing entries
    if pd.isna(text):
        return ""
    # str.split() with no argument splits on any whitespace run, so joining
    # the pieces with one space both trims and collapses whitespace.
    words = str(text).lower().split()
    return ' '.join(words)

# Column names used by the rest of the pipeline
text_column = 'statement'  # free-text input column
label_column = 'status'    # class label column

# Normalise every statement (missing values become empty strings)
df[text_column] = df[text_column].apply(clean_text)

# Keep only rows that still have text after cleaning, then renumber the index
has_text = df[text_column].str.len() > 0
df = df[has_text].reset_index(drop=True)

print(f"\nAfter cleaning - Dataset shape: {df.shape}")
print("Label distribution after cleaning:")
print(df[label_column].value_counts())

# 3. Label encoding: map each status string to an integer class id
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df[label_column])

print("\nLabel mapping:")
for class_id, class_name in enumerate(label_encoder.classes_):
    print(f"{class_id}: {class_name}")

num_labels = len(label_encoder.classes_)
print(f"\nNumber of classes: {num_labels}")

# 4. Split the dataset: 70% train, then the held-out 30% is halved into
#    validation and test. Both splits are stratified on the class id.
all_texts = df[text_column].tolist()
all_labels = df['encoded_label'].tolist()

train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.3,
    random_state=42,
    stratify=df['encoded_label'],
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
    stratify=temp_labels,
)

print("\nDataset splits:")
for split_name, split_texts in (
    ("Train", train_texts),
    ("Validation", val_texts),
    ("Test", test_texts),
):
    print(f"{split_name}: {len(split_texts)} samples")

# 5. Initialize tokenizer from the local model directory (no hub download)
tokenizer = BertTokenizer.from_pretrained(
    r"C:\Users\NAMAN\Documents\GitHub\Prototype-\bert-base-uncased"
)

# 6. Tokenization function with adjusted max_length for mental health text
def tokenize_texts(texts, labels, max_length=64):  # 64 rather than 128 (same as the dialogue model, per the original note)
    """Encode texts with the module-level `tokenizer` and bundle them with labels.

    Args:
        texts: strings to encode.
        labels: integer class ids, one per text.
        max_length: every sequence is truncated and padded to this token count.

    Returns:
        dict with 'input_ids' and 'attention_mask' taken from the tokenizer
        output (requested as PyTorch tensors) and 'labels' as a torch.long tensor.
    """
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    encoded = tokenizer(texts, **tokenizer_options)

    # Copy out only the two fields the model consumes, then attach the labels
    bundle = {field: encoded[field] for field in ('input_ids', 'attention_mask')}
    bundle['labels'] = torch.tensor(labels, dtype=torch.long)
    return bundle

# 7. Tokenize all splits (train, validation, test — evaluated in that order)
print("\nTokenizing datasets...")
train_encodings, val_encodings, test_encodings = (
    tokenize_texts(split_texts, split_labels)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

# 8. Create dataset class
class MentalHealthDataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict of pre-tokenized encodings.

    `encodings` must provide 'input_ids', 'attention_mask' and 'labels',
    each indexable by sample position.
    """

    # Fields copied into every sample returned by __getitem__
    _FIELDS = ('input_ids', 'attention_mask', 'labels')

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The sample count is taken from the 'labels' entry
        return len(self.encodings['labels'])

    def __getitem__(self, idx):
        # One sample = the idx-th element of each field
        return {field: self.encodings[field][idx] for field in self._FIELDS}

# 9. Wrap each tokenized split in a Dataset, then in a DataLoader
batch_size = 16

train_dataset, val_dataset, test_dataset = (
    MentalHealthDataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings, test_encodings)
)

# Only the training loader shuffles; validation and test keep their order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=batch_size)
    for eval_dataset in (val_dataset, test_dataset)
)

print(f"DataLoaders created with batch size: {batch_size}")

# 10. Initialize model from local path, with one output per encoded class
bert_dir = r"C:\Users\NAMAN\Documents\GitHub\Prototype-\bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(bert_dir, num_labels=num_labels)
model.to(device)  # move the model to the device selected earlier

print(f"Model loaded with {num_labels} output classes")

# 11. Setup optimizer and scheduler
learning_rate = 2e-5
num_epochs = 3

optimizer = AdamW(model.parameters(), lr=learning_rate)

# The scheduler is stepped once per batch during training, so the linear
# schedule spans (batches per epoch) x (epochs) steps, with no warm-up.
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print(f"Training setup - Epochs: {num_epochs}, Learning rate: {learning_rate}")

# 12. Training functions
def train_epoch(model, train_loader, optimizer, lr_scheduler, device):
    """Run one training pass over `train_loader`.

    Args:
        model: called with input_ids / attention_mask / labels keyword
            arguments; its output must expose `.loss`.
        train_loader: iterable of batches keyed by 'input_ids',
            'attention_mask' and 'labels'.
        optimizer: stepped once per batch.
        lr_scheduler: stepped once per batch, right after the optimizer.
        device: device each batch tensor is moved to.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    batches = tqdm(train_loader, desc="Training")

    for batch in batches:
        # Move the three tensors the model consumes onto the target device
        model_inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        # Forward pass; the loss is read from the model output
        loss = model(**model_inputs).loss
        batch_loss = loss.item()
        running_loss += batch_loss

        # Backward pass: clear old gradients, backprop, update weights,
        # then advance the learning-rate schedule
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        batches.set_postfix({"loss": f"{batch_loss:.4f}"})

    return running_loss / len(train_loader)

def evaluate(model, val_loader, device):
    """Score `model` on every batch of `val_loader` with gradients disabled.

    Args:
        model: called with input_ids / attention_mask / labels keyword
            arguments; its output must expose `.loss` and `.logits`.
        val_loader: iterable of batches keyed by 'input_ids',
            'attention_mask' and 'labels'.
        device: device each batch tensor is moved to.

    Returns:
        (avg_loss, accuracy, predictions, labels) — the mean per-batch loss,
        the accuracy over all samples, and the flat lists of predicted and
        true class ids in loader order.
    """
    model.eval()
    loss_sum = 0
    predicted_ids, true_ids = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            model_inputs = {
                key: batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            }
            outputs = model(**model_inputs)
            loss_sum += outputs.loss.item()

            # Predicted class = index of the largest logit along the last axis
            batch_predictions = torch.argmax(outputs.logits, dim=-1)
            predicted_ids.extend(batch_predictions.cpu().numpy())
            true_ids.extend(model_inputs['labels'].cpu().numpy())

    mean_loss = loss_sum / len(val_loader)
    overall_accuracy = accuracy_score(true_ids, predicted_ids)
    return mean_loss, overall_accuracy, predicted_ids, true_ids

# 13. Training loop
# Trains for num_epochs, validates after every epoch, and keeps a snapshot of
# the weights from the epoch with the highest validation accuracy.
print("\n" + "="*50)
print("STARTING TRAINING")
print("="*50)

best_accuracy = 0
best_model_state = None   # frozen copy of the best epoch's weights
train_losses = []         # mean training loss per epoch
val_losses = []           # mean validation loss per epoch
val_accuracies = []       # validation accuracy per epoch

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print("-" * 30)
    
    # Train
    train_loss = train_epoch(model, train_loader, optimizer, lr_scheduler, device)
    train_losses.append(train_loss)
    print(f"Train Loss: {train_loss:.4f}")
    
    # Validate
    # NOTE: this rebinds val_labels to the labels returned by evaluate()
    val_loss, val_accuracy, val_predictions, val_labels = evaluate(model, val_loader, device)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)
    
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    
    # Save best model
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        # state_dict() hands back references to the live parameter tensors and
        # dict.copy() is shallow, so a plain .copy() snapshot would keep being
        # updated in place by later epochs. Clone every tensor (onto the CPU)
        # so the snapshot really is frozen at this epoch.
        best_model_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
        print(f"✅ New best model saved with accuracy: {best_accuracy:.4f}")

# 14. Load best model and final evaluation
# load_state_dict copies the snapshot values back into the model's parameters.
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"\n📥 Loaded best model with validation accuracy: {best_accuracy:.4f}")

# Banner: blank line, rule, title, rule (one item per output line)
print("\n" + "="*50, "FINAL EVALUATION ON TEST SET", "="*50, sep="\n")

# Score the model once on the held-out test split
test_loss, test_accuracy, test_predictions, test_labels = evaluate(
    model, test_loader, device
)

print("\nFinal Test Results:")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# 15. Detailed classification report (per-class metrics, labelled by status name)
print("\n" + "="*50, "DETAILED CLASSIFICATION REPORT", "="*50, sep="\n")
target_names = label_encoder.classes_
print(
    classification_report(
        test_labels,
        test_predictions,
        target_names=target_names,
    )
)

# 16. Confusion Matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(test_labels, test_predictions)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names,
    yticklabels=target_names,
    ax=ax,
)
ax.set_title('Confusion Matrix - Mental Health Classification')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.tight_layout()
plt.show()

# 17. Training curves
# Three panels: loss per epoch, validation accuracy per epoch, and the class
# distribution of the test set.
fig, (loss_ax, acc_ax, dist_ax) = plt.subplots(1, 3, figsize=(15, 5))

# Plot against 1-based epoch numbers so the x-axis matches the "Epoch k/N"
# lines printed during training (a bare plot(values) would start at 0).
train_epochs = range(1, len(train_losses) + 1)
val_epochs = range(1, len(val_losses) + 1)

loss_ax.plot(train_epochs, train_losses, label='Train Loss')
loss_ax.plot(val_epochs, val_losses, label='Validation Loss')
loss_ax.set_title('Training and Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_xticks(list(train_epochs))  # integer ticks only
loss_ax.legend()

acc_epochs = range(1, len(val_accuracies) + 1)
acc_ax.plot(acc_epochs, val_accuracies, label='Validation Accuracy', color='green')
acc_ax.set_title('Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_xticks(list(acc_epochs))
acc_ax.legend()

# Count every class id from 0..len(target_names)-1, including ids that do not
# occur in the test labels, so bar k always lines up with tick label k.
class_counts = np.bincount(
    np.asarray(test_labels, dtype=int), minlength=len(target_names)
)
class_positions = np.arange(len(target_names))
dist_ax.bar(class_positions, class_counts)
dist_ax.set_title('Test Set Class Distribution')
dist_ax.set_xlabel('Class')
dist_ax.set_ylabel('Count')
dist_ax.set_xticks(class_positions)
dist_ax.set_xticklabels(target_names, rotation=45)

fig.tight_layout()
plt.show()

# 18. Save the trained model and tokenizer into one directory
model_save_path = r"C:\Users\NAMAN\Documents\GitHub\Prototype-\trained_mental_health_bert"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

import pickle

# Per-epoch metrics plus the final scores and the class names, in encoder order
training_history = {
    'train_losses': train_losses,
    'val_losses': val_losses,
    'val_accuracies': val_accuracies,
    'best_accuracy': best_accuracy,
    'test_accuracy': test_accuracy,
    'label_classes': label_encoder.classes_.tolist()
}

# Pickle the label encoder and the training history next to the model files
for file_suffix, payload in (
    ("/label_encoder.pkl", label_encoder),
    ("/training_history.pkl", training_history),
):
    with open(model_save_path + file_suffix, "wb") as f:
        pickle.dump(payload, f)

print(f"\n✅ Model saved to: {model_save_path}")
print(f"📊 Training history saved")
print(f"🏷️ Label encoder saved")

# 19. Inference function for future use
def predict_mental_health_state(text, model, tokenizer, label_encoder, device, max_length=64):  # 64 rather than 128
    """Classify a single text and report the winning label with probabilities.

    Args:
        text: the text to classify.
        model: called with the tokenizer's output as keyword arguments; its
            output must expose `.logits`.
        tokenizer: callable producing a mapping of tensors for `text`.
        label_encoder: provides `inverse_transform` and `classes_` to turn
            class ids back into label names.
        device: device the input tensors are moved to.
        max_length: token length the input is truncated/padded to.

    Returns:
        dict with 'predicted_label', 'confidence' (softmax probability of the
        predicted class) and 'all_probabilities' (label -> probability).
    """
    model.eval()

    # Tokenize the input and move every resulting tensor to the target device
    encoded = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Forward pass without gradient tracking; softmax turns logits into probabilities
    with torch.no_grad():
        logits = model(**model_inputs).logits
        probabilities = torch.softmax(logits, dim=-1)
        best_index = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0][best_index].item()

    # Map the winning class id back to its label name
    predicted_label = label_encoder.inverse_transform([best_index])[0]

    # Pair each class name with its probability for the (single) input row
    per_class = {}
    for class_name, class_probability in zip(label_encoder.classes_, probabilities[0]):
        per_class[class_name] = class_probability.item()

    return {
        'predicted_label': predicted_label,
        'confidence': confidence,
        'all_probabilities': per_class
    }

# Closing banner: blank line, rule, title, rule (one item per output line)
print("\n" + "="*50, "TRAINING COMPLETED SUCCESSFULLY!", "="*50, sep="\n")
print(f"🎯 Best Validation Accuracy: {best_accuracy:.4f}")
print(f"🧪 Final Test Accuracy: {test_accuracy:.4f}")
print(f"💾 Model saved for inference")

# Example usage of the inference function
print("\n" + "="*30, "EXAMPLE PREDICTIONS", "="*30, sep="\n")

example_texts = [
    "I feel so overwhelmed and anxious about everything",
    "Life is good, I'm feeling positive today",
    "I can't sleep, everything feels hopeless",
    "I'm stressed about work deadlines"
]

# Classify each sample sentence and show the winning label with its probability
for text in example_texts:
    result = predict_mental_health_state(text, model, tokenizer, label_encoder, device)
    shown_label, shown_confidence = result['predicted_label'], result['confidence']
    print(f"\nText: '{text}'")
    print(f"Prediction: {shown_label} (Confidence: {shown_confidence:.3f})")

Using device: cpu
Dataset Info:
Shape: (53043, 3)
Columns: ['Unnamed: 0', 'statement', 'status']

First few rows:
   Unnamed: 0                                          statement   status
0           0                                         oh my gosh  Anxiety
1           1  trouble sleeping, confused mind, restless hear...  Anxiety
2           2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
3           3  I've shifted my focus to something else but I'...  Anxiety
4           4  I'm restless and restless, it's been a month n...  Anxiety

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53043 entries, 0 to 53042
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  53043 non-null  int64 
 1   statement   52681 non-null  object
 2   status      53043 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB
None

Class distribution:
status
Normal                  16351
Depres

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at C:\Users\NAMAN\Documents\GitHub\Prototype-\bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded with 7 output classes
Training setup - Epochs: 3, Learning rate: 2e-05

STARTING TRAINING

Epoch 1/3
------------------------------


Training:  13%|█▎        | 302/2305 [1:16:47<8:29:17, 15.26s/it, loss=0.7243] 


KeyboardInterrupt: 