# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
from google.colab import drive
import nltk
from nltk.corpus import stopwords
import re
from imblearn.over_sampling import RandomOverSampler


# Mount Google Drive and Download NLTK Stopwords

In [None]:
# Make Google Drive visible inside the Colab runtime so the dataset can be read.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Fetch the NLTK stop-word corpus used by the text cleaning step.
nltk.download('stopwords')


# Load and Preprocess the Dataset 

In [None]:
# Read the raw CSV from Google Drive and discard every row that has no text.
DATA_PATH = '/content/drive/MyDrive/NLP Mental Health Detector/Datasets/data_to_be_cleansed.csv'
df = pd.read_csv(DATA_PATH).dropna(subset=['text'])


# Define Text Cleaning Function

In [None]:
# Text cleaning function
# Patterns are compiled once at definition time instead of being looked up
# again for every row.
_URL_PATTERN = re.compile(r'http\S+')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')

# Lazily filled cache so the NLTK corpus is read at most once.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text, stop_words=None):
    """Normalize a raw text string for modelling.

    Steps, in order: strip URLs (anything starting with ``http``), drop every
    character that is not an ASCII letter or whitespace, lowercase, and remove
    stop words. The remaining words are joined with single spaces.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None`` or a pandas NaN) is treated as missing.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, the NLTK English stop-word list is used; it is loaded
            once and reused across calls.

    Returns:
        The cleaned text, or ``''`` when ``text`` is missing.
    """
    if not isinstance(text, str):
        return ''

    # `is None` rather than truthiness: an explicitly empty collection means
    # "remove nothing", not "use the default list".
    if stop_words is None:
        stop_words = _english_stop_words()

    text = _URL_PATTERN.sub('', text)
    text = _NON_LETTER_PATTERN.sub('', text)
    text = text.lower()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Replace every raw text with its cleaned form, element by element.
df['text'] = df['text'].map(clean_text)


# Label Mapping and Data Splitting

In [None]:
# Human-readable class names keyed by the integer ids stored in 'target'.
label_map = dict(enumerate(
    ['stress', 'depression', 'bipolar disorder', 'personality disorder', 'anxiety']
))
df['target'] = df['target'].astype(int)

# Pull texts and labels out of the frame as plain Python lists.
X, y = df['text'].tolist(), df['target'].tolist()


# Handling Class Imbalance with Oversampling 

In [None]:
# Balance the classes with random oversampling (fixed seed for repeatability).
ros = RandomOverSampler(random_state=42)

# The texts are reshaped into a single-column 2-D array before resampling.
text_column = np.array(X).reshape(-1, 1)
X_resampled, y_resampled = ros.fit_resample(text_column, y)

# Back to a flat Python list of texts.
X_resampled = X_resampled.ravel().tolist()


# Initialize RoBERTa Tokenizer 

In [None]:
# Load the tokenizer that belongs to the 'roberta-base' checkpoint.
ROBERTA_CHECKPOINT = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_CHECKPOINT)


# Define the Dataset Class

In [None]:
class MentalHealthDataset(Dataset):
    """Dataset that tokenizes one text per access for sequence classification.

    Args:
        texts: Sequence of raw text strings. Missing entries (``None`` or a
            float NaN) are tokenized as the empty string.
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable Hugging Face-style tokenizer; it is invoked with
            the text plus padding/truncation keyword arguments and must
            return a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_length: Every sample is padded or truncated to this many tokens.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Fail early: a silent mismatch would otherwise surface as an
        # IndexError (or as silently ignored labels) in the middle of training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx``.

        Returns:
            A dict with flattened ``'input_ids'`` and ``'attention_mask'``
            tensors and a ``'label'`` tensor of dtype ``torch.long``.
        """
        text = self.texts[idx]
        # Missing values arrive as None or as float NaN (NaN is the only
        # value that is not equal to itself).
        if text is None or (isinstance(text, float) and text != text):
            text = ""
        label = self.labels[idx]

        # Calling the tokenizer directly is the supported entry point;
        # `encode_plus` is the legacy spelling of the same operation.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


# Define the Training Function

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

def train(model, dataloader, optimizer, scheduler, device):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Model that accepts ``input_ids``, ``attention_mask`` and
            ``labels`` and whose output exposes ``.loss``.
        dataloader: Yields dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        The summed batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    progress = tqdm(dataloader, desc="Training")
    for batch in progress:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['label'].to(device)

        optimizer.zero_grad()
        loss = model(ids, attention_mask=mask, labels=targets).loss
        running_loss += loss.item()

        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    return running_loss / len(dataloader)


# Define Evaluation Function

In [None]:
def evaluate(model, dataloader, device):
    """Predict a class for every sample in ``dataloader``.

    Args:
        model: Model that accepts ``input_ids`` and ``attention_mask`` and
            whose output exposes ``.logits``.
        dataloader: Yields dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        A ``(predictions, true_labels)`` pair of Python lists, in dataloader
        order.
    """
    model.eval()
    all_preds, all_targets = [], []

    # Gradients are not needed for inference.
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            logits = model(ids, attention_mask=mask).logits
            # Index of the largest logit per row is the predicted class.
            batch_preds = torch.max(logits, dim=1).indices

            all_preds += batch_preds.cpu().tolist()
            all_targets += targets.cpu().tolist()

    return all_preds, all_targets


# Implement K-Fold Cross Validation

In [None]:
# Implement k-fold cross-validation
#
# The folds are drawn from the original samples (X, y) and only the training
# part of each fold is oversampled. Oversampling before splitting would put
# copies of the same text into both the training and the validation fold,
# which leaks validation data into training and inflates the scores.
n_splits = 5
num_labels = len(label_map)
class_ids = list(range(num_labels))
target_names = [label_map[i] for i in class_ids]
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}")

    # Resample the training *indices* rather than the texts themselves; the
    # validation fold keeps its original, untouched class distribution.
    fold_sampler = RandomOverSampler(random_state=42)
    balanced_idx, balanced_labels = fold_sampler.fit_resample(
        train_idx.reshape(-1, 1), [y[i] for i in train_idx]
    )
    train_texts = [X[i] for i in balanced_idx.ravel()]
    train_labels = [int(label) for label in balanced_labels]
    val_texts = [X[i] for i in val_idx]
    val_labels = [y[i] for i in val_idx]

    train_dataset = MentalHealthDataset(train_texts, train_labels, tokenizer)
    val_dataset = MentalHealthDataset(val_texts, val_labels, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

    # Every fold starts from a fresh pretrained model.
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)
    model = model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    num_epochs = 3
    # The scheduler is stepped once per batch, so it spans all batches of all epochs.
    num_training_steps = num_epochs * len(train_loader)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train_loss = train(model, train_loader, optimizer, scheduler, device)
        print(f"Training loss: {train_loss:.4f}")

        val_predictions, val_true_labels = evaluate(model, val_loader, device)

        # Passing `labels` keeps the report and matrix aligned with
        # `target_names` even if a class is never predicted in this fold.
        print("\nValidation Classification Report:")
        print(classification_report(val_true_labels, val_predictions, labels=class_ids, target_names=target_names))

        print("\nValidation Confusion Matrix:")
        print(confusion_matrix(val_true_labels, val_predictions, labels=class_ids))
        print("\n" + "="*50 + "\n")

    # Save the model for each fold
    torch.save(model.state_dict(), f'mental_health_roberta_model_fold_{fold}.pth')


# Final Evaluation on a Separate Test Set

In [None]:
# Final evaluation on a separate test set
#
# NOTE(review): the fold checkpoints were trained on folds of this same
# dataset, so the split below is not disjoint from the training data and the
# scores are optimistic. For an unbiased estimate, carve the test set out
# before cross-validation and never train on it.
#
# The split is taken from the original samples (X, y) rather than the
# oversampled copies, so the report reflects the real class distribution and
# contains no duplicated rows; `stratify` keeps the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
test_dataset = MentalHealthDataset(X_test, y_test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

num_labels = len(label_map)
class_ids = list(range(num_labels))

# Load the best model (you may need to choose the best fold based on validation results)
best_fold = 1
best_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime.
best_model.load_state_dict(
    torch.load(f'mental_health_roberta_model_fold_{best_fold}.pth', map_location=device)
)
best_model = best_model.to(device)

test_predictions, test_true_labels = evaluate(best_model, test_loader, device)
print("\nTest Classification Report:")
print(classification_report(test_true_labels, test_predictions, labels=class_ids, target_names=[label_map[i] for i in class_ids]))

print("\nTest Confusion Matrix:")
print(confusion_matrix(test_true_labels, test_predictions, labels=class_ids))
