# 🔵 Jupyter Notebook: Comparing Supervised and Unsupervised Methods for Depression Cause Classification

# 1. Introduction
# Classifying causes of depression (General Depression, Stress, Domestic Violence, Gender Inequality, Ambiguous)
# Comparing Supervised (SVM, XGBoost+BERT) vs Unsupervised (KMeans)

# 2. Data Loading and Preprocessing


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

In [2]:
# Raw inputs: the training corpus and a second, separately stored CSV that is
# used later as an external validation set.
train_df = pd.read_csv(
    r"D:\unitec\MachineLearningCourse\Thesis_code\final_psychiatric_dataset_with_dv.csv"
)
validation_df = pd.read_csv(
    r"D:\unitec\MachineLearningCourse\Thesis_code\ManuallyAnnotated_data.csv"
)

# Preprocessing: discard rows that lack either the tweet text or its label.
for frame in (train_df, validation_df):
    frame.dropna(subset=['tweet', 'Class'], inplace=True)

# Encode labels: the string -> integer mapping is learned from the training
# labels only and then reused for the validation set, so both frames share
# the same codes. A validation label never seen in training makes
# `transform` raise.
le = LabelEncoder()
le.fit(train_df['Class'])
train_df['Class_encoded'] = le.transform(train_df['Class'])
validation_df['Class_encoded'] = le.transform(validation_df['Class'])

# Train-test split: hold out 25% of the training corpus, stratified on the
# encoded label; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_df['tweet'],
    train_df['Class_encoded'],
    test_size=0.25,
    stratify=train_df['Class_encoded'],
    random_state=42,
)

In [None]:
# 3. Supervised Models

# TF-IDF + SVM
from sklearn.svm import SVC

# Sparse text features: unigrams and bigrams, English stop words removed,
# vocabulary capped at 3000 terms. The vocabulary and IDF weights are learned
# from the training split only; the test split is merely transformed.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=3000,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Linear-kernel SVM trained on the TF-IDF matrix.
# NOTE(review): probability=True is not needed by anything visible in this
# section -- confirm whether a later cell calls predict_proba before dropping it.
svm = SVC(kernel='linear', probability=True).fit(X_train_tfidf, y_train)

y_pred_svm = svm.predict(X_test_tfidf)

# Evaluate SVM: per-class report on the held-out test split, with the
# original string labels as row names.
svm_report = classification_report(y_test, y_pred_svm, target_names=le.classes_)
print("SVM Results:")
print(svm_report)

# BERT Embeddings + XGBoost
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Encode every tweet into a dense sentence embedding with a pretrained
# SentenceTransformer checkpoint. The same encoder is applied to both splits.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
X_train_bert = bert_model.encode(X_train.tolist(), show_progress_bar=True)
X_test_bert = bert_model.encode(X_test.tolist(), show_progress_bar=True)

# Balance classes on the training split only (the test split is left untouched
# so the evaluation reflects the original class distribution).
# SMOTE's default k_neighbors=5 needs at least 6 samples in every class and
# raises otherwise, so the neighbour count is clamped to the smallest class.
# When every class has >= 6 samples this is exactly the default behaviour.
# A class with a single sample still cannot be oversampled and will raise.
smallest_class_size = int(y_train.value_counts().min())
smote_k_neighbors = max(1, min(5, smallest_class_size - 1))
smote = SMOTE(random_state=42, k_neighbors=smote_k_neighbors)
X_train_bert_res, y_train_res = smote.fit_resample(X_train_bert, y_train)

# Gradient-boosted trees on the resampled embeddings.
# NOTE(review): `use_label_encoder` is reported as deprecated in recent
# XGBoost releases -- confirm against the installed version before removing it.
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
xgb.fit(X_train_bert_res, y_train_res)

y_pred_xgb = xgb.predict(X_test_bert)

# Evaluate XGBoost+BERT on the held-out test split.
print("XGBoost+BERT Results:")
print(classification_report(y_test, y_pred_xgb, target_names=le.classes_))


# 4. Unsupervised Model (KMeans)

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score

# Clustering with TF-IDF features.
# One cluster per known class, so the partition can be compared with the labels.
# n_init is pinned explicitly: its default differs between scikit-learn
# releases (10 restarts in older versions, 'auto' in newer ones), which would
# otherwise make the reported scores depend on the installed version.
kmeans = KMeans(n_clusters=len(le.classes_), n_init=10, random_state=42)
kmeans.fit(X_test_tfidf)
clusters = kmeans.predict(X_test_tfidf)

# Evaluate clustering.
# ARI and NMI compare the cluster assignment with the true labels and are
# invariant to how cluster ids are numbered; the silhouette score uses only
# the features and the assignment, not the labels.
print("Unsupervised Clustering Results:")
print(f"Adjusted Rand Index: {adjusted_rand_score(y_test, clusters):.2f}")
print(f"Normalized Mutual Information: {normalized_mutual_info_score(y_test, clusters):.2f}")
print(f"Silhouette Score: {silhouette_score(X_test_tfidf, clusters):.2f}")

# 5. Validation Set Evaluation
# Embed the validation tweets with the same encoder used for training, then
# classify them with the already-fitted XGBoost model.
X_val_bert = bert_model.encode(validation_df['tweet'].tolist(), show_progress_bar=True)

val_preds_xgb = xgb.predict(X_val_bert)

# Evaluate on Validation Set.
# `labels` is passed explicitly (LabelEncoder codes are 0..n_classes-1) so the
# report always has one row per training class. Without it,
# classification_report raises when the validation labels and predictions
# together do not cover every class, because target_names would then have
# the wrong length.
print("Validation Set Evaluation (XGBoost+BERT):")
print(classification_report(
    validation_df['Class_encoded'],
    val_preds_xgb,
    labels=list(range(len(le.classes_))),
    target_names=le.classes_,
))

# 6. Comparison Summary
from tabulate import tabulate
import matplotlib.pyplot as plt

# Summary table, computed from the predictions produced above instead of
# hand-entered placeholders, so it cannot drift from the printed reports.
# All figures refer to the held-out test split. Precision / recall / F1 are
# support-weighted averages over the classes.
# NOTE(review): switch average='weighted' to 'macro' if the write-up reports
# macro-averaged scores.
_svm_precision, _svm_recall, _svm_f1, _ = precision_recall_fscore_support(
    y_test, y_pred_svm, average='weighted'
)
_xgb_precision, _xgb_recall, _xgb_f1, _ = precision_recall_fscore_support(
    y_test, y_pred_xgb, average='weighted'
)

# Supervised metrics do not apply to KMeans and clustering metrics do not
# apply to the classifiers; those cells stay None.
comparison_data = {
    'Model': ['SVM', 'XGBoost+BERT', 'KMeans Clustering'],
    'Accuracy': [accuracy_score(y_test, y_pred_svm), accuracy_score(y_test, y_pred_xgb), None],
    'Precision': [_svm_precision, _xgb_precision, None],
    'Recall': [_svm_recall, _xgb_recall, None],
    'F1-score': [_svm_f1, _xgb_f1, None],
    'ARI': [None, None, adjusted_rand_score(y_test, clusters)],
    'NMI': [None, None, normalized_mutual_info_score(y_test, clusters)],
}