In [None]:
#Importing necessary libraries
import pandas as pd

In [None]:
# Read the preprocessed posts from the parquet file, then preview the first ten rows
df = pd.read_parquet(path="Big_preprocessed_filter_data.parquet")
df.head(n=10)

Unnamed: 0,subreddit,label,preprocessed_posts
0,depression,0,posting since one helped last time yesterday s...
1,depression,0,hey everyone mobile pardon type happen id like...
2,depression,0,alone feel isolated wife kiss parent shilling ...
3,depression,0,mentalhealth reason take school year since one...
4,depression,0,tired getting overlooked girl tired tell relat...
5,depression,0,parent got fight today tried telling best frie...
6,depression,0,hi pad student hyderabad india pad research fo...
7,depression,0,hate fucking hate stare mirror see human emoti...
8,depression,0,rousing much right time laugh say next fail es...
9,depression,0,therapist told dont really know mean wait week...


In [None]:
# Summarise the loaded frame: index range, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   subreddit           20000 non-null  object
 1   label               20000 non-null  int64 
 2   preprocessed_posts  20000 non-null  object
dtypes: int64(1), object(2)
memory usage: 468.9+ KB


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and keep the remaining 80% for training;
# the fixed random_state keeps the split identical across runs.
train_data, test_data = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

# Report the (rows, columns) shape of each split
print(f"Training data size: {train_data.shape}")
print(f"Testing data size: {test_data.shape}")



Training data size: (16000, 3)
Testing data size: (4000, 3)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Targets are the subreddit names exactly as stored in the frame (strings);
# no numeric encoding is applied in this cell.
y_train = train_data['subreddit']
y_test = test_data['subreddit']

# TF-IDF vectorizer limited to at most 5000 features
tfidf = TfidfVectorizer(max_features=5000)

# The vectorizer is fitted on the training posts only; the test posts are
# transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(train_data['preprocessed_posts'])
X_test_tfidf = tfidf.transform(test_data['preprocessed_posts'])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


def _fit_and_report(name, model, X_fit, y_fit, X_eval, y_eval):
    """Fit `model`, predict on the evaluation features and print its metrics.

    Args:
        name: Prefix used for the printed lines (e.g. "SVM").
        model: Unfitted estimator exposing `fit` and `predict`.
        X_fit: Feature matrix used for fitting.
        y_fit: Labels used for fitting.
        X_eval: Feature matrix the predictions are made for.
        y_eval: True labels the predictions are scored against.

    Returns:
        The predictions produced for `X_eval`.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    print(f"{name} Accuracy:", accuracy_score(y_eval, predictions))
    print(
        f"{name} Classification Report:\n",
        classification_report(y_eval, predictions, zero_division=1),
    )
    return predictions


# Take the string labels straight from the split frames instead of the
# notebook-level y_train / y_test. Those names are rebound to integer codes by
# the XGBoost cell, so re-running this cell afterwards would otherwise train on
# integers and the later `le.transform(svm_pred)` would no longer recognise the
# predicted values.
y_train_labels = train_data['subreddit']
y_test_labels = test_data['subreddit']

# Vectorizing the text data.
# The matrices are kept sparse: SVC and RandomForestClassifier both accept
# scipy sparse input, and densifying 16000 training rows by up to 5000 float64
# features would allocate roughly 640 MB for the training matrix alone.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data['preprocessed_posts'])
X_test = vectorizer.transform(test_data['preprocessed_posts'])

# SVM Model
svm_model = SVC(kernel='linear', random_state=42)
svm_pred = _fit_and_report(
    "SVM", svm_model, X_train, y_train_labels, X_test, y_test_labels
)

# Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pred = _fit_and_report(
    "Random Forest", rf_model, X_train, y_train_labels, X_test, y_test_labels
)

SVM Accuracy: 0.85375
SVM Classification Report:
               precision    recall  f1-score   support

     Anxiety       0.87      0.83      0.85      1981
  depression       0.84      0.88      0.86      2019

    accuracy                           0.85      4000
   macro avg       0.85      0.85      0.85      4000
weighted avg       0.85      0.85      0.85      4000

Random Forest Accuracy: 0.85625
Random Forest Classification Report:
               precision    recall  f1-score   support

     Anxiety       0.87      0.83      0.85      1981
  depression       0.84      0.88      0.86      2019

    accuracy                           0.86      4000
   macro avg       0.86      0.86      0.86      4000
weighted avg       0.86      0.86      0.86      4000



In [None]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Learn the subreddit-name -> integer mapping from the training labels only
le = LabelEncoder()
le.fit(train_data['subreddit'])

# NOTE: y_train / y_test are rebound here; from this cell onward they hold
# integer codes rather than the subreddit names assigned in the TF-IDF cell.
y_train = le.transform(train_data['subreddit'])
y_test = le.transform(test_data['subreddit'])

# Train an XGBoost classifier with default settings on the TF-IDF features
xgb = XGBClassifier()
xgb.fit(X_train_tfidf, y_train)

# Predict the held-out posts and report accuracy against the encoded labels
y_pred_xgb = xgb.predict(X_test_tfidf)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.4f}")

XGBoost Accuracy: 0.8500


In [None]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder


def _score(y_true, y_pred):
    """Return ``(accuracy, weighted F1)`` for one set of predictions.

    Args:
        y_true: Encoded ground-truth labels.
        y_pred: Encoded predicted labels, aligned with `y_true`.

    Returns:
        Tuple of the accuracy score and the weighted-average F1 score.
    """
    accuracy = accuracy_score(y_true, y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    return accuracy, weighted_f1


# Fit the encoder on the training 'subreddit' column so every class seen in
# training has an integer code.
le = LabelEncoder()
le.fit(train_data['subreddit'])

# Encode the ground truth locally instead of reading the notebook-level
# `y_test`: that name holds strings after the TF-IDF cell and integer codes
# only after the XGBoost cell, so relying on it makes this cell depend on
# which cell happened to run last.
y_test_encoded = le.transform(test_data['subreddit'])

# 1. SVM Evaluation (predictions are subreddit names, so encode them first)
svm_pred_encoded = le.transform(svm_pred)
svm_accuracy, svm_f1 = _score(y_test_encoded, svm_pred_encoded)

# 2. Random Forest Evaluation (predictions are subreddit names as well)
rf_pred_encoded = le.transform(rf_pred)
rf_accuracy, rf_f1 = _score(y_test_encoded, rf_pred_encoded)

# 3. XGBoost Evaluation
# The XGBoost model was trained on LabelEncoder output fitted on the same
# training column, so its predictions are already integer codes.
xgb_accuracy, xgb_f1 = _score(y_test_encoded, y_pred_xgb)

# Collect one row per model for the comparison table
results = [
    {'Model': 'SVM', 'Accuracy': svm_accuracy, 'F1-Score': svm_f1},
    {'Model': 'Random Forest', 'Accuracy': rf_accuracy, 'F1-Score': rf_f1},
    {'Model': 'XGBoost', 'Accuracy': xgb_accuracy, 'F1-Score': xgb_f1},
]

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

        Model  Accuracy  F1-Score
          SVM   0.85375  0.853654
Random Forest   0.85625  0.856160
      XGBoost   0.85000  0.849702
