 Import Library and Load Data Set

In [None]:
import pandas as pd
import re

# Load the pre-cleaned dataset; later cells read its "text" and "status" columns.
DATA_PATH = "mental_health_cleaned.csv"
df = pd.read_csv(DATA_PATH)

Split the data: 80% for training, 20% for testing

In [None]:

from sklearn.model_selection import train_test_split

# Model input (the post text) and the label the classifier has to predict.
X, y = df["text"], df["status"]

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# distribution the same in both splits; the fixed seed makes the split
# reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# TF-IDF

Note: the lemmatization cell has been removed; the `text` column is vectorized as loaded.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over unigrams and bigrams, lowercased, with English stop
# words removed and the vocabulary capped at 8000 features.
tfidf_params = {
    "lowercase": True,
    "analyzer": "word",
    "stop_words": "english",
    "ngram_range": (1, 2),
    "max_features": 8000,
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Fit on the training split only, then reuse the fitted vectorizer on the
# test split so both matrices share the same feature columns.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print(X_train_vec.shape)  # (num_train_samples, num_features)

# Cell output: number of rows per label in the full dataset.
df["status"].value_counts()

# Logistic Regression: One vs Rest

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# Settings for the base binary classifier that the one-vs-rest wrapper uses
# for each class.
ovr_lr_params = dict(
    solver="liblinear",
    max_iter=5000,
    random_state=42,
    class_weight="balanced",
)
base_lr = LogisticRegression(**ovr_lr_params)

# Wrap the base model in one-vs-rest and train it on the TF-IDF features
# (fit returns the fitted estimator, so the calls can be chained).
clf_ovr = OneVsRestClassifier(base_lr).fit(X_train_vec, y_train)

# Predict on the held-out split and report per-class metrics.
pred_ovr = clf_ovr.predict(X_test_vec)
print(classification_report(y_test, pred_ovr))



# Model examples...

In [None]:
def predict_text(text):
    """Return the one-vs-rest model's predicted label for a single text.

    The text is wrapped in a one-element list, transformed with the
    notebook-level ``vectorizer``, and classified by ``clf_ovr``; the first
    (only) prediction is returned.
    """
    features = vectorizer.transform([text])
    predictions = clf_ovr.predict(features)
    return predictions[0]


# Hand-written example inputs; the predicted label for each is printed in order.
example_texts = [
    "i feel fine today just checking in",
    "i am anxious and stressed and cannot sleep",
    "i want to die i can't do this anymore",
    "i am very sad and tired all the time",
    "i fear the dog, but i just ate, i cant believe his happiness",
    "i feel empty and lost, like nothing matters anymore",
    "I cant breath and I feel sweaty like I am going to pass out and my heart is racing",
    "i want like end my life and i am so angry i could hit a wall",
    "i am very afraid but super happy about my new job",
    "i am tanka jahari and I would never buy a whole pizza for myself",
    "I feel hopeless and worthless, like nothing will ever get better",
    "bye hope to see you never",
    "i miss my dogs!!",
]

for example in example_texts:
    print(predict_text(example))


# Top words for each category found in user posts

In [None]:
import numpy as np

feature_names = np.array(vectorizer.get_feature_names_out())
TOP_N = 15

print("===== TOP WORDS PER CLASS (OvR Logistic Regression) =====")

# Walk the class labels and the fitted per-class binary models in parallel.
for cls, binary_model in zip(clf_ovr.classes_, clf_ovr.estimators_):
    # Flatten the binary model's weights to a 1-D vector.
    coefs = binary_model.coef_.ravel()

    # Feature indices ordered from largest to smallest weight.
    descending = np.argsort(coefs)[::-1]

    print(f"\n=== {cls} ===")
    for j in descending[:TOP_N]:
        print(f"  {feature_names[j]:<25} {coefs[j]:.4f}")


# Multinomial Logistic Regression


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


# Single logistic regression over all classes, trained on the same TF-IDF
# features as the one-vs-rest model above.
# NOTE(review): the explicit multi_class="multinomial" argument was dropped
# because it is deprecated in recent scikit-learn releases; with the lbfgs
# solver a multinomial model is already what gets fitted when the target has
# more than two classes. Confirm `status` has more than two classes.
clf = LogisticRegression(
    solver="lbfgs",
    class_weight="balanced",
    max_iter=5000
)
clf.fit(X_train_vec, y_train)
pred = clf.predict(X_test_vec)

# `all_target_names` was never defined in this notebook, so passing it as
# target_names raised NameError on a fresh kernel. Without target_names the
# report rows are labelled with the class values themselves, matching the
# one-vs-rest report.
print(classification_report(y_test, pred))


Top words for Multinomial Logistic Regression


Analysis: What words were commonly overlapped between different classes?

