# Mental Health Detection from reddit posts using NLP and LogisticRegression

### Importing the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import shutil
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,classification_report

In [2]:
df=pd.read_csv('dreaddit-train.csv')

- In this data set we have 2838 columns and 116 number of rows

In [3]:
df.shape

(2840, 116)

In [4]:
df.columns

Index(['subreddit', 'post_id', 'sentence_range', 'text', 'id', 'label',
       'confidence', 'social_timestamp', 'social_karma', 'syntax_ari',
       ...
       'lex_dal_min_pleasantness', 'lex_dal_min_activation',
       'lex_dal_min_imagery', 'lex_dal_avg_activation', 'lex_dal_avg_imagery',
       'lex_dal_avg_pleasantness', 'social_upvote_ratio',
       'social_num_comments', 'syntax_fk_grade', 'sentiment'],
      dtype='object', length=116)

- we have 116 columns but for our analysis we need only two columns the post text and label which is 1 or 0

- Here 1 means the statement is stressful and 0 means the statement is not stressful

In [5]:
df = df[['text', 'label']]

In [6]:
df.head()

Unnamed: 0,text,label
0,"He said he had not felt that way before, sugge...",1
1,"Hey there r/assistance, Not sure if this is th...",0
2,My mom then hit me with the newspaper and it s...,1
3,"until i met my new boyfriend, he is amazing, h...",1
4,October is Domestic Violence Awareness Month a...,1


- Checked the null values in the dataset and we found 0 null values in our dataset.

In [7]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [8]:
df.duplicated().sum()

15

In [9]:
df=df.drop_duplicates()

- we found 17 duplicate rows in our dataset so we dropped all the 15 rows.

#### StopWords
- Stop words are common words in a language, like "the", "is", and "and", which are often removed during text processing tasks because they carry little semantic meaning. Removing stop words helps improve efficiency and reduce noise in natural language processing tasks.

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\upend\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Initialize Porter Stemmer
stemmer = PorterStemmer()

# Load English stopwords
english_stopwords = set(stopwords.words("english"))

def preprocess_text(text):
    # Convert text to lowercase
    text = str(text).lower()
    
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove digits and single characters
    text = re.sub(r'\b\w\b|\d+', '', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize the text and remove stopwords
    words = [word for word in text.split() if word not in english_stopwords]
    
    # Stem the words
    stemmed_words = [stemmer.stem(word) for word in words]
    
    # Join the stemmed words back into a single string
    processed_text = ' '.join(stemmed_words)
    
    return processed_text

# Apply the preprocess_text function to the "text" column of your DataFrame
df["text"] = df["text"].apply(preprocess_text)


In [12]:
df['text'][1]

'hey assist sure right place post goe current student intern sandia nation lab work survey help improv market outreach effort mani school recruit around countri look current undergradgrad stem student your stem student know stem student would greatli appreci help take pass along short survey thank everyon help take survey enter draw chanc win one three amazon gc'

In [13]:
test_df=pd.read_csv('dreaddit-test.csv')

In [14]:
test_df=test_df[['text','label']]

In [15]:
test_df['text']=test_df['text'].apply(preprocess_text)

In [16]:
df['text']=df['text'].apply(preprocess_text)

### Splitting the data into training and testing

In [17]:
X_train=df['text']
X_test=test_df['text']
y_train=df['label']
y_test=test_df['label']

### Converting the words to numeric using tfidf

In [18]:
# Convert the Pandas Series X_train and X_test to sparse matrices using TfidfVectorizer
# vectorizer = TfidfVectorizer()
# X_train_sparse = vectorizer.fit_transform(X_train)
# X_test_sparse = vectorizer.transform(X_test)

# # Convert the sparse matrices to dense NumPy arrays
# X_train = X_train_sparse.toarray()
# X_test = X_test_sparse.toarray()

from sklearn.feature_extraction.text import CountVectorizer
max_features=2000

cou_vec=CountVectorizer(max_features=max_features, stop_words="english" , ngram_range=(1,2))

X_train_sparse=cou_vec.fit_transform(X_train)
X_test_sparse=cou_vec.transform(X_test)

X_train=X_train_sparse.toarray()
X_test=X_test_sparse.toarray()

### Model Training
- Logistic Regression
- KNN Classifier
- RandomForest Classifier
- DecisionTree Classifier
- Naive Bayes Classifier
- AdaBoost Classifier
- SVM Classifier

#### Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

lr_clf=LogisticRegression(max_iter=200)
lr_clf.fit(X_train,y_train)

In [20]:
print('Training Score:',lr_clf.score(X_train,y_train))
print('Testing Score:',lr_clf.score(X_test,y_test))

Training Score: 0.9709734513274336
Testing Score: 0.6937062937062937


In [21]:
from sklearn.model_selection import cross_val_score
accuracies=cross_val_score(estimator=lr_clf,X=X_train,y=y_train,cv=10)
print("Mean Accuracy: ",accuracies.mean())
print("Standard Deviation: ",accuracies.std())

Mean Accuracy:  0.7146831065333433
Standard Deviation:  0.02840477036220006


In [22]:
lr_pred=lr_clf.predict(X_test)
accuracy = accuracy_score(lr_pred,y_test)
precision = precision_score(lr_pred,y_test)
recall = recall_score(lr_pred,y_test)

print("Accuracy:", round(accuracy*100,2),'%')
print("Precision:", round(precision*100,2),'%')
print("Recall:", round(recall*100,2),'%')
print(classification_report(lr_pred,y_test))

Accuracy: 69.37 %
Precision: 72.63 %
Recall: 69.43 %
              precision    recall  f1-score   support

           0       0.66      0.69      0.68       329
           1       0.73      0.69      0.71       386

    accuracy                           0.69       715
   macro avg       0.69      0.69      0.69       715
weighted avg       0.70      0.69      0.69       715



#### KNN Classifier

In [23]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf=KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train,y_train)

In [24]:
print('Training Score:',knn_clf.score(X_train,y_train))
print('Testing Score:',knn_clf.score(X_test,y_test))

Training Score: 0.9150442477876106
Testing Score: 0.5370629370629371


In [25]:
accuracies=cross_val_score(estimator=knn_clf,X=X_train,y=y_train,cv=10)
print("Mean Accuracy: ",accuracies.mean())
print("Standard Deviation: ",accuracies.std())

Mean Accuracy:  0.5529133147883617
Standard Deviation:  0.020949989414207445


In [26]:
knn_pred=knn_clf.predict(X_test)
accuracy = accuracy_score(knn_pred,y_test)
precision = precision_score(knn_pred,y_test)
recall = recall_score(knn_pred,y_test)

print("Accuracy:", round(accuracy*100,2),'%')
print("Precision:", round(precision*100,2),'%')
print("Recall:", round(recall*100,2),'%')
print(classification_report(knn_pred,y_test))

Accuracy: 53.71 %
Precision: 20.05 %
Recall: 67.27 %
              precision    recall  f1-score   support

           0       0.90      0.51      0.65       605
           1       0.20      0.67      0.31       110

    accuracy                           0.54       715
   macro avg       0.55      0.59      0.48       715
weighted avg       0.79      0.54      0.60       715



#### Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

rf_clf=RandomForestClassifier(n_estimators=50, random_state=42)
rf_clf.fit(X_train,y_train)

In [28]:
print('Training Score:',rf_clf.score(X_train,y_train))
print('Testing Score:',rf_clf.score(X_test,y_test))

Training Score: 0.9989380530973452
Testing Score: 0.7202797202797203


In [29]:
accuracies=cross_val_score(estimator=rf_clf,X=X_train,y=y_train,cv=10)
print("Mean Accuracy: ",accuracies.mean())
print("Standard Deviation: ",accuracies.std())

Mean Accuracy:  0.7065471267824475
Standard Deviation:  0.0364333259883362


In [30]:
rf_pred=rf_clf.predict(X_test)
accuracy = accuracy_score(rf_pred,y_test)
precision = precision_score(rf_pred,y_test)
recall = recall_score(rf_pred,y_test)

print("Accuracy:", round(accuracy*100,2),'%')
print("Precision:", round(precision*100,2),'%')
print("Recall:", round(recall*100,2),'%')
print(classification_report(rf_pred,y_test))

Accuracy: 72.03 %
Precision: 81.84 %
Recall: 69.43 %
              precision    recall  f1-score   support

           0       0.62      0.76      0.68       280
           1       0.82      0.69      0.75       435

    accuracy                           0.72       715
   macro avg       0.72      0.73      0.72       715
weighted avg       0.74      0.72      0.72       715



#### Decision Tree

In [31]:
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(criterion='entropy',max_depth=2)
tree_clf.fit(X_train,y_train)

In [32]:
print('Training Score:',tree_clf.score(X_train,y_train))
print('Testing Score:',tree_clf.score(X_test,y_test))

Training Score: 0.6194690265486725
Testing Score: 0.6027972027972028


In [33]:
accuracies=cross_val_score(estimator=tree_clf,X=X_train,y=y_train,cv=10)
print("Mean Accuracy: ",accuracies.mean())
print("Standard Deviation: ",accuracies.std())

Mean Accuracy:  0.6183971129990227
Standard Deviation:  0.021084099934333304


In [34]:
tree_pred=tree_clf.predict(X_test)
accuracy = accuracy_score(tree_pred,y_test)
precision = precision_score(tree_pred,y_test)
recall = recall_score(tree_pred,y_test)

print("Accuracy:", round(accuracy*100,2),'%')
print("Precision:", round(precision*100,2),'%')
print("Recall:", round(recall*100,2),'%')
print(classification_report(tree_pred,y_test))

Accuracy: 60.28 %
Precision: 46.61 %
Recall: 66.41 %
              precision    recall  f1-score   support

           0       0.75      0.57      0.65       456
           1       0.47      0.66      0.55       259

    accuracy                           0.60       715
   macro avg       0.61      0.62      0.60       715
weighted avg       0.65      0.60      0.61       715



#### Naive Bayes

In [35]:
from sklearn.naive_bayes import GaussianNB
gnb_clf = GaussianNB()
gnb_clf.fit(X_train, y_train)

In [36]:
print('Training Score:',gnb_clf.score(X_train,y_train))
print('Testing Score:',gnb_clf.score(X_test,y_test))

Training Score: 0.8297345132743363
Testing Score: 0.6909090909090909


In [37]:
accuracies=cross_val_score(estimator=gnb_clf,X=X_train,y=y_train,cv=10)
print("Mean Accuracy: ",accuracies.mean())
print("Standard Deviation: ",accuracies.std())

Mean Accuracy:  0.6725647194446533
Standard Deviation:  0.020670641006127106


In [38]:
gnb_pred=gnb_clf.predict(X_test)
accuracy = accuracy_score(gnb_pred,y_test)
precision = precision_score(gnb_pred,y_test)
recall = recall_score(gnb_pred,y_test)

print("Accuracy:", round(accuracy*100,2),'%')
print("Precision:", round(precision*100,2),'%')
print("Recall:", round(recall*100,2),'%')
print(classification_report(gnb_pred,y_test))

Accuracy: 69.09 %
Precision: 65.58 %
Recall: 72.02 %
              precision    recall  f1-score   support

           0       0.73      0.66      0.70       379
           1       0.66      0.72      0.69       336

    accuracy                           0.69       715
   macro avg       0.69      0.69      0.69       715
weighted avg       0.69      0.69      0.69       715



#### Adaboost Classifier 

In [39]:
from sklearn.ensemble import AdaBoostClassifier
ab_clf=AdaBoostClassifier()
ab_clf.fit(X_train,y_train)



In [40]:
print('Training Score:',ab_clf.score(X_train,y_train))
print('Testing Score:',ab_clf.score(X_test,y_test))

Training Score: 0.7288495575221239
Testing Score: 0.6965034965034965


In [41]:
accuracies=cross_val_score(estimator=ab_clf,X=X_train,y=y_train,cv=10)
print("Mean Accuracy: ",accuracies.mean())
print("Standard Deviation: ",accuracies.std())



Mean Accuracy:  0.6814111720923239
Standard Deviation:  0.02251875375088833


In [42]:
ab_pred=ab_clf.predict(X_test)
accuracy = accuracy_score(ab_pred,y_test)
precision = precision_score(ab_pred,y_test)
recall = recall_score(ab_pred,y_test)

print("Accuracy:", round(accuracy*100,2),'%')
print("Precision:", round(precision*100,2),'%')
print("Recall:", round(recall*100,2),'%')
print(classification_report(ab_pred,y_test))

Accuracy: 69.65 %
Precision: 71.82 %
Recall: 70.11 %
              precision    recall  f1-score   support

           0       0.67      0.69      0.68       337
           1       0.72      0.70      0.71       378

    accuracy                           0.70       715
   macro avg       0.70      0.70      0.70       715
weighted avg       0.70      0.70      0.70       715



#### SVM

In [43]:
from sklearn.svm import SVC
svm = SVC(kernel="rbf", gamma=0.5, C=1.0)
svm.fit(X_train, y_train)

In [44]:
print('Training Score:',svm.score(X_train,y_train))
print('Testing Score:',svm.score(X_test,y_test))

Training Score: 0.9989380530973452
Testing Score: 0.5146853146853146


In [45]:
accuracies=cross_val_score(estimator=svm,X=X_train,y=y_train,cv=10)
print("Mean Accuracy: ",accuracies.mean())
print("Standard Deviation: ",accuracies.std())

Mean Accuracy:  0.5316755632408591
Standard Deviation:  0.006894642636963321


In [46]:
svm_pred=svm.predict(X_test)
accuracy = accuracy_score(svm_pred,y_test)
precision = precision_score(svm_pred,y_test)
recall = recall_score(svm_pred,y_test)

print("Accuracy:", round(accuracy*100,2),'%')
print("Precision:", round(precision*100,2),'%')
print("Recall:", round(recall*100,2),'%')
print(classification_report(svm_pred,y_test))

Accuracy: 51.47 %
Precision: 99.73 %
Recall: 51.54 %
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       1.00      0.52      0.68       714

    accuracy                           0.51       715
   macro avg       0.50      0.26      0.34       715
weighted avg       1.00      0.51      0.68       715



# __________________________________________________

#### We got the accuracies of differt models above in which LogisticRegreesion has the highest accuray with - 73.99%. Below we will check the folloing models to test individual input 

In [47]:
prompt = input()
prompt = cou_vec.transform([prompt]).toarray()

# Dictionary to store predictions along with corresponding classifier names
classifier_names = {
    "Logistic Regression": lr_clf,
    "K-Nearest Neighbors": knn_clf,
    "Random Forest": rf_clf,
    "Decision Tree": tree_clf,
    "Gaussian Naive Bayes": gnb_clf,
    "AdaBoost": ab_clf
}

pred_dict = {}

# Loop through each classifier and make predictions
for name, clf in classifier_names.items():
    pred = clf.predict(prompt)[0]
    if pred == 1:
        pred_dict[name] = "Stressed"
    else:
        pred_dict[name] = "Not Stressed"

# Print predictions
for clf_name, pred_label in pred_dict.items():
    print(f"{clf_name}: {pred_label}\n")


happy
Logistic Regression: Not Stressed

K-Nearest Neighbors: Not Stressed

Random Forest: Not Stressed

Decision Tree: Not Stressed

Gaussian Naive Bayes: Not Stressed

AdaBoost: Not Stressed



In [48]:
import pickle
pickle.dump(rf_clf,open('clf.pkl','wb'))
pickle.dump(cou_vec,open('cv.pkl','wb'))