In [16]:
import pandas as pd


In [35]:
df = pd.read_csv("/home/redleaf/Documents/DUK/NLP/NLP_PROJECT_DEPRESSION/dev_with_labels.tsv", delimiter='\t')
df = df.rename(columns={'Text data':'Review'}, inplace=False)
df.head()

Unnamed: 0,PID,Review,Label
0,dev_pid_1,"I enjoyed today, and I still am! Tomorrows dep...",moderate
1,dev_pid_2,I sorta tried to kill myself : I had a total b...,moderate
2,dev_pid_3,Best suicide method? : I like it quick and eas...,moderate
3,dev_pid_4,a story : I remember the time I'd get on my 3D...,moderate
4,dev_pid_5,The world only cares about beautiful people : ...,moderate


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4496 entries, 0 to 4495
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   PID        4496 non-null   object
 1   Text data  4496 non-null   object
 2   Label      4496 non-null   object
dtypes: object(3)
memory usage: 105.5+ KB


In [14]:
df.isna().sum(0)

PID          0
Text data    0
Label        0
dtype: int64

In [18]:
df['Label'].value_counts()

Label
moderate          2306
not depression    1830
severe             360
Name: count, dtype: int64

### Train Test Split

In [19]:
from sklearn.model_selection import train_test_split

X = df.Review
Y = df.Label

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, random_state=1)

# TF-IDF

## Logistic Regression

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

model_pipeline_lr = Pipeline([('tfidf',TfidfVectorizer()), ('lr',LogisticRegression())])

#trian the model
model_pipeline_lr.fit(X_train, Y_train)

#Predict on the test set
y_pred_lr = model_pipeline_lr.predict(X_test)

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

accuracy_lr = accuracy_score(Y_test, y_pred_lr)
precision_lr = precision_score(Y_test, y_pred_lr, average='weighted')
recall_lr = recall_score(Y_test, y_pred_lr, average='weighted')
f1_score_lr = f1_score(Y_test,y_pred_lr, average='weighted')


In [25]:
classification_report_lr = classification_report(Y_test,y_pred_lr)

print("Evaluation Metrics for Logistic Regression Model")
print("------------------------------------------------")
print(classification_report_lr)

Evaluation Metrics for Logistic Regression Model
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.61      0.78      0.69       455
not depression       0.64      0.55      0.60       363
        severe       0.71      0.06      0.11        82

      accuracy                           0.62       900
     macro avg       0.66      0.47      0.47       900
  weighted avg       0.63      0.62      0.60       900



## SVM

In [27]:
from sklearn.svm import SVC

model_pipeline_svm = Pipeline([('tfidf',TfidfVectorizer()), ('svm',SVC())])

#trian the model
model_pipeline_svm.fit(X_train, Y_train)

#Predict on the test set
y_pred_svm = model_pipeline_svm.predict(X_test)

In [28]:
classification_report_svm = classification_report(Y_test,y_pred_svm)

print("Evaluation Metrics for SVM Model")
print("------------------------------------------------")
print(classification_report_svm)

Evaluation Metrics for Logistic Regression Model
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.61      0.80      0.69       455
not depression       0.66      0.54      0.59       363
        severe       0.60      0.04      0.07        82

      accuracy                           0.62       900
     macro avg       0.62      0.46      0.45       900
  weighted avg       0.63      0.62      0.59       900



## Decision Tree

In [29]:
from sklearn.tree import DecisionTreeClassifier

model_pipeline_dt = Pipeline([('tfidf', TfidfVectorizer()),('dt', DecisionTreeClassifier())])

# Train the model
model_pipeline_dt.fit(X_train, Y_train)

# Predict on the test set
y_pred_dt = model_pipeline_dt.predict(X_test)

In [31]:
classification_report_dt = classification_report(Y_test, y_pred_dt)

# Print evaluation metrics
print("Evaluation Metrics for Decision Tree Model")
print("------------------------------------------------")
print(classification_report_dt)

Evaluation Metrics for Logistic Regression Model
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.57      0.59      0.58       455
not depression       0.53      0.53      0.53       363
        severe       0.26      0.20      0.22        82

      accuracy                           0.53       900
     macro avg       0.45      0.44      0.45       900
  weighted avg       0.53      0.53      0.53       900



## Random Forest

In [32]:
from sklearn.ensemble import RandomForestClassifier

model_pipeline_rf = Pipeline([('tfidf', TfidfVectorizer()),('rf', RandomForestClassifier())])

# Train the model
model_pipeline_rf.fit(X_train, Y_train)

# Predict on the test set
y_pred_rf = model_pipeline_rf.predict(X_test)

In [33]:
classification_report_rf = classification_report(Y_test, y_pred_rf)

# Print evaluation metrics
print("Evaluation Metrics for Random Forest Model")
print("------------------------------------------------")
print(classification_report_rf)

Evaluation Metrics for Random Forest Model
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.59      0.78      0.67       455
not depression       0.63      0.52      0.57       363
        severe       0.50      0.01      0.02        82

      accuracy                           0.60       900
     macro avg       0.57      0.44      0.42       900
  weighted avg       0.60      0.60      0.57       900

