In [1]:
import pandas as pd

# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
raw_path = "/home/redleaf/Documents/DUK/NLP/NLP_PROJECT_DEPRESSION/dev_with_labels.tsv"

# Read the tab-separated file and expose the 'Text data' column under the shorter name 'Review'.
df = pd.read_csv(raw_path, sep='\t').rename(columns={'Text data': 'Review'})
df.head()

Unnamed: 0,PID,Review,Label
0,dev_pid_1,"I enjoyed today, and I still am! Tomorrows dep...",moderate
1,dev_pid_2,I sorta tried to kill myself : I had a total b...,moderate
2,dev_pid_3,Best suicide method? : I like it quick and eas...,moderate
3,dev_pid_4,a story : I remember the time I'd get on my 3D...,moderate
4,dev_pid_5,The world only cares about beautiful people : ...,moderate


In [4]:
# Summarise the frame: column names, non-null counts, dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4496 entries, 0 to 4495
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   PID     4496 non-null   object
 1   Review  4496 non-null   object
 2   Label   4496 non-null   object
dtypes: object(3)
memory usage: 105.5+ KB


In [5]:
# Count missing values per column (row-wise sum of the boolean null mask).
df.isnull().sum(axis=0)

PID       0
Review    0
Label     0
dtype: int64

In [6]:
# Class distribution of the target column, most frequent label first.
df.Label.value_counts()

Label
moderate          2306
not depression    1830
severe             360
Name: count, dtype: int64

### Train Test Split

In [2]:
from sklearn.model_selection import train_test_split

# Inputs are the review texts; targets are the label strings.
X, Y = df['Review'], df['Label']

# 80% of the rows go to training, the remainder to testing; the fixed seed keeps
# the split repeatable.
# NOTE(review): the split is not stratified even though the label counts are imbalanced.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=1,
)

In [5]:
# Show the training document whose index label is 0 (a label lookup, not "first row");
# this only works while that row is part of the training split.
print(X_train.loc[0])

I enjoyed today, and I still am! Tomorrows depression can wait! : Today, I'm warm and cozy :) I even had some tea, to complement my sandwich!
This is a crazy time we live in, but it's only the beginning, with the world going to shambles, being alone feels werid.
But a stranger came through and saved me,
Of course it's a debt I'll have to repay, but I'm grateful to have this debt!
Crazy what a few dollars can do to change someones situation.
But these days, it's all about here and now!
I don't have the ability to relax just because, I'm full, clean, and warm.
Tomorrow it's back to reality! 
The motel was just to get ourself together! We been out here panicking the past two weeks and the last 3 days exspecially.
I got blessed this week that's for sure!
Two weeks till payday.
What to do, what to do.
Well I can't just pull another loan, so I gotta make what's left it last. 
Planet fitness anyone???!! 
Okay actually, gas wise, idk, finding safe places to park for extended periods of time, i

In [3]:
# Report the container type before and after splitting, then the type of one element.
for obj in (X, X_train, X_train.loc[0]):
    print(type(obj))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'str'>


# TF-IDF

## Logistic Regression

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# TF-IDF features feeding a logistic-regression classifier, both with default settings.
lr_steps = [
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression()),
]
model_pipeline_lr = Pipeline(steps=lr_steps)

# Fit on the training texts, then label the held-out texts (fit returns the pipeline).
y_pred_lr = model_pipeline_lr.fit(X_train, Y_train).predict(X_test)

In [34]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)

# Overall accuracy, plus precision / recall / F1 averaged over classes weighted by support.
accuracy_lr = accuracy_score(y_true=Y_test, y_pred=y_pred_lr)
precision_lr = precision_score(y_true=Y_test, y_pred=y_pred_lr, average='weighted')
recall_lr = recall_score(y_true=Y_test, y_pred=y_pred_lr, average='weighted')
f1_score_lr = f1_score(y_true=Y_test, y_pred=y_pred_lr, average='weighted')


In [35]:
classification_report_lr = classification_report(y_true=Y_test, y_pred=y_pred_lr)

# Title, rule, then the per-class precision / recall / F1 table, one item per line.
print(
    "Evaluation Metrics for Logistic Regression Model",
    "------------------------------------------------",
    classification_report_lr,
    sep="\n",
)

Evaluation Metrics for Logistic Regression Model
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.61      0.78      0.69       455
not depression       0.64      0.55      0.60       363
        severe       0.71      0.06      0.11        82

      accuracy                           0.62       900
     macro avg       0.66      0.47      0.47       900
  weighted avg       0.63      0.62      0.60       900



## SVM

In [36]:
from sklearn.svm import SVC

# Same TF-IDF front end, now followed by a default-configured support-vector classifier.
svm_steps = [
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC()),
]
model_pipeline_svm = Pipeline(steps=svm_steps)

# Fit on the training texts, then label the held-out texts (fit returns the pipeline).
y_pred_svm = model_pipeline_svm.fit(X_train, Y_train).predict(X_test)

In [37]:
classification_report_svm = classification_report(y_true=Y_test, y_pred=y_pred_svm)

# Title, rule, then the per-class table, one item per line.
print(
    "Evaluation Metrics for SVM Model",
    "--------------------------------",
    classification_report_svm,
    sep="\n",
)

Evaluation Metrics for SVM Model
--------------------------------
                precision    recall  f1-score   support

      moderate       0.61      0.80      0.69       455
not depression       0.66      0.54      0.59       363
        severe       0.60      0.04      0.07        82

      accuracy                           0.62       900
     macro avg       0.62      0.46      0.45       900
  weighted avg       0.63      0.62      0.59       900



## Decision Tree

In [38]:
from sklearn.tree import DecisionTreeClassifier

# TF-IDF features feeding a decision tree.
# The tree is seeded: scikit-learn permutes candidate features at every split, so an
# unseeded tree can give different scores on every run of this cell.
model_pipeline_dt = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dt', DecisionTreeClassifier(random_state=1)),
])

# Train the model
model_pipeline_dt.fit(X_train, Y_train)

# Predict on the test set
y_pred_dt = model_pipeline_dt.predict(X_test)

In [39]:
classification_report_dt = classification_report(Y_test, y_pred_dt)

# Print evaluation metrics. The rule is sized from the title so the two always line up
# (the previous fixed-width rule was longer than this title).
title_dt = "Evaluation Metrics for Decision Tree Model"
print(title_dt)
print("-" * len(title_dt))
print(classification_report_dt)

Evaluation Metrics for Decision Tree Model
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.59      0.61      0.60       455
not depression       0.53      0.52      0.52       363
        severe       0.27      0.26      0.26        82

      accuracy                           0.54       900
     macro avg       0.46      0.46      0.46       900
  weighted avg       0.54      0.54      0.54       900



## Random Forest

In [40]:
from sklearn.ensemble import RandomForestClassifier

# TF-IDF features feeding a random forest.
# The forest is seeded: bootstrap sampling and per-split feature sampling are random,
# so an unseeded forest gives different scores on every run of this cell.
model_pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=1)),
])

# Train the model
model_pipeline_rf.fit(X_train, Y_train)

# Predict on the test set
y_pred_rf = model_pipeline_rf.predict(X_test)

In [41]:
classification_report_rf = classification_report(Y_test, y_pred_rf)

# Print evaluation metrics. The rule is sized from the title so the two always line up
# (the previous fixed-width rule was longer than this title).
title_rf = "Evaluation Metrics for Random Forest Model"
print(title_rf)
print("-" * len(title_rf))
print(classification_report_rf)

Evaluation Metrics for Random Forest Model
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.59      0.78      0.68       455
not depression       0.64      0.53      0.58       363
        severe       0.50      0.01      0.02        82

      accuracy                           0.61       900
     macro avg       0.58      0.44      0.43       900
  weighted avg       0.60      0.61      0.58       900



#### After stop-word removal

In [18]:
import nltk
# One-time setup on a fresh environment: nltk.download('stopwords')
# NOTE(review): nltk.word_tokenize also needs a tokenizer model to be installed — confirm which one.

# English stop words plus a handful of punctuation tokens to discard.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['.', ',', '!', '?', ';', ':'])

# Frozen snapshot used for membership tests: a set lookup is O(1), whereas scanning the
# list for every token is linear in the list length. The list itself is kept unchanged
# for any code that expects a list.
_STOPWORD_SET = frozenset(stopwords)


def remove_stopwords(text):
    """Return `text` without stop words and the listed punctuation tokens.

    The text is split with ``nltk.word_tokenize``; a token is dropped when its
    lower-cased form is in the stop-word set. Surviving tokens keep their
    original casing and are re-joined with single spaces.
    """
    kept_tokens = [
        token
        for token in nltk.word_tokenize(text)
        if token.lower() not in _STOPWORD_SET
    ]
    return ' '.join(kept_tokens)


# Overwrites the 'Review' column in place: cells run after this one see only the
# filtered text, and the unfiltered text must be reloaded from disk to get it back.
df['Review'] = df['Review'].apply(remove_stopwords)
df.head()

Unnamed: 0,PID,Review,Label
0,dev_pid_1,enjoyed today still Tomorrows depression wait ...,moderate
1,dev_pid_2,sorta tried kill total breakdown fucking car p...,moderate
2,dev_pid_3,Best suicide method like quick easy deformitie...,moderate
3,dev_pid_4,story remember time 'd get 3DS play Nintendogs...,moderate
4,dev_pid_5,world cares beautiful people 'm born ugly 've ...,moderate


### Train Test Split

In [19]:
from sklearn.model_selection import train_test_split

# Rebuild inputs and targets from the current contents of the frame.
X, Y = df['Review'], df['Label']

# 80% of the rows go to training, the remainder to testing; the fixed seed keeps
# the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=1,
)

# TF-IDF

## Logistic Regression

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# TF-IDF features feeding a logistic-regression classifier, both with default settings.
lr_steps = [
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression()),
]
model_pipeline_lr = Pipeline(steps=lr_steps)

# Fit on the training texts, then label the held-out texts (fit returns the pipeline).
y_pred_lr = model_pipeline_lr.fit(X_train, Y_train).predict(X_test)

In [21]:
from sklearn.metrics import classification_report

classification_report_lr = classification_report(y_true=Y_test, y_pred=y_pred_lr)

# Title, rule, then the per-class precision / recall / F1 table, one item per line.
print(
    "Evaluation Metrics for Logistic Regression Model",
    "------------------------------------------------",
    classification_report_lr,
    sep="\n",
)

Evaluation Metrics for Logistic Regression Model
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.61      0.77      0.68       455
not depression       0.63      0.55      0.59       363
        severe       0.73      0.10      0.17        82

      accuracy                           0.62       900
     macro avg       0.66      0.47      0.48       900
  weighted avg       0.63      0.62      0.60       900



## SVM

In [22]:
from sklearn.svm import SVC

# Same TF-IDF front end, now followed by a default-configured support-vector classifier.
svm_steps = [
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC()),
]
model_pipeline_svm = Pipeline(steps=svm_steps)

# Fit on the training texts, then label the held-out texts (fit returns the pipeline).
y_pred_svm = model_pipeline_svm.fit(X_train, Y_train).predict(X_test)

In [23]:
classification_report_svm = classification_report(y_true=Y_test, y_pred=y_pred_svm)

# Title, rule, then the per-class table, one item per line.
print(
    "Evaluation Metrics for SVM Model",
    "--------------------------------",
    classification_report_svm,
    sep="\n",
)

Evaluation Metrics for SVM Model
--------------------------------
                precision    recall  f1-score   support

      moderate       0.60      0.80      0.68       455
not depression       0.64      0.50      0.56       363
        severe       0.62      0.06      0.11        82

      accuracy                           0.61       900
     macro avg       0.62      0.45      0.45       900
  weighted avg       0.62      0.61      0.58       900



## Decision Tree

In [24]:
from sklearn.tree import DecisionTreeClassifier

# TF-IDF features feeding a decision tree.
# The tree is seeded: scikit-learn permutes candidate features at every split, so an
# unseeded tree can give different scores on every run of this cell.
model_pipeline_dt = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dt', DecisionTreeClassifier(random_state=1)),
])

# Train the model
model_pipeline_dt.fit(X_train, Y_train)

# Predict on the test set
y_pred_dt = model_pipeline_dt.predict(X_test)

In [26]:
classification_report_dt = classification_report(y_true=Y_test, y_pred=y_pred_dt)

# Title, rule, then the per-class table, one item per line.
print(
    "Evaluation Metrics for Decision Tree Model",
    "------------------------------------------",
    classification_report_dt,
    sep="\n",
)

Evaluation Metrics for Decision Tree Model
------------------------------------------
                precision    recall  f1-score   support

      moderate       0.59      0.55      0.57       455
not depression       0.53      0.58      0.55       363
        severe       0.26      0.23      0.25        82

      accuracy                           0.54       900
     macro avg       0.46      0.46      0.46       900
  weighted avg       0.53      0.54      0.53       900



## Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

# TF-IDF features feeding a random forest.
# The forest is seeded: bootstrap sampling and per-split feature sampling are random,
# so an unseeded forest gives different scores on every run of this cell.
model_pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=1)),
])

# Train the model
model_pipeline_rf.fit(X_train, Y_train)

# Predict on the test set
y_pred_rf = model_pipeline_rf.predict(X_test)

In [28]:
classification_report_rf = classification_report(y_true=Y_test, y_pred=y_pred_rf)

# Title, rule, then the per-class table, one item per line.
print(
    "Evaluation Metrics for Random Forest Model",
    "------------------------------------------",
    classification_report_rf,
    sep="\n",
)

Evaluation Metrics for Random Forest Model
------------------------------------------
                precision    recall  f1-score   support

      moderate       0.60      0.80      0.68       455
not depression       0.65      0.52      0.58       363
        severe       0.50      0.01      0.02        82

      accuracy                           0.61       900
     macro avg       0.58      0.44      0.43       900
  weighted avg       0.61      0.61      0.58       900

