In [14]:
import pandas as pd
import numpy as np
import gensim.downloader as api

In [15]:
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable


In [2]:
wv = api.load('word2vec-google-news-300')



In [9]:
df = pd.read_csv("/home/redleaf/Documents/DUK/NLP/NLP_PROJECT_DEPRESSION/dev_with_labels.tsv", delimiter='\t')
df = df.rename(columns={'Text data':'Review'}, inplace=False)
df.head()

Unnamed: 0,PID,Review,Label
0,dev_pid_1,"I enjoyed today, and I still am! Tomorrows dep...",moderate
1,dev_pid_2,I sorta tried to kill myself : I had a total b...,moderate
2,dev_pid_3,Best suicide method? : I like it quick and eas...,moderate
3,dev_pid_4,a story : I remember the time I'd get on my 3D...,moderate
4,dev_pid_5,The world only cares about beautiful people : ...,moderate


In [15]:
import nltk
# nltk.download('stopwords')

stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['.', ',', '!', '?', ';', ':'])

def remove_stopwords(text):
  tokens = nltk.word_tokenize(text)
  filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
  return ' '.join(filtered_tokens)

df['Review'] = df['Review'].apply(remove_stopwords)
df.head()

Unnamed: 0,PID,Review,Label
0,dev_pid_1,enjoyed today still Tomorrows depression wait ...,moderate
1,dev_pid_2,sorta tried kill total breakdown fucking car p...,moderate
2,dev_pid_3,Best suicide method like quick easy deformitie...,moderate
3,dev_pid_4,story remember time 'd get 3DS play Nintendogs...,moderate
4,dev_pid_5,world cares beautiful people 'm born ugly 've ...,moderate


In [17]:
df['vec'] = df['Review'].apply(lambda x: np.mean([wv[token] for token in x if token in wv] or [np.zeros(wv.vector_size)], axis=0))
df.head()

Unnamed: 0,PID,Review,Label,vec
0,dev_pid_1,enjoyed today still Tomorrows depression wait ...,moderate,"[-0.16401672, 0.10549937, -0.0007737794, 0.137..."
1,dev_pid_2,sorta tried kill total breakdown fucking car p...,moderate,"[-0.14913188, 0.10237425, 0.0018854521, 0.1347..."
2,dev_pid_3,Best suicide method like quick easy deformitie...,moderate,"[-0.17838252, 0.1166004, 0.02727981, 0.141619,..."
3,dev_pid_4,story remember time 'd get 3DS play Nintendogs...,moderate,"[-0.16487704, 0.115816586, 0.004556143, 0.1351..."
4,dev_pid_5,world cares beautiful people 'm born ugly 've ...,moderate,"[-0.16109459, 0.12144897, 0.0025448436, 0.1294..."


In [18]:
X = np.vstack(df['vec'])
Y = df.Label

### Train Test Split

In [19]:
from sklearn.model_selection import train_test_split


X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, random_state=1)

## Logistic Regression

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

model_pipeline_lr = Pipeline([('lr',LogisticRegression())])

#trian the model
model_pipeline_lr.fit(X_train, Y_train)

#Predict on the test set
y_pred_lr = model_pipeline_lr.predict(X_test)

In [21]:
from sklearn.metrics import classification_report

classification_report_lr = classification_report(Y_test,y_pred_lr)

print("Evaluation Metrics for Logistic Regression Model")
print("------------------------------------------------")
print(classification_report_lr)

Evaluation Metrics for Logistic Regression Model
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.54      0.91      0.68       455
not depression       0.67      0.25      0.37       363
        severe       0.00      0.00      0.00        82

      accuracy                           0.56       900
     macro avg       0.40      0.39      0.35       900
  weighted avg       0.54      0.56      0.49       900



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## SVM

In [None]:
from sklearn.svm import SVC

model_pipeline_svm = Pipeline([('tfidf',TfidfVectorizer()), ('svm',SVC())])

#trian the model
model_pipeline_svm.fit(X_train, Y_train)

#Predict on the test set
y_pred_svm = model_pipeline_svm.predict(X_test)

In [None]:
classification_report_svm = classification_report(Y_test,y_pred_svm)

print("Evaluation Metrics for SVM Model")
print("--------------------------------")
print(classification_report_svm)

Evaluation Metrics for SVM Model
--------------------------------
                precision    recall  f1-score   support

      moderate       0.60      0.80      0.68       455
not depression       0.64      0.50      0.56       363
        severe       0.62      0.06      0.11        82

      accuracy                           0.61       900
     macro avg       0.62      0.45      0.45       900
  weighted avg       0.62      0.61      0.58       900



## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

model_pipeline_dt = Pipeline([('tfidf', TfidfVectorizer()),('dt', DecisionTreeClassifier())])

# Train the model
model_pipeline_dt.fit(X_train, Y_train)

# Predict on the test set
y_pred_dt = model_pipeline_dt.predict(X_test)

In [None]:
classification_report_dt = classification_report(Y_test, y_pred_dt)

# Print evaluation metrics
print("Evaluation Metrics for Decision Tree Model")
print("------------------------------------------")
print(classification_report_dt)

Evaluation Metrics for Decision Tree Model
------------------------------------------
                precision    recall  f1-score   support

      moderate       0.59      0.55      0.57       455
not depression       0.53      0.58      0.55       363
        severe       0.26      0.23      0.25        82

      accuracy                           0.54       900
     macro avg       0.46      0.46      0.46       900
  weighted avg       0.53      0.54      0.53       900



## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

model_pipeline_rf = Pipeline([('tfidf', TfidfVectorizer()),('rf', RandomForestClassifier())])

# Train the model
model_pipeline_rf.fit(X_train, Y_train)

# Predict on the test set
y_pred_rf = model_pipeline_rf.predict(X_test)

In [None]:
classification_report_rf = classification_report(Y_test, y_pred_rf)

# Print evaluation metrics
print("Evaluation Metrics for Random Forest Model")
print("------------------------------------------")
print(classification_report_rf)

Evaluation Metrics for Random Forest Model
------------------------------------------
                precision    recall  f1-score   support

      moderate       0.60      0.80      0.68       455
not depression       0.65      0.52      0.58       363
        severe       0.50      0.01      0.02        82

      accuracy                           0.61       900
     macro avg       0.58      0.44      0.43       900
  weighted avg       0.61      0.61      0.58       900

