In [17]:
import pandas as pd
import numpy as np
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import gensim.downloader as api

In [18]:
review = pd.read_csv('dev_with_labels.tsv', delimiter='\t')
review = review.rename(columns={'Text data': 'data'},)

In [19]:
# Load spaCy model and define stop words
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
punctuations = string.punctuation

In [20]:
# Load pre-trained GloVe word vectors
wv = api.load('glove-twitter-100')

In [21]:
# Define the tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [word.lemma_.lower().strip() for word in doc]

    # Removing stop words and punctuations
    mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]

    # Return preprocessed list of tokens
    return mytokens


In [22]:
# Tokenize the text data
review['data1'] = review['data'].apply(spacy_tokenizer)

In [23]:
wv.vector_size

100

In [24]:
# Generate word vectors for each document
review['vec'] = review['data1'].apply(lambda x: np.mean([wv[token] for token in x if token in wv] or [np.zeros(wv.vector_size)], axis=0))

In [25]:
review.head()

Unnamed: 0,PID,data,Label,data1,vec
0,dev_pid_1,"I enjoyed today, and I still am! Tomorrows dep...",moderate,"[enjoy, today, tomorrow, depression, wait, tod...","[0.051370677, 0.12030114, 0.087466985, -0.0455..."
1,dev_pid_2,I sorta tried to kill myself : I had a total b...,moderate,"[sorta, try, kill, total, breakdown, fucking, ...","[0.006347649, 0.04428071, 0.11420649, -0.04335..."
2,dev_pid_3,Best suicide method? : I like it quick and eas...,moderate,"[good, suicide, method, like, quick, easy, def...","[0.21656814, -0.19113775, -0.042504627, -0.029..."
3,dev_pid_4,a story : I remember the time I'd get on my 3D...,moderate,"[story, remember, time, 3ds, play, nintendogs,...","[0.14247186, 0.10691145, 0.1261191, -0.0971992..."
4,dev_pid_5,The world only cares about beautiful people : ...,moderate,"[world, care, beautiful, people, bear, ugly, u...","[-0.044080853, -0.04023238, -0.027201422, 0.27..."


In [26]:
# Split the dataset into features (X) and labels (y)
X = np.vstack(review['vec'])  # Convert list of arrays to a matrix
y = review['Label']

In [27]:
X.shape

(4496, 100)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)

In [29]:
# Define the model pipeline with Logistic Regression
model_pipeline_lr = Pipeline([
    ('lr', LogisticRegression(max_iter=1000))  # Increase max_iter to 1000 or more
])

# Train the model
model_pipeline_lr.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = model_pipeline_lr.predict(X_test)

In [30]:
# Evaluate the model
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr, average='weighted')
recall_lr = recall_score(y_test, y_pred_lr, average='weighted')
f1_score_lr = f1_score(y_test, y_pred_lr, average='weighted')
classification_report_lr = classification_report(y_test, y_pred_lr)

# Print evaluation metrics
print(accuracy_lr)
print("Evaluation Metrics for Logistic Regression Model")
print("------------------------------------------------")
print(classification_report_lr)

0.5755555555555556
Evaluation Metrics for Logistic Regression Model
------------------------------------------------
                precision    recall  f1-score   support

      moderate       0.57      0.82      0.67       455
not depression       0.60      0.37      0.46       363
        severe       0.47      0.11      0.18        82

      accuracy                           0.58       900
     macro avg       0.55      0.43      0.44       900
  weighted avg       0.57      0.58      0.54       900

