### Import Libraries and Read in Wrangled Data

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import spacy
import nltk
import string
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize 
import re
from bs4 import BeautifulSoup
import unicodedata
from wordcloud import STOPWORDS
import gc
import re
import string
import operator
from collections import defaultdict

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

In [2]:
# Covid Tweets
covid_twt_df = pd.read_csv("../input/2020-vassar-datafest-josh-data/covid_tweets.csv")

In [3]:
# General Tweets from 2009
bef_covid_twt_df = pd.read_csv("../input/2020-vassar-datafest-josh-data/sent_tweet.csv")

In [4]:
# US Gov. response data to COVID
us_resp_to_covid_df = pd.read_csv("../input/2020-vassar-datafest-josh-data/us_covid_resp.csv")

In [5]:
# Reddit Depression and non-depression posts for training
reddit_dep_df = pd.read_csv("../input/reddit-depression-data/preprocessed_data.txt")

### Preprocessing for General Tweets & Covid Tweets (post March)

In [6]:
def clean_text(document):
    
    """
    The clean_text function preprocesses the texts in a document/comment
    
    Parameters
    ----------
    document: the raw text
    
    Returns
    ----------
    tokens: a list of preprocessed tokens
    
    """
    
    document = ' '.join([word.lower() for word in word_tokenize(document)]) # lowercase texts
    tokens = word_tokenize(document) # tokenize the document
    
    for i in range(0,len(tokens)):
        # remove whitespaces
        tokens[i] = tokens[i].strip()
        # remove html links
        tokens[i] = re.sub(r'\S*http\S*', '', tokens[i]) # remove links with http
        tokens[i] = re.sub(r'\S*\.org\S*', '', tokens[i]) # remove links with .org
        tokens[i] = re.sub(r'\S*\.com\S*', '', tokens[i]) # remove links with .com
        
        # remove subreddit titles (e.g /r/food)
        tokens[i] = re.sub(r'S*\/r\/\S*', '' ,tokens[i]) 
        
        # remove non-alphabet characters
        tokens[i] = re.sub("[^a-zA-Z]+", "", tokens[i])
        
        tokens[i] = tokens[i].strip() # remove whitespaces 
        
        # remove all blanks from the list
    while("" in tokens): 
        tokens.remove("") 
     
    return tokens

In [7]:
### Cleaning text for General Tweets pre-covid

# call clean_text on df for each row in df
for i in range(0,len(bef_covid_twt_df)):
    # use clean_text on the document/text stored in the content column
    clean = clean_text(bef_covid_twt_df.loc[i,"text"])
    # joining the tokens together by whitespaces
    bef_covid_twt_df.loc[i,"clean_content"] = ' '.join([token for token in clean])
    
bef_covid_twt_df = bef_covid_twt_df.dropna() # remove null data due to some deleted comments
bef_covid_twt_df = bef_covid_twt_df[bef_covid_twt_df["clean_content"] != ''] # remove blank comments

In [8]:
### Cleaning text for Covid Tweets post March

# call clean_text on df for each row in df
for i in range(0,len(covid_twt_df)):
    # use clean_text on the document/text stored in the content column
    clean2 = clean_text(covid_twt_df.loc[i,"text"])
    # joining the tokens together by whitespaces
    covid_twt_df.loc[i,"clean_content"] = ' '.join([token for token in clean2])
    
covid_twt_df = covid_twt_df.dropna() # remove null data due to some deleted comments
covid_twt_df = covid_twt_df[covid_twt_df["clean_content"] != ''] # remove blank comments

### Creating Meta Features

In [9]:
### Reddit Data

# word_count
reddit_dep_df['wc'] = reddit_dep_df['clean_content'].apply(lambda x: len(str(x).split()))

# unique_word_count
reddit_dep_df['unique_wc'] = reddit_dep_df['clean_content'].apply(lambda x: len(set(str(x).split())))

# stop_word_count
reddit_dep_df['stop_wc'] = reddit_dep_df['content'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))

# mean_word_length
reddit_dep_df['mean_wl'] = reddit_dep_df['clean_content'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

# median_word_length
reddit_dep_df['median_wl'] = reddit_dep_df['clean_content'].apply(lambda x: np.median([len(w) for w in str(x).split()]))

# char_count
reddit_dep_df['cc'] = reddit_dep_df['clean_content'].apply(lambda x: len(str(x)))

# punctuation_count
reddit_dep_df['pc'] = reddit_dep_df['content'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

In [10]:
### General Tweets before Covid

# word_count
bef_covid_twt_df['wc'] = bef_covid_twt_df['clean_content'].apply(lambda x: len(str(x).split()))

# unique_word_count
bef_covid_twt_df['unique_wc'] = bef_covid_twt_df['clean_content'].apply(lambda x: len(set(str(x).split())))

# stop_word_count
bef_covid_twt_df['stop_wc'] = bef_covid_twt_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))

# mean_word_length
bef_covid_twt_df['mean_wl'] = bef_covid_twt_df['clean_content'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

# median_word_length
bef_covid_twt_df['median_wl'] = bef_covid_twt_df['clean_content'].apply(lambda x: np.median([len(w) for w in str(x).split()]))

# char_count
bef_covid_twt_df['cc'] = bef_covid_twt_df['clean_content'].apply(lambda x: len(str(x)))

# punctuation_count
bef_covid_twt_df['pc'] = bef_covid_twt_df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

In [11]:
### COVID Tweets

# word_count
covid_twt_df['wc'] = covid_twt_df['clean_content'].apply(lambda x: len(str(x).split()))

# unique_word_count
covid_twt_df['unique_wc'] = covid_twt_df['clean_content'].apply(lambda x: len(set(str(x).split())))

# stop_word_count
covid_twt_df['stop_wc'] = covid_twt_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))

# mean_word_length
covid_twt_df['mean_wl'] = covid_twt_df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

# median_word_length
covid_twt_df['median_wl'] = covid_twt_df['clean_content'].apply(lambda x: np.median([len(w) for w in str(x).split()]))

# char_count
covid_twt_df['cc'] = covid_twt_df['clean_content'].apply(lambda x: len(str(x)))

# punctuation_count
covid_twt_df['pc'] = covid_twt_df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

### Depression Classifier Model using Reddit Data - n-grams / tfidf + Naive Bayes

#### Train Reddit Data

In [12]:
# n-gram features

countvec = CountVectorizer(ngram_range=(1,5),max_features=100,analyzer='word')
countvec_features = countvec.fit_transform(reddit_dep_df['clean_content'])

In [13]:
# tf-idf features 
tfidf = TfidfVectorizer(ngram_range=(1,5),max_features=100,analyzer='word')
tfidf_features = tfidf.fit_transform(reddit_dep_df['clean_content'])

In [14]:
# n-gram features + tf-idf features + Meta Features I created
feats = pd.concat([reddit_dep_df.iloc[:,3:], pd.DataFrame(countvec_features.toarray()), pd.DataFrame(tfidf_features.toarray())], axis=1)

In [15]:
nb = GaussianNB()

In [16]:
# Labels
labels = reddit_dep_df.label

In [17]:
print("Naive Bayes: CV Accuracy ", cross_val_score(nb, feats, labels, scoring='accuracy', cv=5).mean())

print("Naive Bayes: CV AUC Score ", cross_val_score(nb, feats, labels, scoring='roc_auc', cv=5).mean())

print("Naive Bayes: CV F1 Score ", cross_val_score(nb, feats, labels, scoring='f1', cv=5).mean())

Naive Bayes: CV Accuracy  0.7785929060963495
Naive Bayes: CV AUC Score  0.8903250350059306
Naive Bayes: CV F1 Score  0.7450338803981348


In [18]:
%%time
nb.fit(feats, labels)

CPU times: user 16 ms, sys: 1 ms, total: 17 ms
Wall time: 15 ms


GaussianNB(priors=None, var_smoothing=1e-09)

#### General Tweets in 2009

In [19]:
# General Tweets post covid : n-gram features

gtwt_countvec = CountVectorizer(ngram_range=(1,5),max_features=100,analyzer='word')
gtwt_countvec_features = gtwt_countvec.fit_transform(bef_covid_twt_df['clean_content'])

In [20]:
# General Tweets post covid : tf-idf features 
gtwt_tfidf = TfidfVectorizer(ngram_range=(1,5),max_features=100,analyzer='word')
gtwt_tfidf_features = gtwt_tfidf.fit_transform(bef_covid_twt_df['clean_content'])

In [21]:
# General Tweets post covid : n-gram features + tf-idf features + Meta Features I created
gtwt_feats = pd.concat([bef_covid_twt_df.iloc[:,3:], pd.DataFrame(gtwt_countvec_features.toarray()), pd.DataFrame(gtwt_tfidf_features.toarray())], axis=1)

In [22]:
bef_covid_twt_df['nb_predicted_depression'] = nb.predict(gtwt_feats)

In [23]:
bef_covid_twt_df.nb_predicted_depression.value_counts() * 100 / len(bef_covid_twt_df)

0    90.844697
1     9.155303
Name: nb_predicted_depression, dtype: float64

#### COVID tweets

In [24]:
# Covid Tweets : n-gram features

covid_twt_countvec = CountVectorizer(ngram_range=(1,5),max_features=100,analyzer='word')
covid_twt_countvec_features = covid_twt_countvec.fit_transform(covid_twt_df['clean_content'])

In [25]:
# Covid Tweets : tf-idf features 
covid_twt_tfidf = TfidfVectorizer(ngram_range=(1,5),max_features=100,analyzer='word')
covid_twt_tfidf_features = covid_twt_tfidf.fit_transform(covid_twt_df['clean_content'])

In [26]:
# Covid Tweets : n-gram features + tf-idf features + Meta Features I created
covid_twt_feats = pd.concat([covid_twt_df.iloc[:,12:], pd.DataFrame(covid_twt_countvec_features.toarray()), pd.DataFrame(covid_twt_tfidf_features.toarray())], axis=1)

In [27]:
covid_twt_df['nb_predicted_depression'] = nb.predict(covid_twt_feats)

In [28]:
covid_twt_df.nb_predicted_depression.value_counts() * 100 / len(covid_twt_df)

0    93.463595
1     6.536405
Name: nb_predicted_depression, dtype: float64

### Depression Classifier Model using Reddit Data - n-grams / tf-idf + Logistic Regression

#### Reddit Data

In [34]:
logit = LogisticRegression(max_iter=10000)

In [35]:
print("Logistic Regression: CV Accuracy ", cross_val_score(logit, feats, labels, scoring='accuracy', cv=5).mean())

print("Logistic Regression: CV AUC Score ", cross_val_score(logit, feats, labels, scoring='roc_auc', cv=5).mean())

print("Logistic Regression: CV F1 Score ", cross_val_score(logit, feats, labels, scoring='f1', cv=5).mean())

Logistic Regression: CV Accuracy  0.8841777065437881
Logistic Regression: CV AUC Score  0.9504855819507417
Logistic Regression: CV F1 Score  0.8811411231154604


In [36]:
# Train the model
logit.fit(feats, labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

#### Before Covid - General Tweets

In [39]:
bef_covid_twt_df['lr_predicted_depression'] = logit.predict(gtwt_feats)

In [40]:
bef_covid_twt_df.lr_predicted_depression.value_counts() * 100 / len(bef_covid_twt_df)

0    73.416667
1    26.583333
Name: lr_predicted_depression, dtype: float64

#### Covid Tweets

In [37]:
covid_twt_df['lr_predicted_depression'] = logit.predict(covid_twt_feats)

In [38]:
covid_twt_df.lr_predicted_depression.value_counts() * 100 / len(covid_twt_df)

0    83.924311
1    16.075689
Name: lr_predicted_depression, dtype: float64

### Depression Classifier Model using Reddit Data - n-grams / tf-idf + Random Forest

#### Reddit Data

In [42]:
rf = RandomForestClassifier(n_estimators = 1000)

In [43]:
print("RF: CV Accuracy ", cross_val_score(rf, feats, labels, scoring='accuracy', cv=5).mean())

print("RF: CV AUC Score ", cross_val_score(rf, feats, labels, scoring='roc_auc', cv=5).mean())

print("RF: CV F1 Score ", cross_val_score(rf, feats, labels, scoring='f1', cv=5).mean())

RF: CV Accuracy  0.9130466721761706
RF: CV AUC Score  0.9752173370877465
RF: CV F1 Score  0.9145526427215346


In [44]:
%%time
# Train the model
rf.fit(feats, labels)

CPU times: user 22.9 s, sys: 21.9 ms, total: 22.9 s
Wall time: 22.9 s


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

#### General Tweets

In [45]:
bef_covid_twt_df['rf_predicted_depression'] = rf.predict(gtwt_feats)

In [46]:
bef_covid_twt_df.rf_predicted_depression.value_counts() * 100 / len(bef_covid_twt_df)

0    82.314394
1    17.685606
Name: rf_predicted_depression, dtype: float64

#### Covid Tweets

In [47]:
covid_twt_df['rf_predicted_depression'] = rf.predict(covid_twt_feats)

In [48]:
covid_twt_df.rf_predicted_depression.value_counts() * 100 / len(covid_twt_df)

0    82.986425
1    17.013575
Name: rf_predicted_depression, dtype: float64

### Depression Classifier Model using Reddit Data - n-grams / tf-idf + LGBM

#### Reddit Data

In [49]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(boosting_type='gbdt', n_estimators = 5000, learning_rate=0.03, max_depth=-1,
                     n_jobs=-1,objective='binary', random_state=42)

In [51]:
%%time
print("LGBM: CV Accuracy ", cross_val_score(lgbm, feats, labels, scoring='accuracy', cv=5).mean())

print("LGBM: CV AUC Score ", cross_val_score(lgbm, feats, labels, scoring='roc_auc', cv=5).mean())

print("LGBM: CV F1 Score ", cross_val_score(lgbm, feats, labels, scoring='f1', cv=5).mean())

LGBM: CV Accuracy  0.930948385965791
LGBM: CV AUC Score  0.9802327574901977
LGBM: CV F1 Score  0.931007008505885
CPU times: user 38min 7s, sys: 15.3 s, total: 38min 22s
Wall time: 9min 47s


In [52]:
%%time
# Train the model
lgbm.fit(feats, labels)

CPU times: user 2min 58s, sys: 1.12 s, total: 2min 59s
Wall time: 45.6 s


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.03, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=5000, n_jobs=-1, num_leaves=31, objective='binary',
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

#### General Tweets

In [53]:
bef_covid_twt_df['lgbm_predicted_depression'] = lgbm.predict(gtwt_feats)

In [54]:
bef_covid_twt_df.lgbm_predicted_depression.value_counts() * 100 / len(bef_covid_twt_df)

0    79.837121
1    20.162879
Name: lgbm_predicted_depression, dtype: float64

#### Covid Tweets

In [55]:
covid_twt_df['lgbm_predicted_depression'] = lgbm.predict(covid_twt_feats)

In [56]:
covid_twt_df.lgbm_predicted_depression.value_counts() * 100 / len(covid_twt_df)

0    82.077334
1    17.922666
Name: lgbm_predicted_depression, dtype: float64

### Majority Voting

#### General Tweets

In [69]:
# Sum up the classifier voting
bef_covid_twt_df['sum'] = \
bef_covid_twt_df['nb_predicted_depression'] + bef_covid_twt_df['lr_predicted_depression'] + bef_covid_twt_df['rf_predicted_depression'] + bef_covid_twt_df['lgbm_predicted_depression']

In [70]:
# If three or more classifiers voted for depression, then mark it as depression(1). If only one or non classifier voted for depression, then mark it non-depression(0)
# For ties, follow the best performing model which is the LGBM model

bef_covid_twt_df['majority_voting_predicted_depression'] = np.where(bef_covid_twt_df['sum'] >= 3, 1, 0)
bef_covid_twt_df['majority_voting_predicted_depression']  = np.where(bef_covid_twt_df['sum'] == 2, bef_covid_twt_df['lgbm_predicted_depression'], 
                                                                 bef_covid_twt_df['majority_voting_predicted_depression'])

In [71]:
bef_covid_twt_df.majority_voting_predicted_depression.value_counts() * 100 / len(bef_covid_twt_df)

0    80.625
1    19.375
Name: majority_voting_predicted_depression, dtype: float64

#### Covid Tweets

In [60]:
# Sum up the classifier voting
covid_twt_df['sum'] = \
covid_twt_df['nb_predicted_depression'] + covid_twt_df['lr_predicted_depression'] + covid_twt_df['rf_predicted_depression'] + covid_twt_df['lgbm_predicted_depression']

In [66]:
# If three or more classifiers voted for depression, then mark it as depression(1). If only one or non classifier voted for depression, then mark it non-depression(0)
# For ties, follow the best performing model which is the LGBM model

covid_twt_df['majority_voting_predicted_depression'] = np.where(covid_twt_df['sum'] >= 3, 1, 0)
covid_twt_df['majority_voting_predicted_depression']  = np.where(covid_twt_df['sum'] == 2, covid_twt_df['lgbm_predicted_depression'], 
                                                                 covid_twt_df['majority_voting_predicted_depression'])

In [68]:
covid_twt_df.majority_voting_predicted_depression.value_counts() * 100 / len(covid_twt_df)

0    84.430276
1    15.569724
Name: majority_voting_predicted_depression, dtype: float64

### Save Files

In [72]:
bef_covid_twt_df.to_csv("tweets_in_2009_with_dep_labels.csv")

In [73]:
covid_twt_df.to_csv("tweets_in_2020_covid_with_dep_labels.csv")