### Import Libraries and Read in Wrangled Data

In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import spacy
import nltk
import string
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize 
import re
from bs4 import BeautifulSoup
import unicodedata
from wordcloud import STOPWORDS
import gc
import re
import string
import operator
from collections import defaultdict
from time import time  # To time our operations
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import multiprocessing
from gensim.models import Word2Vec

# from sklearn.pipeline import Pipeline
# from sklearn.naive_bayes import GaussianNB
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler

# from sklearn.linear_model import LogisticRegression
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# from sklearn.ensemble import RandomForestClassifier

# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

In [2]:
# Covid Tweets
covid_twt_df = pd.read_csv("../input/2020-vassar-datafest-josh-data/covid_tweets.csv")

In [3]:
# General Tweets from 2009
bef_covid_twt_df = pd.read_csv("../input/2020-vassar-datafest-josh-data/sent_tweet.csv")

In [5]:
# # US Gov. response data to COVID
# us_resp_to_covid_df = pd.read_csv("../input/2020-vassar-datafest-josh-data/us_covid_resp.csv")

In [4]:
# # Reddit Depression and non-depression posts for training
# reddit_dep_df = pd.read_csv("../input/reddit-depression-data/preprocessed_data.txt")

### Preprocessing for General Tweets & Covid Tweets (post March)

In [6]:
def clean_text(document):
    
    """
    The clean_text function preprocesses the texts in a document/comment
    
    Parameters
    ----------
    document: the raw text
    
    Returns
    ----------
    tokens: a list of preprocessed tokens
    
    """
    
    document = ' '.join([word.lower() for word in word_tokenize(document)]) # lowercase texts
    tokens = word_tokenize(document) # tokenize the document
    
    for i in range(0,len(tokens)):
        # remove whitespaces
        tokens[i] = tokens[i].strip()
        # remove html links
        tokens[i] = re.sub(r'\S*http\S*', '', tokens[i]) # remove links with http
        tokens[i] = re.sub(r'\S*\.org\S*', '', tokens[i]) # remove links with .org
        tokens[i] = re.sub(r'\S*\.com\S*', '', tokens[i]) # remove links with .com
        
        # remove subreddit titles (e.g /r/food)
        tokens[i] = re.sub(r'S*\/r\/\S*', '' ,tokens[i]) 
        
        # remove non-alphabet characters
        tokens[i] = re.sub("[^a-zA-Z]+", "", tokens[i])
        
        tokens[i] = tokens[i].strip() # remove whitespaces 
        
        # remove all blanks from the list
    while("" in tokens): 
        tokens.remove("") 
     
    return tokens

In [7]:
### Cleaning text for General Tweets pre-covid

# call clean_text on df for each row in df
for i in range(0,len(bef_covid_twt_df)):
    # use clean_text on the document/text stored in the content column
    clean = clean_text(bef_covid_twt_df.loc[i,"text"])
    # joining the tokens together by whitespaces
    bef_covid_twt_df.loc[i,"clean_content"] = ' '.join([token for token in clean])
    
bef_covid_twt_df = bef_covid_twt_df.dropna() # remove null data due to some deleted comments
bef_covid_twt_df = bef_covid_twt_df[bef_covid_twt_df["clean_content"] != ''] # remove blank comments

In [8]:
### Cleaning text for Covid Tweets post March

# call clean_text on df for each row in df
for i in range(0,len(covid_twt_df)):
    # use clean_text on the document/text stored in the content column
    clean2 = clean_text(covid_twt_df.loc[i,"text"])
    # joining the tokens together by whitespaces
    covid_twt_df.loc[i,"clean_content"] = ' '.join([token for token in clean2])
    
covid_twt_df = covid_twt_df.dropna() # remove null data due to some deleted comments
covid_twt_df = covid_twt_df[covid_twt_df["clean_content"] != ''] # remove blank comments

### Word2vec - build vocabs + train

In [12]:
from gensim.models.phrases import Phrases, Phraser

sent1 = [row.split() for row in bef_covid_twt_df['clean_content']]
phrases1 = Phrases(sent1, min_count=30, progress_per=10000)

bigram1 = Phraser(phrases1)
sentences1 = bigram1[sent1]

In [14]:
sent2 = [row.split() for row in covid_twt_df['clean_content']]
phrases2 = Phrases(sent2, min_count=30, progress_per=10000)

bigram2 = Phraser(phrases2)
sentences2 = bigram1[sent2]

In [15]:
# setting up parameters for word2vec models

cores = multiprocessing.cpu_count( ) # Count the number of cores in a computer

w2v_model1 = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

w2v_model2 = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [16]:
# build word2vec vocabs for pre-covid tweets

t = time()

w2v_model1.build_vocab(sentences1, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.04 mins


In [17]:
# build word2vec vocabs for covid tweets

t = time()

w2v_model2.build_vocab(sentences2, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.08 mins


In [18]:
# word2vec training for pre-covid tweets

t = time( )

w2v_model1.train(sentences1, total_examples=w2v_model1.corpus_count, epochs=30, report_delay=1)

print('Time to train the model for pre-covid tweets: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model for pre-covid tweets: 1.04 mins


In [19]:
# word2vec training for covid tweets

t = time( )

w2v_model2.train(sentences2, total_examples=w2v_model2.corpus_count, epochs=30, report_delay=1)

print('Time to train the model for covid tweets: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model for covid tweets: 1.8 mins


In [32]:
# words associated with word "depressed" in pre-covid tweets

w2v_model1.wv.most_similar(positive=["depressed"],topn=50)

[('feelin', 0.9882292151451111),
 ('shit', 0.9845561981201172),
 ('a_bit', 0.9830614328384399),
 ('ass', 0.9805903434753418),
 ('being', 0.9800398945808411),
 ('scared', 0.9786115288734436),
 ('too_much', 0.978432834148407),
 ('trying', 0.9778023958206177),
 ('stupid', 0.9772000312805176),
 ('aint', 0.9769729971885681),
 ('hate', 0.9766879081726074),
 ('slow', 0.9765041470527649),
 ('coz', 0.9761098623275757),
 ('feel_like', 0.9755951762199402),
 ('pissed', 0.9753478765487671),
 ('barely', 0.9749050736427307),
 ('sooooo', 0.9748597741127014),
 ('freaking', 0.9748213291168213),
 ('ive', 0.9735394716262817),
 ('badly', 0.9730440974235535),
 ('pain', 0.972820520401001),
 ('bitch', 0.9726149439811707),
 ('happening', 0.9724913835525513),
 ('havent', 0.9717068672180176),
 ('felt', 0.9717012047767639),
 ('dying', 0.9709842801094055),
 ('ughh', 0.9706401228904724),
 ('starving', 0.9705080389976501),
 ('jus', 0.9703129529953003),
 ('thinking', 0.9700263142585754),
 ('lonely', 0.969255924224853

In [33]:
# words associated with word "depression" in pre-covid tweets

w2v_model2.wv.most_similar(positive=["depression"], topn=50)

[('anxiety', 0.7966477870941162),
 ('grief', 0.7390024065971375),
 ('stress', 0.6957149505615234),
 ('hiv', 0.6594191193580627),
 ('medtwitter', 0.6236761808395386),
 ('dealing', 0.5988633632659912),
 ('coping', 0.5953640341758728),
 ('fantastic', 0.5952185988426208),
 ('empathy', 0.5915783643722534),
 ('inspired', 0.5849552750587463),
 ('abuse', 0.5780466198921204),
 ('knowledge', 0.5768439769744873),
 ('ebola', 0.5749789476394653),
 ('kills', 0.5742815732955933),
 ('terrible', 0.5715780854225159),
 ('studies', 0.5651646256446838),
 ('emotional', 0.5645834803581238),
 ('image', 0.5640422701835632),
 ('survival', 0.5627989768981934),
 ('teamwork', 0.5625821352005005),
 ('survivor', 0.5575640797615051),
 ('understanding', 0.5552408695220947),
 ('version', 0.5548352003097534),
 ('child', 0.5542569160461426),
 ('position', 0.5504550337791443),
 ('present', 0.546249270439148),
 ('loss', 0.5460538864135742),
 ('damage', 0.5422340035438538),
 ('selfcare', 0.540926992893219),
 ('compassion', 