# Mini ReadMe

To skip the code, adjust the parameters under "Global Settings", then go to "Keyword Exploration" to see the results and how they differ between the pre-Covid and Covid periods.

# File Directories

In [1]:
# Relative path (resolved against the notebook's working directory) of the cleaned
# tweet CSV that is loaded with pd.read_csv further down.
input_file_path = "../../../data/twitter/cleaned/final_clean_data.csv"

# Importing libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load the cleaned tweets and discard the unnamed leading column
# (presumably an index saved by an earlier to_csv call -- confirm).
df_raw = pd.read_csv(input_file_path).drop('Unnamed: 0', axis=1)
df_raw = df_raw.rename(columns={'search': 'id_search'})

# Normalise the searched keywords (trim whitespace, spaces -> underscores) as the raw
# values cause problems with pd.melt. The vectorised .str methods give the same result
# as the per-element lambdas on strings but propagate missing values instead of raising.
df_raw['id_search'] = (
    df_raw['id_search']
    .str.strip()
    .str.replace(' ', '_', regex=False)
)

# The first covid case is reported to be on Jan 23 2020.
# NOTE(review): this is a string comparison, so it is only chronological if 'date'
# holds ISO-8601 (YYYY-MM-DD...) strings -- confirm against the CSV.
covid_start_date = '2020-01-23'
df_raw_pre_covid = df_raw[df_raw['date'] < covid_start_date]
df_raw_covid = df_raw[df_raw['date'] >= covid_start_date]

# Concatenate all tweet text per search keyword so that each keyword becomes one
# "document". Both periods drop only the rows whose tweet text is missing:
#   * ' '.join raises TypeError on NaN, so the pre-covid side needs the guard too;
#   * a bare dropna() would also discard tweets that merely have a NaN in some
#     other, unrelated column, treating the two periods differently.
df_raw_pre_covid_group = (
    df_raw_pre_covid
    .dropna(subset=['clean_tweet4'])
    .groupby(['id_search'], as_index=False)['clean_tweet4']
    .apply(' '.join)
)
df_raw_covid_group = (
    df_raw_covid
    .dropna(subset=['clean_tweet4'])
    .groupby(['id_search'], as_index=False)['clean_tweet4']
    .apply(' '.join)
)

In [4]:
# Show (rows, columns) of the grouped frames, one per line: pre-covid first, then covid.
print(df_raw_pre_covid_group.shape, df_raw_covid_group.shape, sep='\n')

(80, 2)
(95, 2)


# Global Settings

In [5]:
# Smallest and largest n-gram length passed to TfidfVectorizer(ngram_range=...).
ngram_start, ngram_end = 1, 3

# Number of top-ranked n-grams kept per search keyword.
max_keywords_rank = 20

# Search keyword displayed under "Keyword Exploration". It is compared against
# 'id_search', where spaces have been replaced by underscores.
keyword = 'depression'

# TF-IDF (Pre-Covid)

In [6]:
# Performing TF-IDF: every row of df_raw_pre_covid_group (all tweets of one search
# keyword joined together) is treated as one document.
pre_covid_vectorizer = TfidfVectorizer(ngram_range=(ngram_start, ngram_end),
                                       stop_words='english',
                                       lowercase=True)

df_pre_covid_transformed = pre_covid_vectorizer.fit_transform(df_raw_pre_covid_group['clean_tweet4'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement and fall back only on older installations.
try:
    pre_covid_terms = pre_covid_vectorizer.get_feature_names_out()
except AttributeError:
    pre_covid_terms = pre_covid_vectorizer.get_feature_names()

# Dense (keywords x n-grams) score matrix; its columns follow pre_covid_terms.
pre_covid_scores = df_pre_covid_transformed.toarray()
df_pre_covid_tfidf = pd.DataFrame(pre_covid_scores, columns=pre_covid_terms)

# Wide form (search keyword + one column per n-gram), built exactly as before so that
# anything else relying on df_pre_covid keeps working. It is no longer used to derive
# the long form below.
df_pre_covid = pd.concat([df_raw_pre_covid_group, df_pre_covid_tfidf], axis=1)
df_pre_covid = df_pre_covid.drop('clean_tweet4', axis=1)
df_pre_covid = df_pre_covid.reset_index()

# Long form: one row per (n-gram, search keyword) pair with columns
# 'variable' (n-gram), 'value' (TF-IDF score) and 'id_search', in the same row order
# pd.melt produces (all keywords for the first n-gram, then the second, ...).
# It is assembled straight from the arrays instead of reset_index + melt + merge:
# reset_index names its column 'index' only while no n-gram is itself called
# "index" (otherwise it becomes 'level_0'), and melt/merge can likewise trip over
# n-grams named "variable", "value" or "id_search".
n_pre_covid_keywords, n_pre_covid_terms = pre_covid_scores.shape
df_pre_covid_long = pd.DataFrame({
    # dtype=object keeps one reference per cell rather than a fixed-width string copy
    'variable': np.repeat(np.asarray(pre_covid_terms, dtype=object), n_pre_covid_keywords),
    # column-major flattening walks down each n-gram column in turn
    'value': pre_covid_scores.ravel(order='F'),
    'id_search': np.tile(df_raw_pre_covid_group['id_search'].to_numpy(), n_pre_covid_terms),
})

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
# Rank the n-grams of every search keyword by descending TF-IDF score (1 = highest) ...
df_pre_covid_long['rank'] = (
    df_pre_covid_long
    .sort_values(by='value', ascending=False)
    .groupby('id_search')
    .cumcount()
    .add(1)
)

# ... then keep the best max_keywords_rank per keyword, ordered by keyword and rank.
df_pre_covid_long = (
    df_pre_covid_long
    .loc[df_pre_covid_long['rank'].le(max_keywords_rank)]
    .sort_values(by=['id_search', 'rank'])
)

# TF-IDF (Covid)

In [9]:
# TF-IDF vectorizer for the Covid period, configured from the global n-gram settings.
covid_vectorizer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    ngram_range=(ngram_start, ngram_end),
)

In [10]:
# This block takes a while to run

# Every row of df_raw_covid_group (all tweets of one search keyword joined together)
# is treated as one document.
df_covid_transformed = covid_vectorizer.fit_transform(df_raw_covid_group['clean_tweet4'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement and fall back only on older installations.
try:
    covid_terms = covid_vectorizer.get_feature_names_out()
except AttributeError:
    covid_terms = covid_vectorizer.get_feature_names()

# Dense (keywords x n-grams) score matrix; its columns follow covid_terms.
covid_scores = df_covid_transformed.toarray()
df_covid_tfidf = pd.DataFrame(covid_scores, columns=covid_terms)

# Wide form (search keyword + one column per n-gram), built exactly as before so that
# anything else relying on df_covid keeps working. It is no longer used to derive the
# long form below.
df_covid = pd.concat([df_raw_covid_group, df_covid_tfidf], axis=1)
df_covid = df_covid.drop('clean_tweet4', axis=1)
df_covid = df_covid.reset_index()

# Long form: one row per (n-gram, search keyword) pair with columns
# 'variable' (n-gram), 'value' (TF-IDF score) and 'id_search', in the same row order
# pd.melt produces (all keywords for the first n-gram, then the second, ...).
# It is assembled straight from the arrays instead of reset_index + melt + merge:
# the previous code had to hard-code 'level_0' because reset_index only uses that
# name while an n-gram called "index" is present, so it broke as soon as the
# vocabulary changed; melt/merge can likewise trip over n-grams named "variable",
# "value" or "id_search".
n_covid_keywords, n_covid_terms = covid_scores.shape
df_covid_long = pd.DataFrame({
    # dtype=object keeps one reference per cell rather than a fixed-width string copy
    'variable': np.repeat(np.asarray(covid_terms, dtype=object), n_covid_keywords),
    # column-major flattening walks down each n-gram column in turn
    'value': covid_scores.ravel(order='F'),
    'id_search': np.tile(df_raw_covid_group['id_search'].to_numpy(), n_covid_terms),
})

  exec(code_obj, self.user_global_ns, self.user_ns)


In [11]:
# Not read in this cell: the cut-off applied below is max_keywords_rank.
top_n_words_per_category = 20

# Rank the n-grams of every search keyword by descending TF-IDF score (1 = highest) ...
df_covid_long['rank'] = (
    df_covid_long
    .sort_values(by='value', ascending=False)
    .groupby('id_search')
    .cumcount()
    .add(1)
)

# ... then keep the best max_keywords_rank per keyword, ordered by keyword and rank.
df_covid_long = (
    df_covid_long
    .loc[df_covid_long['rank'].le(max_keywords_rank)]
    .sort_values(by=['id_search', 'rank'])
)

# Keyword Exploration

In [12]:
# Pre-covid: ranked n-grams for the search keyword chosen in the global settings.
df_pre_covid_long.loc[df_pre_covid_long['id_search'].eq(keyword)]

Unnamed: 0,variable,value,id_search,rank
5862254,depression,0.72263,depression,1
1062014,anxiety,0.08544,depression,2
1064094,anxiety depression,0.072865,depression,3
5862974,depression anxiety,0.061047,depression,4
29056014,wspd,0.048838,depression,5
13580334,kisses delavin,0.041517,depression,6
5771934,delavin,0.041517,depression,7
13580254,kisses,0.040149,depression,8
5866014,depression depression,0.036628,depression,9
10194334,great depression,0.036628,depression,10


In [13]:
# Covid period: ranked n-grams for the search keyword chosen in the global settings.
df_covid_long.loc[df_covid_long['id_search'].eq(keyword)]

Unnamed: 0,variable,value,id_search,rank
19123329,depression,0.875086,depression,1
19130549,depression depression,0.054221,depression,2
3142619,anxiety,0.046034,depression,3
31901209,great depression,0.043495,depression,4
19125324,depression anxiety,0.040666,depression,5
19144514,depression like,0.033888,depression,6
19153159,depression really,0.033888,depression,7
41986409,know,0.031588,depression,8
3147274,anxiety depression,0.031068,depression,9
64430254,really,0.027758,depression,10
