# Mini ReadMe

To skip all the code, go to global settings to adjust parameters. Then go to "Keyword Exploration" to see the results and difference between pre-covid and post-covid

# File Directories

In [1]:
input_file_path = "../../../data/twitter/cleaned/final_clean_data.csv"

# Importing libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df_raw = pd.read_csv(input_file_path).drop('Unnamed: 0',axis=1)
df_raw = df_raw.rename(columns={'search':'id_search'})

# Word Filtering

In [4]:
# remove this block if you want to see the keywords picked up by TF-IDF
# This is remove terms like depression in tweets that is retrieved
# with the search keywords
for keyword in list(df_raw['id_search'].unique()):
    df_raw[df_raw['id_search'] == keyword]
    df_raw['clean_tweet4'] = df_raw.clean_tweet4.str.lower().replace(keyword,'', regex=True)

    
# Remove rows with these stopwords
search_words = ['nikerunning', 'dec', 'delavin',
                'kisses', 'agrunningmoms', 'migraineinsg',
                'jordan', 'munchausen', 'concert',
                'wspd', 'comedy', 'gentlemonstersunglasses', 
                'gentlemonster', 'youtube', 'star', 'place', 
                'comedy', 'poetry', 'want', 'know', 'diagrams', 
                'pared', 'owls', 'riser', 'street', 'style', 'buat',
                'phd', 'hi', 'im', 'gold', 'standard', 'yeah', 'watch', 
                'arena', 'things', 'teka', 'gt', 'matter', ]

df_raw = df_raw[~df_raw['clean_tweet4'].str.contains('|'.join(search_words), na=False)] #ignore NaN else the code breaks

    
# Cleaning the searched keywords as it is causing problems with the pd.melt
df_raw['id_search'] = df_raw['id_search'].apply(lambda x: x.strip())
df_raw['id_search'] = df_raw['id_search'].apply(lambda x: x.replace(" ", "_"))        

In [5]:
# The first covid case is reported to be on Jan 23 2020
df_raw_pre_covid = df_raw[df_raw['date'] < '2020-01-23']
df_raw_covid = df_raw[df_raw['date'] >= '2020-01-23']

# Groups all the text in a keyword category to find common words associated with the keyword
df_raw_pre_covid_group = df_raw_pre_covid.groupby(['id_search'], as_index=False)['clean_tweet4'].apply(' '.join)
df_raw_covid_group = df_raw_covid.dropna().groupby(['id_search'], as_index=False)['clean_tweet4'].apply(' '.join)

In [6]:
print(df_raw_pre_covid_group.shape)
print(df_raw_covid_group.shape)

(78, 2)
(95, 2)


# Global Settings

In [7]:
print(df_raw['id_search'].unique())

['depression' 'mental_illness' 'social_anxiety' 'loneliness' 'stress'
 'lonely' 'isolation' 'suicide' 'abuse' 'death' 'no_motivation' 'therapy'
 'trauma' 'counselling' 'mood_swings' 'mental_health' 'angst' 'emotion'
 'phobia' 'addiction' 'stigma' 'self-harm' 'disorder' 'dependence'
 'socialize' 'help' 'dead' 'tired' 'trapped' 'paranoia' 'overwhelmed'
 'irritable' 'bipolar' 'psychologist' 'well-being' 'imh' 'sos'
 'counsellor' 'toxic' 'insominia' 'drugs' 'fight' 'self-esteem' 'unalive'
 'melancholia' 'pandemic' 'circuit_breaker' 'quarantine' 'wfh' 'phase_2'
 'vaccine' 'epidemic' 'mask' 'contactless' 'national_emergency'
 'transmission' 'screening' 'testing' 'hospital' 'symptomatic' 'symptoms'
 'outbreak' 'cluster' 'spread' 'hygiene' 'crisis' 'new_normal'
 'uncertainty' 'zoom' 'retrenchment' 'variant' 'shn' 'hbl' 'safe_entry'
 'frontline' 'school_closure' 'job_insecurity' 'lockdown' 'coronavirus'
 'asymptomatic' 'contact_tracing' 'restriction' 'social_distancing'
 'fatality_rate' 'stay_h

In [8]:
ngram_start = 3
ngram_end = 5
max_keywords_rank = 20
keyword = 'suicide' # Run this block and then all the codes below

# TF-IDF (Pre-Covid)

In [9]:
# Performing TF-IDF
pre_covid_vectorizer = TfidfVectorizer(ngram_range=(ngram_start,ngram_end),
                            stop_words = 'english',
                            lowercase=True)

df_pre_covid_transformed = pre_covid_vectorizer.fit_transform(df_raw_pre_covid_group['clean_tweet4'])
df_pre_covid_tfidf = pd.DataFrame(df_pre_covid_transformed.toarray(), columns = pre_covid_vectorizer.get_feature_names())

# Merges the TF-IDF output with the main df so as to retrieve the keywords. In this implementation the
# tf-idf array produced does not have the keyword but merely the index location of the keyword
df_pre_covid = pd.concat([df_raw_pre_covid_group, df_pre_covid_tfidf], axis = 1)

# Clean tweets not needed
df_pre_covid = df_pre_covid.drop('clean_tweet4',axis=1)
df_pre_covid = df_pre_covid.reset_index()

# Changes into long form so as to sum the frequencies for every keyword search and the ngram TF-IDF produces
df_pre_covid_long = pd.melt(df_pre_covid, 
                            id_vars='index', 
                            value_vars = df_pre_covid.columns[2:])

df_pre_covid_long = df_pre_covid_long.merge(df_pre_covid[['index','id_search']], 
                                             on=['index'],
                                             how='left')

df_pre_covid_long = df_pre_covid_long.drop('index',axis=1) 

In [10]:
# Filters for the top 20 ngrams per keywrod search and sorts them
df_pre_covid_long['rank'] = (df_pre_covid_long.sort_values('value', ascending=False).groupby(['id_search']).cumcount()+1)
df_pre_covid_long = df_pre_covid_long[df_pre_covid_long['rank'] <= max_keywords_rank]
df_pre_covid_long = df_pre_covid_long.sort_values(['id_search', 'rank'], ascending=[True,True])

# TF-IDF (Covid)

In [11]:
# Performing TF-IDF
covid_vectorizer = TfidfVectorizer(ngram_range=(ngram_start,ngram_end),
                            stop_words = 'english',
                            lowercase=True)

# This block takes a while to run
df_covid_transformed = covid_vectorizer.fit_transform(df_raw_covid_group['clean_tweet4'])
df_covid_tfidf = pd.DataFrame(df_covid_transformed.toarray(), columns = covid_vectorizer.get_feature_names())

# Merges the TF-IDF output with the main df so as to retrieve the keywords. In this implementation the
# tf-idf array produced does not have the keyword but merely the index location of the keyword
df_covid = pd.concat([df_raw_covid_group, df_covid_tfidf], axis = 1)

# Clean tweets not needed
df_covid = df_covid.drop('clean_tweet4',axis=1)
df_covid = df_covid.reset_index()

# Changes into long form so as to sum the frequencies for every keyword search and the ngram TF-IDF produces
df_covid_long = pd.melt(df_covid, 
                        id_vars=df_covid.columns[0], 
                        value_vars = df_covid.columns[2:])

df_covid_long = df_covid_long.merge(df_covid[df_covid.columns[0:2]], 
                                    on=df_covid.columns[0],
                                    how='left')


df_covid_long = df_covid_long.drop(df_covid_long.columns[0],axis=1) 

In [12]:
# Filters for the top 20 ngrams per keywrod search and sorts them
top_n_words_per_category = 20

df_covid_long['rank'] = (df_covid_long.sort_values('value', ascending=False).groupby(['id_search']).cumcount()+1)
df_covid_long = df_covid_long[df_covid_long['rank'] <= max_keywords_rank]
df_covid_long = df_covid_long.sort_values(['id_search', 'rank'], ascending=[True,True])

# Keyword Exploration

In [13]:
# Explore keywords here for pre-covid
df_pre_covid_long[df_pre_covid_long['id_search'] == keyword]

Unnamed: 0,variable,value,id_search,rank
9800761,people die die,0.080517,suicide,1
7477141,like squad coffees dah,0.040258,suicide,2
15186895,yi better explore forest,0.040258,suicide,3
8714923,movie assault arkham,0.040258,suicide,4
14047003,typing note various fonts signing,0.040258,suicide,5
12798691,stop save life motto,0.040258,suicide,6
12654157,squad voyeurism stories,0.040258,suicide,7
572035,anxiety plotting seaandpal,0.040258,suicide,8
7787035,lost babe meds suicidal doctor,0.040258,suicide,9
11335099,run fort canning zig,0.040258,suicide,10


In [14]:
# Explore keywords here for Covid period
df_covid_long[df_covid_long['id_search'] == keyword]

Unnamed: 0,variable,value,id_search,rank
38092322,press ups raise awareness,0.06381,suicide,1
38773092,ptsd hurting reach,0.06381,suicide,2
39277162,raise awareness anxiety,0.06381,suicide,3
38092227,press ups raise,0.06381,suicide,4
38176492,prevention prevention line,0.06381,suicide,5
39277352,raise awareness anxiety ptsd hurting,0.06381,suicide,6
39277257,raise awareness anxiety ptsd,0.06381,suicide,7
1942922,anxiety ptsd hurting,0.06381,suicide,8
11454037,days press ups raise awareness,0.06381,suicide,9
1943017,anxiety ptsd hurting reach,0.06381,suicide,10
