<center><h1>Data for Visualization</h1></center>


This notebook shows the code for preparing the processed data for visualization.

## Trending Hashtags

In [2]:
# Import necessary libraries

import numpy as npy
import pandas as pd
from PIL import Image
from wordcloud import WordCloud, STOPWORDS

# Set of English stop words shipped with the wordcloud package
stopwords = set(STOPWORDS)

# Twitter logo used as the mask for the word clouds. The context manager
# closes the image file as soon as its pixels have been copied into the array,
# instead of leaving the handle open until garbage collection.
with Image.open("mask.png") as mask_image:
    maskArray = npy.array(mask_image)

In [3]:
# Load the processed tweets (tab-separated file)

sent_df = pd.read_csv(
    "Processed Data.csv",
    sep="\t",
)

In [40]:
# Keep only the date and hashtag columns, and drop tweets without any hashtag
hashtag_columns = ['date', 'hashtags']
hash_df = sent_df[hashtag_columns].dropna(subset=['hashtags'])
hash_df.sample(5)

Unnamed: 0,date,hashtags
106147,2020-04-03,#IndiaFightsCorona
97350,2020-04-02,#Corona #CoronavirusOutbreakindia
541724,2020-06-12,#ThankYouHeroes #Toys #Kids #MommyBabyTimes
528252,2020-06-11,#GlobalGradShow
100381,2020-04-03,#GlobalCeasefire #Covid19Pandemic #Coronavirus


In [6]:
# Split the data into the different lockdown phases, find the top 10 hashtags
# of each phase and save the combined table

def top_hashtags(frame, phase, start, end, n=10):
    """Return the `n` most frequent hashtags of one phase.

    Parameters
    ----------
    frame : DataFrame with a 'date' column and a 'hashtags' column holding
        whitespace-separated hashtags.
    phase : label written into the "Phase" column of the result.
    start, end : inclusive date bounds, compared against 'date' with >= / <=
        (the bounds used below are ISO 'YYYY-MM-DD' strings).
    n : number of hashtags to keep.

    Returns
    -------
    DataFrame with the columns "Phase", "hashtag" and "value" (the count),
    sorted by descending count.
    """
    in_phase = frame[(frame['date'] >= start) & (frame['date'] <= end)]
    # One row per hashtag; explode() avoids building the wide, mostly-empty
    # frame that split(expand=True).stack() creates.
    counts = in_phase['hashtags'].str.split().explode().value_counts().head(n)
    # Name the columns explicitly: the default names produced by
    # value_counts()/reset_index() differ between pandas versions, so renaming
    # the old defaults (0 / "index") is not reliable.
    top = counts.rename_axis("hashtag").reset_index(name="value")
    top.insert(0, "Phase", phase)
    return top


lockdown1 = top_hashtags(hash_df, "LD1", '2020-03-25', '2020-04-14')
lockdown2 = top_hashtags(hash_df, "LD2", '2020-04-15', '2020-05-03')
lockdown3 = top_hashtags(hash_df, "LD3", '2020-05-04', '2020-05-17')
lockdown4 = top_hashtags(hash_df, "LD4", '2020-05-18', '2020-05-31')
unlock1 = top_hashtags(hash_df, "Unlock1", '2020-06-01', '2020-06-14')

combined_df = pd.concat([lockdown1,lockdown2,lockdown3,lockdown4,unlock1],ignore_index=True)
combined_df.to_csv("Hashtag data.csv",index=False)

## Wordcloud

In [12]:
# Read the data

df = pd.read_csv("Processed Data.csv",sep='\t')
# Missing values are read from the CSV as NaN. Replace them with empty strings
# before casting, otherwise astype(str) turns them into the literal word "nan",
# which would then be counted as a word in the word clouds.
df['processed_text'] = df['processed_text'].fillna('').astype(str)
df.sample(3)

Unnamed: 0,date,time,username,to,replies,retweets,favorites,text,mentions,hashtags,id,permalink,processed_text
508701,2020-06-07,15:35:48,AnshGup10217390,,0,0,1,@PMOIndia @myogioffice @nitin_gadkari Dear sir...,@PMOIndia @myogioffice @nitin_gadkari,,1269654299409289216,https://twitter.com/AnshGup10217390/status/126...,USER_MENTION USER_MENTION USER_MENTION dear si...
481326,2020-05-31,10:24:02,jay_arrah,,0,1,3,Two educational institutions have become pione...,@achyuta_samanta @KIITUniversity @kissfoundation,#KIITKISSFightsCovid19,1267039125829382144,https://twitter.com/jay_arrah/status/126703912...,two educational institution become pioneer con...
443442,2020-05-22,12:35:07,danizaydi,,2,12,29,DANI’S CORONA RAYA COMING THIS EID 2020,,,1263810621604163584,https://twitter.com/danizaydi/status/126381062...,corona raya coming eid


In [13]:
# Keep only the columns needed for the word clouds

cloud_columns = ['date', 'processed_text']
cloud_df = df[cloud_columns]
cloud_df

Unnamed: 0,date,processed_text
0,2020-03-25,yeah missing freedom life covid19
1,2020-03-25,contribute cm relief fund help delhi govt figh...
2,2020-03-25,bhai assalamualaikum possible please call bhai...
3,2020-03-25,bold adress nation activity banned except esse...
4,2020-03-25,please understand important stay home responsi...
...,...,...
582685,2020-06-14,URL
582686,2020-06-14,covid
582687,2020-06-14,italy face two new coronavirus outbreak
582688,2020-06-14,india become top none modi reign india became ...


In [19]:
# Initialise WordCloud

# English stop words plus the placeholder tokens found in the processed text;
# add any further custom stop words to the second set.
stopwords = set(STOPWORDS) | {"USER_MENTION", "URL"}

wordcloud = WordCloud(
    background_color="#97CAEF",
    stopwords=stopwords,
    collocations=False,
    mask=maskArray,
)

In [20]:
# Split the data into the different lockdown phases, generate a word cloud for
# each phase (plus one for the whole period) and save the images

def rows_between(frame, start, end):
    """Return the rows of `frame` whose 'date' lies in [start, end] (inclusive)."""
    return frame[(frame['date'] >= start) & (frame['date'] <= end)]


def save_cloud(cloud, texts, filename):
    """Render one word cloud from a Series of texts and write it to `filename`.

    The texts are joined with single spaces into one string, which is passed
    to `cloud.generate()`; the image is then written with `cloud.to_file()`.
    Joining the Series directly replaces the earlier per-character
    generator/re-join round trip, which produced exactly the same string.

    Returns the joined string that was fed to the word cloud.
    """
    joined = " ".join(texts.dropna().astype(str))
    cloud.generate(joined)
    cloud.to_file(filename)
    return joined


lockdown1 = rows_between(cloud_df, '2020-03-25', '2020-04-14')
lockdown1text = save_cloud(wordcloud, lockdown1['processed_text'], "Lockdown1 cloud.png")

lockdown2 = rows_between(cloud_df, '2020-04-15', '2020-05-03')
lockdown2text = save_cloud(wordcloud, lockdown2['processed_text'], "Lockdown2 cloud.png")

lockdown3 = rows_between(cloud_df, '2020-05-04', '2020-05-17')
lockdown3text = save_cloud(wordcloud, lockdown3['processed_text'], "Lockdown3 cloud.png")

lockdown4 = rows_between(cloud_df, '2020-05-18', '2020-05-31')
lockdown4text = save_cloud(wordcloud, lockdown4['processed_text'], "Lockdown4 cloud.png")

unlock1 = rows_between(cloud_df, '2020-06-01', '2020-06-14')
unlock1text = save_cloud(wordcloud, unlock1['processed_text'], "Unlock1 cloud.png")

# General data (all phases together)
generaltext = save_cloud(wordcloud, cloud_df['processed_text'], "General cloud.png")

<wordcloud.wordcloud.WordCloud at 0x1d8b7991f10>