# Bar chart race Generator
With this notebook you can generate some bar chart race of the most common words in prevously collected tweets.
Data are taken from a csv file, cleaned and processed

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import datetime

### Loading the data

In [2]:
csvPath = './Data/tweetsUS.csv' #input Path
header_list = ["Id", "Date", "Lang", "Text"] #csv headers
tweets = pd.read_csv(csvPath, names=header_list)
tweets.head()

Unnamed: 0,Id,Date,Lang,Text
0,1227372076505264128,2020-02-11 23:21:21+00:00,en,b'UPDATE: 3-year-old Annabel Wucinski was take...
1,1227670752570347522,2020-02-12 19:08:11+00:00,en,b'Humanity must fight #COVID-19 AS ONE/STOP sa...
2,1228069539751092226,2020-02-13 21:32:49+00:00,en,b'@SenTomCotton For clarity the official name ...
3,1227824948699377664,2020-02-13 05:20:54+00:00,en,b'Interview w/ @CGTNOfficial on difficulties i...
4,1227757079282081793,2020-02-13 00:51:12+00:00,en,b'Coronavirus: Up To 24 Days Before Symptoms S...


### Selecting the wanted time period

A function to select a slice of a dataframe of a certain time period is defined. This will be useful later

In [3]:
def SliceAt(df, start_date = '2020-04-15', end_date = '2021-04-20', col = 'Date'):
    mask = (df[col] >= start_date) & (df[col] < end_date)
    return df.loc[mask]

### Cleaning the text from Social media stuff

In [4]:
def CleanText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) #Remove tags
    text = re.sub(r'RT[\s]+', '', text) #remove ReTweets
    text = re.sub(r'https?:\/\/\S+', '', text) #remove links
    text = re.sub(r'\\\S+', '', text) #Remove emojies and sp chars
    text = re.sub(r'#', '', text) #Remove emojies and sp chars
    text = re.sub(r'b\'', '', text) #remove the "b'"
    return text

In [5]:
tweets['ParsedText'] = tweets['Text'].apply(CleanText).apply(lambda x: x.lower())
tweets.head()

Unnamed: 0,Id,Date,Lang,Text,ParsedText
0,1227372076505264128,2020-02-11 23:21:21+00:00,en,b'UPDATE: 3-year-old Annabel Wucinski was take...,update: 3-year-old annabel wucinski was taken ...
1,1227670752570347522,2020-02-12 19:08:11+00:00,en,b'Humanity must fight #COVID-19 AS ONE/STOP sa...,humanity must fight covid-19 as one/stop sayin...
2,1228069539751092226,2020-02-13 21:32:49+00:00,en,b'@SenTomCotton For clarity the official name ...,for clarity the official name is covid-19
3,1227824948699377664,2020-02-13 05:20:54+00:00,en,b'Interview w/ @CGTNOfficial on difficulties i...,interview w/ on difficulties in fighting agai...
4,1227757079282081793,2020-02-13 00:51:12+00:00,en,b'Coronavirus: Up To 24 Days Before Symptoms S...,coronavirus: up to 24 days before symptoms sta...


### Removing stopwords to avoid trivial results

In [6]:
import pandas as pd
import nltk.corpus
nltk.download('stopwords') #download most common stopwords
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ginef\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
custom_stopwords = ['b', 'covid', 'pandemic', 'coronavirus', 'corona', 'covid19', 'amp', 'covid_19', '19']
stop_words = stopwords.words('english') + custom_stopwords

Word = re.compile(r'\w+')

def Check(x):
    words = Word.findall(x)
    outPut = ''
    for word in words:
        if not word.lower() in stop_words:
            outPut += ' ' + word
    return outPut

tweets['CleanedText'] = tweets['ParsedText'].apply(Check)
tweets.head()

Unnamed: 0,Id,Date,Lang,Text,ParsedText,CleanedText
0,1227372076505264128,2020-02-11 23:21:21+00:00,en,b'UPDATE: 3-year-old Annabel Wucinski was take...,update: 3-year-old annabel wucinski was taken ...,update 3 year old annabel wucinski taken back...
1,1227670752570347522,2020-02-12 19:08:11+00:00,en,b'Humanity must fight #COVID-19 AS ONE/STOP sa...,humanity must fight covid-19 as one/stop sayin...,humanity must fight one stop saying china wuh...
2,1228069539751092226,2020-02-13 21:32:49+00:00,en,b'@SenTomCotton For clarity the official name ...,for clarity the official name is covid-19,clarity official name
3,1227824948699377664,2020-02-13 05:20:54+00:00,en,b'Interview w/ @CGTNOfficial on difficulties i...,interview w/ on difficulties in fighting agai...,interview w difficulties fighting 20 mins con...
4,1227757079282081793,2020-02-13 00:51:12+00:00,en,b'Coronavirus: Up To 24 Days Before Symptoms S...,coronavirus: up to 24 days before symptoms sta...,24 days symptoms start showing via outofcontr...


### Counting the frequency of words in a dataframe [Function]

In [8]:
def countWords(df, col = 'CleanedText', length = 100):
    wordCount = df[col].str.split(expand=True).stack().value_counts().reset_index().head(length)
    wordCount.columns = ['Word', 'Frequency'] 
    return wordCount
    
countWords(tweets).head()

Unnamed: 0,Word,Frequency
0,people,6331
1,get,6267
2,new,4730
3,like,4603
4,us,4570


### Generating the proper dataset with pivoting

In [9]:
start_date = datetime.date(2020, 4, 15)
end_date = datetime.date(2021, 4, 15)
timeDelta = datetime.timedelta(days = 1)

barNum = 20
col = 'CleanedText'

Iterate over each time period, slice the dataframe, count the words.

In [10]:
currentDate = start_date
wordCount = countWords(tweets)

while currentDate + timeDelta < end_date:
    twSlice = SliceAt(tweets, currentDate.strftime('%Y-%m-%d'), (currentDate + timeDelta).strftime('%Y-%m-%d'))
    CountedWords = countWords(twSlice, col=col)
    CountedWords['Date'] = currentDate.strftime('%Y-%m-%d')
    wordCount = pd.concat([wordCount, CountedWords], ignore_index=True)
    currentDate += timeDelta

wordCount = wordCount.pivot_table(values='Frequency', index= ['Date'], columns=['Word'])
wordCount = wordCount.fillna(0)
wordCount.head()


Word,0,00,000,1,10,100,1000s,100k,10am,10k,...,yr,yrs,yuma,zealand,zep,zero,zerobail,zinc,zoo,zoom
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-15,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04-16,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04-17,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04-18,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04-19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Aggregate the datset in order to plot the integral of all the word ever tweeted up to the current date

In [11]:
wordCount.iloc[:, 0:-1] = wordCount.iloc[:, 0:-1].cumsum()
wordCount.head()

Word,0,00,000,1,10,100,1000s,100k,10am,10k,...,yr,yrs,yuma,zealand,zep,zero,zerobail,zinc,zoo,zoom
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-15,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04-16,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04-17,0.0,0.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04-18,0.0,0.0,0.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04-19,0.0,0.0,0.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Avoid overcrowding and resource waste by salecting only the most relevant words

In [12]:
topWords = set()

for index, row in wordCount.iterrows():
    topWords |= set(row[row > 0].sort_values(ascending=False).head(barNum).index)

wordCount = wordCount[list(topWords)]
wordCount.head()

Word,quarantine,back,new,us,home,empty,masks,time,day,stayhome,...,hero,get,still,challenge,help,many,mask,health,people,trump
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-15,32.0,9.0,23.0,19.0,11.0,0.0,30.0,14.0,18.0,25.0,...,0.0,27.0,0.0,0.0,14.0,10.0,33.0,8.0,23.0,47.0
2020-04-16,70.0,20.0,62.0,49.0,31.0,0.0,41.0,42.0,36.0,48.0,...,0.0,44.0,0.0,0.0,36.0,21.0,54.0,19.0,50.0,55.0
2020-04-17,111.0,33.0,80.0,63.0,43.0,0.0,49.0,73.0,58.0,63.0,...,0.0,72.0,0.0,0.0,52.0,31.0,71.0,26.0,76.0,68.0
2020-04-18,145.0,47.0,104.0,94.0,71.0,9.0,62.0,108.0,75.0,96.0,...,0.0,97.0,17.0,10.0,63.0,49.0,71.0,26.0,104.0,100.0
2020-04-19,187.0,47.0,139.0,115.0,91.0,38.0,62.0,134.0,94.0,113.0,...,24.0,112.0,17.0,36.0,84.0,61.0,89.0,26.0,128.0,155.0


### Generating the BarChartRace

In [13]:
import bar_chart_race as bcr

In [15]:
bcr.bar_chart_race(df = wordCount,
                   filename = 'words.mp4',
                   orientation='h',
                   sort='desc',
                   n_bars = barNum,

                   steps_per_period= 10,
                   period_length = 200,

                   bar_size=0.95,
                   figsize=(16, 9),
                   #cmap= 'Pastel1',

                   title='Frequency of words in tweets',
                   title_size = 20,
                   bar_label_size=15,
                   tick_label_size=15,
                   dpi= 144,
                   bar_kwargs={
                       'alpha': 1
                   },
                   period_label={
                       'x': 0.95,
                       'y': 0.15,
                       'va': 'center',
                       'ha': 'right',
                       'size': 40,
                       'weight': 'bold'
                   }
                   )

  ax.set_yticklabels(self.df_values.columns)
  ax.set_xticklabels([max_val] * len(ax.get_xticks()))
