In [17]:
import pandas as pd
import numpy as np

In [18]:
import findspark
# Run before the pyspark imports in the next cell.
# NOTE(review): findspark is expected to make the local Spark installation
# importable (per the findspark docs) -- confirm for this environment.
findspark.init()

In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, udf, lit

# Build the SparkSession used to read the CSV; getOrCreate() hands back an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('read csv files')
    .getOrCreate()
)

In [20]:
# Load the April topic counts. With header enabled the first line supplies the
# column names; no schema is given, so every column arrives as a string.
csv_df = spark.read.option("header", "true").csv("AprTopics.csv")

In [21]:
# Collect the Spark DataFrame into a pandas DataFrame; all reshaping below is
# done in pandas.
apr_topics = csv_df.toPandas()

In [22]:
# Preview the first rows: one row per (created_date, dominant_topic) with the
# topic's keyword string and its tweet count.
apr_topics.head()

Unnamed: 0,created_date,dominant_topic,dominant_topic_keywords,count
0,2021-04-16,topic1,"sun, view, collection, food, sat, women, full,...",62
1,2021-04-15,topic2,"edwardbarber, like, love, edward, barber, day,...",612
2,2021-04-26,topic2,"edwardbarber, like, love, edward, barber, day,...",491
3,2021-04-01,topic0,"whatshappeninginmyanmar, aprcoup, posted, phot...",367
4,2021-04-29,topic5,"free, zerowaste, part, different, cool, future...",45


In [23]:
# Check dtypes and null counts; every column is still `object` (string) at
# this point, so dates and counts need converting before use.
apr_topics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   created_date             180 non-null    object
 1   dominant_topic           180 non-null    object
 2   dominant_topic_keywords  180 non-null    object
 3   count                    180 non-null    object
dtypes: object(4)
memory usage: 5.8+ KB


In [24]:
# Every column came through as a string, so restore usable dtypes:
# created_date becomes a datetime and count becomes numeric.
apr_topics = apr_topics.assign(
    created_date=pd.to_datetime(apr_topics['created_date']),
    count=pd.to_numeric(apr_topics['count']),
)

Transform the Data

In [25]:
# Reshape to wide form: one row per day, one column per topic (labelled by its
# keyword string), each cell holding that day's tweet count.
# aggfunc='sum' is explicit because pivot_table defaults to the mean, which
# would silently average -- rather than total -- any duplicate
# (created_date, dominant_topic_keywords) rows. For counts that are cumulated
# later, the total is the meaningful aggregate.
df = apr_topics.pivot_table(
    values='count',
    index=['created_date'],
    columns='dominant_topic_keywords',
    aggfunc='sum',
)
df.head()

dominant_topic_keywords,"btc, cute, chef, crypto, pretty, greed, april, silver, bistro, good","edwardbarber, like, love, edward, barber, day, good, get, happy, life","free, zerowaste, part, different, cool, future, nftcollectors, nftcommunity, city, miss","like, people, amp, see, need, please, get, know, help, really","sun, view, collection, food, sat, women, full, field, available, interpol","whatshappeninginmyanmar, aprcoup, posted, photo, military, terrorists, junta, abducted, myanmar, amp"
created_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-04-01,80,568,89,399,59,367
2021-04-02,83,649,123,327,63,401
2021-04-03,79,732,137,359,49,426
2021-04-04,85,784,111,351,65,405
2021-04-05,91,555,88,361,48,256


Replace NaN Values and Sort by Date

In [26]:
# Day/topic combinations missing from the input show up as NaN after the
# pivot; count them as zero tweets, then put the rows in chronological order.
# The earlier sort_values over every column was dropped: sort_index ran
# immediately after it and fully determined the final row order, so it was
# dead work. Chaining also avoids mutating the frame in place.
df = df.fillna(0).sort_index()


In [27]:
# Preview the filled, date-sorted daily counts (still per-day, not cumulative).
df.head()

dominant_topic_keywords,"btc, cute, chef, crypto, pretty, greed, april, silver, bistro, good","edwardbarber, like, love, edward, barber, day, good, get, happy, life","free, zerowaste, part, different, cool, future, nftcollectors, nftcommunity, city, miss","like, people, amp, see, need, please, get, know, help, really","sun, view, collection, food, sat, women, full, field, available, interpol","whatshappeninginmyanmar, aprcoup, posted, photo, military, terrorists, junta, abducted, myanmar, amp"
created_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-04-01,80,568,89,399,59,367
2021-04-02,83,649,123,327,63,401
2021-04-03,79,732,137,359,49,426
2021-04-04,85,784,111,351,65,405
2021-04-05,91,555,88,361,48,256


Compute Cumulative Counts per Topic

In [28]:
# Turn daily counts into running totals down the date index, so each row is
# the cumulative tweet count per topic up to that day.
# Caution: this overwrites `df`, so re-running the cell cumulates the totals
# again -- re-run from the pivot cell if this cell needs to be repeated.
df = df.cumsum()
df.head()

dominant_topic_keywords,"btc, cute, chef, crypto, pretty, greed, april, silver, bistro, good","edwardbarber, like, love, edward, barber, day, good, get, happy, life","free, zerowaste, part, different, cool, future, nftcollectors, nftcommunity, city, miss","like, people, amp, see, need, please, get, know, help, really","sun, view, collection, food, sat, women, full, field, available, interpol","whatshappeninginmyanmar, aprcoup, posted, photo, military, terrorists, junta, abducted, myanmar, amp"
created_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-04-01,80,568,89,399,59,367
2021-04-02,163,1217,212,726,122,768
2021-04-03,242,1949,349,1085,171,1194
2021-04-04,327,2733,460,1436,236,1599
2021-04-05,418,3288,548,1797,284,1855


In [29]:
# One-time setup (kept commented out): %pip install bar_chart_race==0.1.0

In [30]:
import bar_chart_race as bcr
# Record the installed bar_chart_race version alongside the results.
bcr.__version__

'0.1.0'

In [18]:
# Animate the cumulative per-topic counts and save the result as
# 'Apr_tweet.mp4' (relative path). n_bars=6 matches the six topic columns;
# sort='desc' orders bars by value.
bcr.bar_chart_race(
    df=df,
    filename='Apr_tweet.mp4',
    title='tweets number under each topic in Apr',
    n_bars=6,
    sort='desc',
)

In [32]:
# Same animation as the saved MP4, but with filename=None nothing is written
# to disk. The call stays the cell's last expression so that whatever it
# returns is displayed inline.
# NOTE(review): per the bar_chart_race docs this returns embeddable HTML --
# confirm for the installed version.
bcr.bar_chart_race(
    df=df,
    filename=None,
    title='tweets number under each topic in Apr',
    n_bars=6,
    sort='desc',
)