In [None]:
import pandas as pd
import numpy as np
import re
import dateparser
from collections import Counter
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
def read_file(file):
    '''Read a WhatsApp chat export into a list of lines.

    Parameters
    ----------
    file : str or path-like
        Path of the text file to read; it is decoded as UTF-8.

    Returns
    -------
    list of str
        The lines of the file in order, without their line terminators.
    '''
    # The context manager closes the handle even when reading fails;
    # previously the file was opened and never closed.
    with open(file, 'r', encoding='utf-8') as handle:
        return handle.read().splitlines()

In [None]:
# Load the exported group chat; the bare expression shows how many raw lines were read.
CHAT_FILE = "WhatsApp Chat with Funsters.txt"
chat = read_file(CHAT_FILE)
len(chat)

In [None]:
# Preview every line that contains the text "joined using this".
join = list(filter(lambda entry: "joined using this" in entry, chat))
join

In [None]:
#Strip leading/trailing whitespace from every line
chat = list(map(str.strip, chat))  # trim whitespace at both ends of each line
print("length of chat is:", len(chat), sep="\n")

In [None]:
# Drop the lines that look like "joined"/"added" notifications.
# NOTE(review): these are plain substring tests, so an ordinary message that
# merely contains "added" or "joined using this" is discarded as well.
NOTIFICATION_MARKERS = ("joined using this", "added")
clean_chat = [
    entry for entry in chat
    if not any(marker in entry for marker in NOTIFICATION_MARKERS)
]
print(len(clean_chat))

In [None]:
# Further cleaning: keep only lines longer than one character.
# This removes empty lines (and single-character lines too).
clean_chat = list(filter(lambda entry: len(entry) > 1, clean_chat))
print("length of clean_chat is:", len(clean_chat), sep="\n")

In [None]:
# Preview the lines that end with "left".
left = list(filter(lambda entry: entry.endswith("left"), clean_chat))
left

In [None]:
# Drop attached-contact lines, recognised by the substring "vcf".
clean_chat = list(filter(lambda entry: "vcf" not in entry, clean_chat))
print(len(clean_chat))

In [None]:
# Remove the lines that end with "left".
clean_chat = list(filter(lambda entry: not entry.endswith("left"), clean_chat))
print(len(clean_chat))

In [None]:
#Merge messages that belong together
# Flow:
# A line that starts with digits followed by a slash (e.g. "12/") opens with a
# date, so it begins a new message and is appended to msgs.
# Any other line continues the previous message and is joined onto it with ". ".
NEW_MESSAGE = re.compile(r"\d+/")  # raw string: "\d" is not a valid str escape

msgs = [] #message container
for line in clean_chat:
    # .match() anchors at the start of the line, like the former "\A".
    if NEW_MESSAGE.match(line) or not msgs:
        # `not msgs` covers a chat whose very first line is not date-stamped;
        # the earlier code indexed an empty list there and raised IndexError.
        msgs.append(line)
    else:
        # Extend the most recent message in place instead of append + pop.
        msgs[-1] = msgs[-1] + ". " + line
len(msgs)

In [None]:
# Skip the first two merged entries, then preview the next ten.
# NOTE(review): re-running this cell drops two more entries each time.
msgs = msgs[2:]
msgs[:10]

In [None]:
# Time of each message: the segment after the first comma, cut at its first
# hyphen, with surrounding spaces removed.
time = [entry.split(',')[1].split('-')[0].strip(' ') for entry in msgs]
print("length of time is:", len(time), sep="\n")
print(time[0])

In [None]:
# Date of each message: everything before the first comma.
date = [entry.partition(',')[0] for entry in msgs]
print(len(date))
print(date[0])

In [None]:
# Sender of each message: the text between the first '-' and the next ':'.
# Assumes the date/time prefix itself contains no '-' (dates use '/').
name = []
for entry in msgs:
    try:
        # maxsplit=1 keeps hyphens inside the sender intact; a plain
        # split('-')[1] cut a name such as "Anne-Marie" down to "Anne".
        sender = entry.split('-', 1)[1].split(':', 1)[0].strip()
    except IndexError:
        # The line contains no '-' at all.
        sender = 'Missing Name'
    name.append(sender)
print(len(name))
print(name[0])

In [None]:
# Text of each message: everything after the second ':'.
# Assumes the first ':' belongs to the time (HH:MM) and the second ends the sender.
content = []
for entry in msgs:
    try:
        # maxsplit=2 keeps the whole remainder; an unlimited split(':')[2]
        # truncated any message that itself contains ':' (URLs, clock times).
        text = entry.split(':', 2)[2]
    except IndexError:
        # Fewer than two ':' in the line, so there is no message text.
        text = 'Missing Text'
    content.append(text)
print(len(content))
print(content[0])

In [None]:
# Assemble the parsed fields into one frame, one row per message.
column_names = ['Date', 'Time', 'Name', 'Content']
rows = list(zip(date, time, name, content))
df = pd.DataFrame(rows, columns=column_names)
df.head()

In [None]:
# Keep only rows for which both a sender and a message text were parsed.
has_text = df["Content"] != 'Missing Text'
has_name = df['Name'] != 'Missing Name'
# Chaining reset_index returns a fresh frame. The former in-place reset on a
# filtered slice kept the frame flagged as a copy, so later column assignments
# could emit SettingWithCopyWarning.
df = df[has_text & has_name].reset_index(drop=True)
df.head()

In [None]:
# Join the date and time strings and parse them into timestamps.
# NOTE(review): no format/dayfirst is given, so the day-vs-month order is left
# to pandas' inference — confirm against the export's date layout.
timestamp_text = df['Date'] + ' ' + df['Time']
df['DateTime'] = pd.to_datetime(timestamp_text)
df['DateTime']

In [None]:
# Name of the weekday (e.g. "Monday") on which each message was sent.
df['weekday'] = df['DateTime'].dt.day_name()

In [None]:
# Characters per message (spaces and punctuation included).
df['Letter_Count'] = df['Content'].str.len()
# Words per message. split() without an argument splits on runs of whitespace
# and ignores leading/trailing blanks; split(' ') yielded empty strings for
# those and therefore over-counted.
df['Word_Count'] = df['Content'].str.split().str.len()

In [None]:
# Hour of each message: the part of the Time string before the first ':'
# (e.g. "12" in "12:15"). The value stays a string.
df['Hour'] = df['Time'].str.split(':').str[0]

In [None]:
# Replace the Date strings with date-only timestamps derived from DateTime,
# then show the frame's columns and dtypes.
from datetime import datetime
df['Date'] = pd.to_datetime(df['DateTime'].dt.date)
df.info()

In [None]:
# Total characters sent per person per day.
df_date = (
    df.groupby(['Date', 'Name'])['Letter_Count']
    .sum()
    .reset_index()
)
df_date.head(10)

In [None]:
# Show the columns, dtypes and non-null counts of the per-day summary frame.
df_date.info()

In [None]:
# Attach to every row the number of messages its sender posted on that date.
names_by_day_and_sender = df.groupby(['Date', 'Name'])['Name']
df['Frequency'] = names_by_day_and_sender.transform('count')

In [None]:
# One row per distinct (Date, Name, Frequency) combination.
df_new = df.loc[:, ['Date', 'Name', 'Frequency']].drop_duplicates()

In [None]:
# Rows of the per-day summary for 2019-02-04.
# The mask has to be built from df_date itself: a mask taken from df has a
# different length/index and is aligned by label, which picks unrelated rows.
dff = df_date.loc[df_date['Date'] == '2019-02-04']
dff

In [None]:
# Count how many rows each name has in dff, most frequent first.
# NOTE(review): dff comes from a per-(Date, Name) summary, so each name may
# occur only once here — confirm this is the intended "Frequency".
name_counts = dff['Name'].value_counts(ascending=False)
dff_02_04_2019_top = name_counts.reset_index()
dff_02_04_2019_top.columns = ['Name', 'Frequency']
dff_02_04_2019_top

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.animation as animation
from IPython.display import HTML

In [None]:
# Quick horizontal bar chart: one bar per name, bar length = Frequency.
fig, ax = plt.subplots(figsize=(15, 8))
ax.barh(y=dff_02_04_2019_top['Name'], width=dff_02_04_2019_top['Frequency'])

In [None]:
# Bar palette (seven colours) and the distinct names found in the top table.
colors = [
    '#adb0ff', '#ffb3ff', '#90d595', '#e48381',
    '#aafbff', '#f7bb5f', '#eafb50',
]
name = dff_02_04_2019_top['Name'].unique().tolist()
name

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))
dff = dff_02_04_2019_top[::-1]   # reverse the rows so the first one is drawn at the top
# Pass the palette directly; matplotlib assigns one colour per bar.
ax.barh(dff['Name'], dff['Frequency'], color=colors)
# Label each bar from the SAME (reversed) frame that was plotted. Iterating the
# un-reversed table attached every label and value to the wrong bar.
# The loop variable is called `label` so it does not overwrite the global `name`.
for i, (value, label) in enumerate(zip(dff['Frequency'], dff['Name'])):
    ax.text(value, i, label, ha='right')  # sender name, ending at the bar tip
    ax.text(value, i, value, ha='left')   # count, starting at the bar tip
# Date caption at the right-hand side of the axes
ax.text(1, 0.4, '2019-02-04', transform=ax.transAxes, size=24, ha='right')

In [None]:
fig, ax = plt.subplots(figsize=(15, 8))
def draw_barchart(date):
    '''Draw the message-frequency bar chart for one day on the global ``ax``.

    Parameters
    ----------
    date : str, numpy.datetime64 or pandas.Timestamp
        Day to plot; the rows of ``df_new`` whose ``Date`` equals it are used.

    The axes are cleared first, so the function can be called repeatedly
    (for example once per animation frame). Nothing is returned.
    '''
    # Ascending sort + tail(10): the ten largest counts, the biggest drawn last (on top).
    dff = df_new[df_new['Date'].eq(date)].sort_values(by='Frequency', ascending=True).tail(10)

    ax.clear()
    # Pass the palette directly; matplotlib assigns one colour per bar.
    ax.barh(dff['Name'], dff['Frequency'], color=colors)
    for i, (value, name) in enumerate(zip(dff['Frequency'], dff['Name'])):
        ax.text(value, i, name, ha='right', va='bottom')   # sender name
        ax.text(value, i, value, ha='left', va='center')   # message count
    # pd.Timestamp yields the same YYYY-MM-DD caption for str, datetime64 and
    # Timestamp input; str(date).split('T') left the time part on a Timestamp.
    date_label = pd.Timestamp(date).strftime('%Y-%m-%d')
    ax.text(1, 0.4, date_label, transform=ax.transAxes, color='#777777', size=30, ha='right', weight=800)
    ax.text(0, 1.06, 'Frequency of Messages', transform=ax.transAxes, size=12, color='#777777')
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    ax.set_yticks([])
    ax.margins(0, 0.01)
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    ax.text(0, 1.12, 'Top 10 Users that sent more messages',transform=ax.transAxes, size=24, weight=600, ha='left')
    # Hide the frame on this axes explicitly rather than on pyplot's implicit
    # "current" axes, which need not be `ax`.
    ax.set_frame_on(False)
draw_barchart('2019-02-04')

In [None]:
# Distinct dates in df_new, in order of first appearance.
unique_dates = df_new['Date'].unique()
dates = list(unique_dates)
dates[:5]

In [None]:
import matplotlib.animation as animation
from IPython.display import HTML
# Build the animation on a fresh figure: draw_barchart is called once per entry
# of `dates`, and the result is written to video.mp4.
fig, ax = plt.subplots(figsize=(15, 8))
animator = animation.FuncAnimation(fig=fig, func=draw_barchart, frames=dates)
#HTML(animator.to_jshtml())
animator.save(filename='video.mp4')