In [None]:
import pandas as pd
import numpy as np
import datetime
from matplotlib import pyplot as plt

Make Initial Dataframe

In [None]:
# Load the exported chat as one row per non-blank line, in a single column named 0.
# pd.read_fwf is avoided here: it infers fixed-width column boundaries from the
# first rows of the file, so a free-text chat line can be split across several
# columns or cut off. Reading the raw lines keeps every line whole.
# 'utf-8-sig' behaves like UTF-8 but also drops a leading byte-order mark, which
# would otherwise hide the '[' at the start of the first line.
with open('_chat.txt', encoding='utf-8-sig') as chat_file:
    chat_lines = [raw_line.strip() for raw_line in chat_file]

whatsapp_df = pd.DataFrame({0: [chat_line for chat_line in chat_lines if chat_line]})

whatsapp_df

Remove Media

In [None]:
# Flag every line containing the word 'omitted' and keep only the unflagged lines.
# Note: this matches the word anywhere in the line, not only in media placeholders.
mentions_omitted = whatsapp_df[0].str.contains('omitted')
whatsapp_df = whatsapp_df.loc[~mentions_omitted]
whatsapp_df

Remove Calls

In [None]:
# Flag the two call-notification phrasings separately, then drop any line that
# matches either one.
started_video_call = whatsapp_df[0].str.contains(' started a video call')
started_call = whatsapp_df[0].str.contains(' started a call')
whatsapp_df = whatsapp_df.loc[~(started_video_call | started_call)]
whatsapp_df

Remove Secondary Lines of Multiline Messages (Temporary Solution)

In [None]:
# Keep only lines whose first character is '['; every other line is discarded.
# na=False makes empty or non-string entries count as "does not start with '['".
opens_with_bracket = whatsapp_df[0].str.startswith('[', na=False)
whatsapp_df = whatsapp_df.loc[opens_with_bracket]
whatsapp_df

Reorganize Columns to Contain the Message and the Datetime in Separate Columns, Removing the Other Columns

In [None]:
# Split each line once at the first '] ': the left part is the timestamp (still
# carrying its leading '['), the right part is the rest of the line.
line_parts = whatsapp_df[0].str.split('] ', n=1, expand=True)

# Build a fresh two-column frame instead of adding columns to the filtered slice
# produced by the earlier cells. Assigning into that slice raises
# SettingWithCopyWarning, and a new frame also removes the need for an in-place
# drop of the leftover raw column(s). The row index is carried over unchanged.
# The format string includes the leading '[' and expects month/day/2-digit-year
# with a 12-hour clock and an AM/PM marker.
whatsapp_df = pd.DataFrame({
    'Datetime': pd.to_datetime(line_parts[0], format='[%m/%d/%y, %I:%M:%S %p'),
    'Message': line_parts[1],
})
whatsapp_df

Add User Column, Removing All Messages Not Sent By A User

In [None]:
# Keep only lines of the form '<user>: <text>'. The filter uses the same ': '
# separator as the split below, so a line that merely contains a colon (with no
# ': ') is no longer kept and turned into a bogus user with a null message.
# regex=False matches the separator literally; na=False treats missing text as
# "no separator".
has_sender = whatsapp_df['Message'].str.contains(': ', regex=False, na=False)

# .copy() gives an independent frame, so adding the 'User' column below does not
# write into a slice of the previous frame (SettingWithCopyWarning).
whatsapp_df = whatsapp_df[has_sender].copy()

# Split once at the first ': ' -> sender on the left, message text on the right.
whatsapp_df[['User', 'Message']] = whatsapp_df['Message'].str.split(': ', n=1, expand=True)
whatsapp_df

Find the User Who Sent the Most Messages

In [None]:
# The first row is WhatsApp's end-to-end-encryption notice, whose 'User' field is
# the group chat name (a later cell reads the name from it and drops it from
# whatsapp_df). Leave it out here so the group itself is not reported as a
# participant and does not inflate the percentage denominator.
participant_rows = whatsapp_df.iloc[1:]

# Count non-null messages per user, ordered from least to most active.
user_df = (
    participant_rows
    .groupby('User', as_index=False)['Message']
    .count()
    .sort_values(by='Message')
    .rename(columns={'Message': 'Messages Sent'})
)

# Share of all participant rows, in percent, rounded to one decimal place.
user_df['Percentages'] = np.round(
    user_df['Messages Sent'].divide(other=(participant_rows.shape[0] / 100)), 1
)
user_df

Extract Group Chat Name

In [None]:
# The first row is the message WhatsApp sends about end-to-end encryption; its
# 'User' field is taken as the group chat name.
group_chat_name = whatsapp_df['User'].iloc[0]

# Remove that notice row so only the remaining rows are analysed below.
# Note: re-running this cell removes one more row each time it is executed.
whatsapp_df = whatsapp_df.iloc[1:]
group_chat_name

Extract Start and End Date for Data

In [None]:
# Format the timestamps of the first and last rows as YYYY-MM-DD; the rows are
# taken by position, so this relies on the frame's existing row order.
start_date, end_date = (
    whatsapp_df['Datetime'].iloc[position].strftime('%Y-%m-%d')
    for position in (0, -1)
)
start_date, end_date

Plot This Data

In [None]:
fig, ax = plt.subplots(figsize=(15,6))
fig.tight_layout(pad=3.0)

# Wedges carry empty labels so no names are drawn around the pie; autopct prints
# each share on its wedge and the legend below carries the user names.
pie_artists = ax.pie(user_df['Percentages'], labels=['' for _ in range(len(user_df))],
                     autopct='%1.1f%%', shadow=True, startangle=90)
wedges = pie_artists[0]  # first element of the return value is the wedge patches

ax.axis('equal')
ax.set_title(f'Percentage of Messages Sent in \'{group_chat_name}\'')

# Pass the wedge handles explicitly. A labels-only legend call pairs labels with
# whatever artists are on the axes purely by order, which is easy to get wrong
# once other patches (such as the shadows) are present.
legend_labels = (user_df['User'] + ' (' + user_df['Percentages'].astype(str) + '%)').tolist()
ax.legend(wedges, legend_labels, loc="upper left")
plt.show()

Find the Hour the Group Chat is Most Active

In [None]:
# Count messages per hour of day (0-23). Only hours that occur in the data get a row.
# The vectorized .dt.hour accessor replaces a row-wise apply whose lambda
# parameter was named `datetime`, shadowing the imported module. .assign returns
# a new frame, so whatsapp_df itself is left untouched.
hour_df = (
    whatsapp_df
    .assign(Hour=whatsapp_df['Datetime'].dt.hour)
    .groupby('Hour', as_index=False)['Message']
    .count()
    .rename(columns={'Message': 'Messages Sent'})
)

Convert Hour into 12 Hour Format

In [None]:
def num_to_hour(hour):
    """Return a 12-hour clock label such as '12 AM' or '3 PM' for an hour number.

    Hours below 12 are labelled 'AM' and all others 'PM'. Hours 0 and 12 are
    both shown as 12; hours above 12 are shown with 12 subtracted.
    """
    meridiem = 'AM' if hour < 12 else 'PM'

    if hour == 0 or hour == 12:
        shown_hour = 12
    elif hour < 12:
        shown_hour = hour
    else:
        shown_hour = hour - 12

    return f'{shown_hour} {meridiem}'

# Replace the numeric hour with its 12-hour label (e.g. 13 -> '1 PM').
hour_df = hour_df.assign(Hour=hour_df['Hour'].apply(num_to_hour))
hour_df

Plot This Data

In [None]:
# Bar chart of message counts per hour label, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(15,6))
fig.tight_layout(pad=3.0)
ax.bar(hour_df['Hour'], hour_df['Messages Sent'])
ax.set_xlabel('Hour')
ax.set_ylabel('Messages Sent')
ax.set_title(f'Total Messages Sent per Hour on {group_chat_name} ({start_date} to {end_date})')
plt.show()

Find What Months the Chat is Most Active

In [None]:
# Count messages per calendar month. sort=False keeps the months in order of
# first appearance in the data rather than January-to-December order. Messages
# from the same month of different years fall into the same group.
# The vectorized .dt.month accessor replaces a row-wise apply whose lambda
# parameter was named `datetime`, shadowing the imported module.
month_df = (
    whatsapp_df
    .assign(Month=whatsapp_df['Datetime'].dt.month)
    .groupby('Month', sort=False, as_index=False)['Message']
    .count()
    .rename(columns={'Message': 'Messages Sent'})
)

# Month number -> English month name, used for the axis labels.
num_to_month = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June',
                7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}
month_df['Month'] = month_df['Month'].map(num_to_month)
month_df

Plot This Data

In [None]:
# Dashed green line with round markers showing message counts per month.
fig, ax = plt.subplots(figsize=(15,6))
fig.tight_layout(pad=3.0)
ax.plot(
    month_df['Month'],
    month_df['Messages Sent'],
    color='green',
    marker='o',
    linestyle='dashed',
    linewidth=2,
    markersize=12,
)
ax.set_xlabel('Month')
ax.set_ylabel('Messages Sent')
ax.set_title(f'Total Messages Sent per Month on {group_chat_name} ({start_date} to {end_date})')
plt.show()