# WhatsApp Chat Analysis

## Importing Libraries

In [2]:
import re

import numpy as np
import pandas as pd

## Chat Analysis

### Parsing Chat File

In [3]:
def isNewMessage(line: str):
    """Tell whether ``line`` is the first line of a new chat message.

    A message header looks like ``<n>/<n>/<yy>, <h>:<mm> AM|PM - <sender>:``,
    so it always contains at least two colons: one inside the time and one
    right after the sender's name.

    Returns a two-item list ``[is_new, index]``. ``index`` is the position of
    the second colon when the line is a new message, and ``0`` otherwise.
    """
    # First colon: expected to be the one inside the time.
    time_colon = line.find(':')
    if time_colon < 0:
        return [False, 0]

    # Second colon: expected to be the one closing the sender's name.
    sender_colon = line.find(':', time_colon + 1)
    if sender_colon < 0:
        return [False, 0]

    # Shape of a message header, anchored at the start of the line.
    header_pattern = '^(([0-9]|([0-9][0-9]))/([0-9]|([0-9][0-9]))/[0-9][0-9], (([0-9]|([0-9][0-9])):[0-9][0-9] (AM|PM)) - .*:)'

    # Only the text up to (and including) the second colon is checked.
    header = line[:sender_colon + 1]
    if re.match(header_pattern, header) is None:
        return [False, 0]

    return [True, sender_colon]

In [4]:
def parseMessage(line: str, second_col_index) -> dict:
    """Split the first line of a message into its date, time, sender and text.

    Args:
        line: A line shaped like ``<date>, <time> - <sender>: <text>``.
        second_col_index: Index in ``line`` of the colon that follows the
            sender's name (the second colon of the line).

    Returns:
        A dict with the keys ``'Date'``, ``'Time'``, ``'Person'`` and
        ``'Message'``; all values are substrings of ``line``.
    """
    # Everything before the sender's colon is the header ...
    metadata = line[:second_col_index]
    # ... and the text starts after the colon and the space that follows it.
    message = line[second_col_index+2:]

    # Split on the FIRST " - " only, so that a sender name which itself
    # contains " - " is kept whole instead of being truncated.
    split1 = metadata.split(" - ", 1)
    person = split1[1]

    split2 = split1[0].split(", ")
    date = split2[0]
    time = split2[1]

    return {'Date': date, 'Time': time, 'Person': person, 'Message': message}

In [5]:
# Read the whole export up front. The context manager guarantees the file is
# closed even if reading fails.
with open('chat_st.txt', 'rt', encoding="utf8") as chat_file:
    lines = chat_file.readlines()

# rows will contain all the messages of the chat.
rows = []

# Hold the current message when a message is spanning several lines
prevMessage: dict = {}

# The first line of the file is skipped.
# NOTE(review): presumably a non-message banner line — confirm against the export.
for line in lines[1:]:
    isNew, col_index = isNewMessage(line)

    if isNew:
        # A new message starts: store the one collected so far and start over.
        # The very first time this stores the empty placeholder, which gives
        # an empty first row; the next cell drops that row.
        rows.append(prevMessage.values())
        prevMessage = parseMessage(line, col_index)
    elif prevMessage:
        # Not a header: the line continues the text of the current message.
        prevMessage['Message'] = prevMessage['Message'] + line
    # Otherwise the line precedes any message header and has nothing to be
    # attached to, so it is ignored instead of raising a KeyError.

# Don't forget to add the last message to rows
rows.append(prevMessage.values())

# Create our chat dataframe for analysis
chat_df = pd.DataFrame(rows, columns=['Date', 'Time', 'Person', 'Message'])

In [6]:
# Discard the first row (it comes from the empty initial `prevMessage` stored
# by the parsing loop) and renumber the remaining rows starting at 0.
chat_df = chat_df.loc[1:].reset_index(drop=True)

### Chat Feature Engineering

In [7]:
# Distinct senders, in order of first appearance in the chat.
people = chat_df['Person'].unique()

In [8]:
# Show the participants found in the chat.
people

array(['Babila Mutia', 'ST'], dtype=object)

#### Date Features

In [9]:
# Parse the date strings with an explicit month/day/two-digit-year format so
# the result never depends on pandas guessing between day-first and
# month-first.
# NOTE(review): month-first is inferred from the parsed output of this
# notebook (first message 2021-10-11, last 2022-10-15) — confirm against the
# raw export if the phone locale changes.
chat_df['Date'] = pd.to_datetime(chat_df['Date'], format='%m/%d/%y')

In [10]:
# Weekday name (e.g. "Monday") of each message's date.
chat_df['DayOfWeek'] = chat_df['Date'].dt.day_name()

In [11]:
# Month name (e.g. "October") of each message's date.
chat_df['MonthName'] = chat_df['Date'].dt.month_name()

In [12]:
# Calendar year of each message's date.
chat_df['Year'] = chat_df['Date'].dt.year

#### Number of words

In [13]:
# Work on an independent copy so the text-only processing below does not
# modify chat_df.
chat_no_media_df = chat_df.copy(deep=True)

In [14]:
# Keep only text messages. A media placeholder normally ends with a newline,
# but the last line of the file may not, so both spellings are matched.
is_media = chat_no_media_df['Message'].isin(['<Media omitted>\n', '<Media omitted>'])

# .copy() makes the filtered frame independent, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
chat_no_media_df = chat_no_media_df[~is_media].copy()

In [15]:
# Split every message on whitespace into a list of tokens.
chat_no_media_df['Words'] = chat_no_media_df['Message'].str.split()

In [16]:
# Token lists of all messages, in row order.
lists = list(chat_no_media_df['Words'])

# Number of tokens in each message.
counts = list(map(len, lists))

# Store the word count next to each message.
chat_no_media_df['NumWords'] = counts

In [17]:
# Show the text-only messages with their derived columns.
chat_no_media_df

Unnamed: 0,Date,Time,Person,Message,DayOfWeek,MonthName,Year,Words,NumWords
0,2021-10-11,5:16 PM,Babila Mutia,I'm back 🌚\n,Monday,October,2021,"[I'm, back, 🌚]",3
1,2021-10-11,5:22 PM,ST,😂😂😂😂😂😂😂\n,Monday,October,2021,[😂😂😂😂😂😂😂],1
2,2021-10-11,5:22 PM,ST,You repaired your phone?\n,Monday,October,2021,"[You, repaired, your, phone?]",4
3,2021-10-11,5:22 PM,Babila Mutia,No\n,Monday,October,2021,[No],1
4,2021-10-11,5:22 PM,Babila Mutia,I couldn't the screen was too expensive\n,Monday,October,2021,"[I, couldn't, the, screen, was, too, expensive]",7
...,...,...,...,...,...,...,...,...,...
7265,2022-10-14,4:29 PM,ST,Eh?\n,Friday,October,2022,[Eh?],1
7266,2022-10-14,4:29 PM,ST,They took the one dollar?\n,Friday,October,2022,"[They, took, the, one, dollar?]",5
7267,2022-10-14,4:29 PM,ST,Let me try and check\n,Friday,October,2022,"[Let, me, try, and, check]",5
7268,2022-10-15,1:18 AM,Babila Mutia,Okay\n,Saturday,October,2022,[Okay],1


### Chat Indicator Calculations

In [18]:
# Number of messages (media included) sent by each participant, plus the total.
num_messages = {}
for person in people:
    sent_by_person = chat_df['Person'] == person
    num_messages[person] = len(chat_df[sent_by_person])
num_messages['total'] = len(chat_df)
num_messages

{'Babila Mutia': 3874, 'ST': 3396, 'total': 7270}

In [19]:
# Number of text-only messages sent by each participant, plus the total.
num_messages_nomedia = {}
for person in people:
    sent_by_person = chat_no_media_df['Person'] == person
    num_messages_nomedia[person] = len(chat_no_media_df[sent_by_person])
num_messages_nomedia['total'] = len(chat_no_media_df)
num_messages_nomedia

{'Babila Mutia': 3338, 'ST': 2808, 'total': 6146}

In [20]:
# Total number of words written by each participant, plus the overall total.
num_words = {}
for person in people:
    sent_by_person = chat_no_media_df['Person'] == person
    num_words[person] = chat_no_media_df.loc[sent_by_person, 'NumWords'].sum()
num_words['total'] = chat_no_media_df['NumWords'].sum()
num_words

{'Babila Mutia': 17289, 'ST': 18481, 'total': 35770}

In [21]:
# Average words per text message. Every participant starts at 0; only those
# who wrote at least one text message get a real average, which avoids
# dividing by zero for the others.
num_words_per_message = dict.fromkeys(people, 0)

text_senders = chat_no_media_df['Person'].unique()
num_words_per_message.update(
    {person: num_words[person] / num_messages_nomedia[person] for person in text_senders}
)

num_words_per_message['total'] = num_words['total'] / num_messages_nomedia['total']

num_words_per_message

{'Babila Mutia': 5.179448771719593,
 'ST': 6.581552706552706,
 'total': 5.82004555808656}

In [22]:
# Media messages per participant = all messages minus text-only messages.
# Iterate over `people` (every sender in the chat) rather than over the
# senders of text messages, so that someone who only ever sent media is
# counted correctly instead of being left at 0.
num_media = {
    person: num_messages[person] - num_messages_nomedia[person]
    for person in people
}

num_media['total'] = num_messages['total'] - num_messages_nomedia['total']

num_media

{'Babila Mutia': 536, 'ST': 588, 'total': 1124}

In [23]:
# Messages per weekday, busiest day first. count() tallies the non-null
# values of every column; the 'Date' count is used for ordering.
messages_per_weekday = chat_df.groupby(by='DayOfWeek', as_index=False).count()
messages_per_weekday.sort_values(by='Date', ascending=False)

Unnamed: 0,DayOfWeek,Date,Time,Person,Message,MonthName,Year
5,Tuesday,1381,1381,1381,1381,1381,1381
4,Thursday,1372,1372,1372,1372,1372,1372
1,Monday,1316,1316,1316,1316,1316,1316
6,Wednesday,1147,1147,1147,1147,1147,1147
0,Friday,938,938,938,938,938,938
3,Sunday,583,583,583,583,583,583
2,Saturday,533,533,533,533,533,533


In [24]:
# Messages per weekday for the first participant, busiest day first.
print(people[0])

first_person_df = chat_df[chat_df['Person'] == people[0]]
first_person_per_day = first_person_df.groupby(by='DayOfWeek', as_index=False).count()
first_person_per_day.sort_values(by='Date', ascending=False)

Babila Mutia


Unnamed: 0,DayOfWeek,Date,Time,Person,Message,MonthName,Year
4,Thursday,722,722,722,722,722,722
5,Tuesday,722,722,722,722,722,722
1,Monday,683,683,683,683,683,683
6,Wednesday,618,618,618,618,618,618
0,Friday,522,522,522,522,522,522
3,Sunday,316,316,316,316,316,316
2,Saturday,291,291,291,291,291,291


In [25]:
# Messages per weekday for the second participant, busiest day first.
print(people[1])

second_person_df = chat_df[chat_df['Person'] == people[1]]
second_person_per_day = second_person_df.groupby(by='DayOfWeek', as_index=False).count()
second_person_per_day.sort_values(by='Date', ascending=False)

ST


Unnamed: 0,DayOfWeek,Date,Time,Person,Message,MonthName,Year
5,Tuesday,659,659,659,659,659,659
4,Thursday,650,650,650,650,650,650
1,Monday,633,633,633,633,633,633
6,Wednesday,529,529,529,529,529,529
0,Friday,416,416,416,416,416,416
3,Sunday,267,267,267,267,267,267
2,Saturday,242,242,242,242,242,242


In [26]:
# Messages per (year, month), busiest month first.
messages_per_month = chat_df.groupby(by=['Year', 'MonthName'], as_index=False).count()
messages_per_month.sort_values(by='Date', ascending=False)

Unnamed: 0,Year,MonthName,Date,Time,Person,Message,DayOfWeek
2,2021,October,983,983,983,983,983
5,2022,February,815,815,815,815,815
1,2021,November,800,800,800,800,800
0,2021,December,775,775,775,775,775
6,2022,January,611,611,611,611,611
9,2022,March,556,556,556,556,556
10,2022,May,521,521,521,521,521
12,2022,September,463,463,463,463,463
4,2022,August,453,453,453,453,453
3,2022,April,430,430,430,430,430


In [47]:

# One empty word -> count mapping per participant.
vocabulary = {person: {} for person in people}
    

In [48]:
from typing import List


def stripWord(chars: List[str], word: str):
    """Return ``word`` with every occurrence of each string in ``chars`` removed."""
    cleaned = word
    for unwanted in chars:
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

In [49]:
def countWords(row):
    """Add the words of one message to its sender's entry in ``vocabulary``.

    ``row`` must provide ``row['Words']`` (an iterable of tokens) and
    ``row['Person']`` (a key of the module-level ``vocabulary`` dict).
    Punctuation is removed from each token before counting. The function
    mutates ``vocabulary`` and returns ``None``.
    """
    # Built once per message rather than once per word.
    punctuation = {',', '!', '?', ':', ';', '.'}
    person_vocabulary = vocabulary[row['Person']]

    for word in row['Words']:
        word = stripWord(punctuation, word)
        # A token made only of punctuation (e.g. "?") becomes empty once
        # stripped; it is not a word and must not be counted.
        if not word:
            continue
        person_vocabulary[word] = person_vocabulary.get(word, 0) + 1
        

In [50]:
# Feed every text message to countWords, row by row. countWords only updates
# `vocabulary` and returns None, hence the Series of None displayed below.
# Counts accumulate: re-running this cell without re-creating `vocabulary`
# counts every message again.
chat_no_media_df.apply(countWords, axis=1)

0       None
1       None
2       None
3       None
4       None
        ... 
7265    None
7266    None
7267    None
7268    None
7269    None
Length: 6146, dtype: object

In [51]:
# Show the per-participant word counts.
vocabulary

{'Babila Mutia': {"I'm": 86,
  'back': 15,
  '🌚': 253,
  'No': 55,
  'I': 275,
  "couldn't": 5,
  'the': 535,
  'screen': 4,
  'was': 211,
  'too': 41,
  'expensive': 2,
  'it': 293,
  'same': 21,
  'price': 1,
  'as': 20,
  'new': 14,
  'phone': 9,
  'just': 146,
  'bought': 7,
  'one': 52,
  'It': 24,
  '150k': 1,
  'A': 3,
  'samsung': 1,
  's10': 1,
  '😂😂😂😂': 3,
  'ha': 29,
  'ah*': 1,
  'tsiups': 136,
  '😂😂😂': 1,
  'Well': 7,
  "it's": 158,
  'not': 77,
  'Yes': 74,
  'But': 74,
  'they': 141,
  'have': 106,
  'nt': 87,
  'yet': 15,
  'started': 17,
  'paying': 2,
  'me': 74,
  'saved': 3,
  'up': 15,
  'money': 12,
  'Its': 8,
  '200k': 1,
  'for': 87,
  'now': 23,
  'yes': 145,
  'still': 32,
  'working': 2,
  'there': 62,
  'lost': 4,
  'all': 60,
  'my': 31,
  'stickers': 1,
  '😭😭😭😭': 2,
  'tsuips': 1,
  "You'll": 10,
  'send': 16,
  'of': 146,
  'yours': 1,
  'n': 1,
  '😹😹😹😹': 210,
  'why': 74,
  'are': 107,
  'people': 47,
  'like': 125,
  'this': 49,
  '😹': 505,
  'rien': 1

In [60]:
# Flatten the nested {person: {word: count}} mapping into one
# [person, word, count] row per distinct word of each participant.
series = [
    [person, word, count]
    for person, word_counts in vocabulary.items()
    for word, count in word_counts.items()
]

wordCount_df = pd.DataFrame(series, columns=['Person', 'Word', 'Count'])
wordCount_df

Unnamed: 0,Person,Word,Count
0,Babila Mutia,I'm,86
1,Babila Mutia,back,15
2,Babila Mutia,🌚,253
3,Babila Mutia,No,55
4,Babila Mutia,I,275
...,...,...,...
6109,ST,username,1
6110,ST,underage,1
6111,ST,receive,1
6112,ST,bank,1


In [68]:
# The 25 most frequent words of the first participant.
first_person_words = wordCount_df[wordCount_df['Person'] == people[0]]
first_person_words.sort_values(by='Count', ascending=False).reset_index(drop=True).head(25)

Unnamed: 0,Person,Word,Count
0,Babila Mutia,the,535
1,Babila Mutia,😹,505
2,Babila Mutia,😹😹😹,362
3,Babila Mutia,😹😹,348
4,Babila Mutia,it,293
5,Babila Mutia,I,275
6,Babila Mutia,that,265
7,Babila Mutia,to,261
8,Babila Mutia,🌚,253
9,Babila Mutia,you,235


In [69]:
# The 25 most frequent words of the second participant.
second_person_words = wordCount_df[wordCount_df['Person'] == people[1]]
second_person_words.sort_values(by='Count', ascending=False).reset_index(drop=True).head(25)

Unnamed: 0,Person,Word,Count
0,ST,the,627
1,ST,I,349
2,ST,to,325
3,ST,it,266
4,ST,a,264
5,ST,you,259
6,ST,was,250
7,ST,that,247
8,ST,is,227
9,ST,and,203
