In [1]:
import re
from collections import Counter
from datetime import datetime, timedelta

import dateparser
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
plt.style.use('ggplot')

Next, we load `Chat.txt` into Python and parse it line by line into a date/time, a sender name, and the message content, using the code below:

In [2]:
# A new message starts with "M/D/YY, HH:MM - ", optionally followed by
# "Sender: "; everything after that is the message text.
MESSAGE_PATTERN = re.compile(
    r'^(\d{1,2})\/(\d{1,2})\/(\d\d), (24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]) - ((\S[^:]*?): )?(.*)$'
)


def parse_chat_lines(lines):
    """Parse exported chat lines into columns of timestamps, names and texts.

    Parameters
    ----------
    lines : iterable of str
        Raw lines of the export, each optionally ending in a newline.

    Returns
    -------
    dict
        ``{'DateTime': [...], 'Name': [...], 'Content': [...]}`` with one
        entry per message. Lines that do not start a new message are appended
        to the previous message's content, separated by a newline. Messages
        without a sender get the name ``"{undefined}"``.
    """
    parsed = {'DateTime': [], 'Name': [], 'Content': []}
    for line in lines:
        matches = MESSAGE_PATTERN.match(line)
        if matches:
            month, day, year, clock = matches.group(1, 2, 3, 4)
            hour, minute = int(clock[0:2]), int(clock[3:])
            # The pattern accepts "24:00" but datetime() only allows hours
            # 0-23, so interpret it as midnight at the start of the next day.
            rolls_over = hour == 24
            stamp = datetime(
                int(year) + 2000,
                int(month),
                int(day),
                hour=0 if rolls_over else hour,
                minute=minute,
            )
            if rolls_over:
                stamp += timedelta(days=1)
            parsed['DateTime'].append(stamp)
            parsed['Name'].append(matches.group(6) or "{undefined}")
            parsed['Content'].append(matches.group(7))
        elif parsed['Content']:
            # Continuation of a multi-line message. Strip only the line
            # terminator: slicing off the last character would eat real text
            # when the final line of the file has no trailing newline.
            parsed['Content'][-1] += "\n" + line.rstrip("\n")
    return parsed


with open('Chat.txt', "r", encoding='utf-8') as infile:
    output_Data = parse_chat_lines(infile)


In [3]:
# Build the message table from the parsed columns and preview its first rows.
df = pd.DataFrame(data=output_Data)
df.head(n=5)

Unnamed: 0,DateTime,Name,Content
0,2019-11-15 06:31:00,{undefined},Messages to this group are now secured with en...
1,2019-11-11 09:41:00,{undefined},"Analoh RESAGRATIA created group ""RESA"""
2,2019-11-15 06:31:00,{undefined},You joined using this group's invite link
3,2019-11-15 07:06:00,AcidiQ,Good morning Fam.\nI just went through the Bud...
4,2019-11-15 07:29:00,+234 805 230 5080,Sounds great


Remove the rows whose Name is the placeholder `{undefined}`: the parser assigns it to lines that have no sender, and these are system messages.

In [4]:
print("length of df before:{}".format(len(df)))
# System messages carry the exact placeholder "{undefined}" as Name. Compare
# for equality so that a real sender whose name merely contains "undefined"
# is kept, and take a copy so that later column assignments work on an
# independent frame instead of a slice (avoids SettingWithCopyWarning).
df = df[df["Name"] != "{undefined}"].copy()
print("length of df after:{}".format(len(df)))

length of df before:1489
length of df after:1465


In [5]:
# Preview the first ten rows of the message table.
df.head(n=10)

Unnamed: 0,DateTime,Name,Content
3,2019-11-15 07:06:00,AcidiQ,Good morning Fam.\nI just went through the Bud...
4,2019-11-15 07:29:00,+234 805 230 5080,Sounds great
5,2019-11-15 08:09:00,Oluwatobi Williams RESAGRATIA,Pleasure to have you join us
6,2019-11-15 08:32:00,AcidiQ,Thank you Boss.\nI've accessed the Doc file. H...
7,2019-11-15 08:34:00,+234 907 199 5587,"Good morning data fellows, can anyone please s..."
8,2019-11-15 08:40:00,Oluwatobi Williams RESAGRATIA,Yes. Any MDA that is left *Open* can be worked...
9,2019-11-15 08:42:00,Oluwatobi Williams RESAGRATIA,Do you mean the function that is used to extra...
10,2019-11-15 08:42:00,AcidiQ,"Ok, great."
12,2019-11-15 09:14:00,+234 907 199 5587,"Ok, thanks"
15,2019-11-15 10:06:00,Analoh RESAGRATIA,"There's a collaboration spreadsheet available,..."


In [6]:
# Show the messages whose text contains a newline.
has_newline = df["Content"].str.contains('\n')
df[has_newline]

Unnamed: 0,DateTime,Name,Content
3,2019-11-15 07:06:00,AcidiQ,Good morning Fam.\nI just went through the Bud...
6,2019-11-15 08:32:00,AcidiQ,Thank you Boss.\nI've accessed the Doc file. H...
9,2019-11-15 08:42:00,Oluwatobi Williams RESAGRATIA,Do you mean the function that is used to extra...
29,2019-11-18 09:14:00,Oluwatobi Williams RESAGRATIA,"Good morning tribe,\n\nThe third tutorial is f..."
30,2019-11-18 09:17:00,Oluwatobi Williams RESAGRATIA,"If you're going through the tutorials, don't f..."
...,...,...,...
1471,2020-01-06 19:08:00,+234 806 222 9997,"Yeah, most likely. Or it depends. It depends o..."
1473,2020-01-06 19:09:00,+234 706 900 2818,This was from gapminder.com\nIt was surprising...
1475,2020-01-06 19:11:00,+234 806 222 9997,Ohh! That's good.\nMissing data comes with its...
1477,2020-01-06 21:46:00,Oluwatobi Williams RESAGRATIA,I would advise a log scale too\n\nScatter plot...


In [7]:
# Flatten multi-line messages: every newline is replaced by a single space.
df["Content"] = df["Content"].replace(to_replace='\n', value=' ', regex=True)

In [8]:
# Check the replacement: list any message that still contains a newline.
df.loc[df["Content"].str.contains('\n')]

Unnamed: 0,DateTime,Name,Content


Create columns for Date, Time, Hour, weekday, Word Count and Letter Count.

In [9]:
# Calendar date of each message, taken through the vectorised .dt accessor
# (the same way the Hour column is derived) instead of a Python-level loop.
df['Date'] = df['DateTime'].dt.date
df["Date"]

3       2019-11-15
4       2019-11-15
5       2019-11-15
6       2019-11-15
7       2019-11-15
           ...    
1484    2020-01-07
1485    2020-01-07
1486    2020-01-07
1487    2020-01-07
1488    2020-01-07
Name: Date, Length: 1465, dtype: object

In [10]:
# Time of day of each message, taken through the vectorised .dt accessor
# (the same way the Hour column is derived) instead of a Python-level loop.
df['Time'] = df['DateTime'].dt.time
df["Time"].shape

(1465,)

In [11]:
# Hour of the day (0-23) in which each message was sent.
df['Hour'] = df['DateTime'].dt.hour
df['Hour']

3        7
4        7
5        8
6        8
7        8
        ..
1484     5
1485     9
1486    20
1487    20
1488    21
Name: Hour, Length: 1465, dtype: int64

In [12]:
# Name of the weekday of each message, computed in one vectorised call
# rather than a row-by-row apply.
df['weekday'] = df['DateTime'].dt.day_name()

In [13]:
# Count whitespace-separated tokens. Counting spaces and adding one would
# over-count wherever two spaces are adjacent (e.g. where a blank line was
# turned into spaces) and would report one word for an empty message.
df['Word_Count'] = df['Content'].str.split().str.len()
# Letter_Count is the total number of characters in the message, including
# spaces and punctuation.
df['Letter_Count'] = df['Content'].str.len()

In [14]:
# Renumber the rows from 0 and discard the old index labels.
df = df.reset_index(drop=True)

In [15]:
# Preview the first rows of the table with the new columns.
df.head(n=5)

Unnamed: 0,DateTime,Name,Content,Date,Time,Hour,weekday,Word_Count,Letter_Count
0,2019-11-15 07:06:00,AcidiQ,Good morning Fam. I just went through the Budg...,2019-11-15,07:06:00,7,Friday,25,150
1,2019-11-15 07:29:00,+234 805 230 5080,Sounds great,2019-11-15,07:29:00,7,Friday,2,12
2,2019-11-15 08:09:00,Oluwatobi Williams RESAGRATIA,Pleasure to have you join us,2019-11-15,08:09:00,8,Friday,6,28
3,2019-11-15 08:32:00,AcidiQ,Thank you Boss. I've accessed the Doc file. Ho...,2019-11-15,08:32:00,8,Friday,20,108
4,2019-11-15 08:34:00,+234 907 199 5587,"Good morning data fellows, can anyone please s...",2019-11-15,08:34:00,8,Friday,24,158


In [16]:
# Write the table to a CSV file in the working directory.
df.to_csv(path_or_buf="WhatsappChat.csv")