In [None]:
pip install plotly

In [None]:
import re
import pandas as pd
import os
import datetime as date
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import emoji
from collections import Counter
import plotly.express as px

In [None]:
pip install --upgrade Pillow

In [None]:
pip install wordcloud==1.8.0

In [None]:
pip install emoji==1.6.3

In [None]:
# Path of the exported chat text file, relative to the notebook's working directory.
chat_path='_chat.txt'
# Raw fixed-width preview of the file; the structured parse is done by textTo_df further down.
data=pd.read_fwf(chat_path)

In [None]:
# Show the first rows of the raw fixed-width preview.
data.head()

In [None]:
# `data` comes from pd.read_fwf, which uses the first line as a header and
# counts every physical line (including continuation lines of multi-line
# messages) as a row, so data.shape[0] is not the number of messages.
# Count the lines that actually carry a "[timestamp] author:" header instead.
with open(chat_path, encoding="utf-8") as chat_file:
    total_messages = sum(
        1 for line in chat_file if re.search(r'\[.*?\] .+?:', line)
    )
print(f"Total messages: {total_messages}")

In [None]:
def textTo_df(txt):
    """Parse an exported WhatsApp chat text file into a DataFrame.

    A message starts on a line shaped like ``[<timestamp>] <author>: <text>``.
    Lines without such a header are continuation lines of a multi-line
    message and are appended (newline-separated) to the previous message
    instead of being dropped.

    Parameters
    ----------
    txt : str or path-like
        Path of the UTF-8 encoded chat export.

    Returns
    -------
    pandas.DataFrame
        Columns ``datetime`` (timestamp text without the brackets),
        ``author`` and ``message``; one row per message whose author is
        non-empty, with a fresh 0..n-1 index.
    """
    # Optional invisible marks (BOM / direction marks), then "[timestamp] author:".
    header = re.compile(r'^[\ufeff\u200e\u200f]*\[(.*?)\] (.*?):')

    with open(txt, encoding="utf-8") as file:
        rows = file.read().splitlines()

    records = []
    for row in rows:
        found = header.match(row)
        if found:
            # Message text is everything after the first ": " (empty if absent).
            text = row.partition(': ')[2]
            records.append([found.group(1).strip(), found.group(2), text])
        elif records:
            # Continuation of the previous message; lines before the first
            # header have nothing to attach to and are skipped.
            records[-1][2] += '\n' + row

    df = pd.DataFrame(records, columns=['datetime', 'author', 'message'])
    df = df[df.author != ''].reset_index(drop=True)
    return df
# Parse the chat export into the DataFrame used by the analysis cells below.
whatsapp_df=textTo_df(chat_path)

In [None]:
# Turn the raw timestamp text into real datetimes.
# The format expects day/month/2-digit-year and a 12-hour clock with seconds,
# e.g. "12/03/21, 10:15:30 PM".
whatsapp_df['datetime'] = pd.to_datetime(
    whatsapp_df['datetime'],
    format='%d/%m/%y, %I:%M:%S %p',
)

In [None]:
# Count whitespace-separated words per message. Splitting on any whitespace
# (rather than on a single ' ') gives 0 for an empty message and does not
# count the empty strings produced by repeated spaces.
whatsapp_df['Word_Count'] = whatsapp_df["message"].str.split().str.len()

In [None]:
# Inspect the first 15 parsed messages.
whatsapp_df.head(15)

In [None]:
# Remove media placeholder messages from the data

In [None]:
# A placeholder message is the full text "<Media omitted>", so an equality
# test against "<Media omitted" (no closing '>') never matches. Use a literal
# substring test, which also tolerates invisible marks around the placeholder.
is_media = whatsapp_df['message'].str.contains("<Media omitted", regex=False)
media = whatsapp_df[is_media]
whatsapp_df.drop(media.index, inplace=True)

In [None]:
# Renumber the rows 0..n-1 after the media rows were removed, then report the size.
whatsapp_df = whatsapp_df.reset_index(drop=True)
print(whatsapp_df.shape)

In [None]:
# Distinct authors, in order of first appearance in the chat
author = whatsapp_df['author'].unique()
print('Authors: ', author)

In [None]:
#Timeline of the data
start_date = whatsapp_df['datetime'].min()
end_date = whatsapp_df['datetime'].max()

# `diff` stays a Timedelta because a later cell reads `diff.days`.
diff = end_date - start_date
# The message reports hours, so convert the span explicitly instead of
# printing the Timedelta itself (which renders as "N days HH:MM:SS").
diff_hours = diff.total_seconds() / 3600
print('Timeline- from {} till {} i.e., approx. {:.0f} hours'.format(start_date, end_date, diff_hours))

In [None]:
#Average messages per day
# `diff.days` is 0 when the whole chat spans less than 24 hours; treat that as
# one day so the division cannot fail with ZeroDivisionError.
days_covered = max(diff.days, 1)
avg = whatsapp_df.shape[0] / days_covered
print("Average {} messages per day".format(int(avg)))

In [None]:
#Most active time
whatsapp_df_copy = whatsapp_df.copy()
whatsapp_df_copy["hours"] = whatsapp_df_copy['datetime'].dt.hour
# Count rows per hour of day. Reindex to all 24 hours (count 0 where nothing
# was sent) so a plot with 24 fixed hour labels lines up with the bars.
times_df = (
    whatsapp_df_copy.groupby('hours')
    .count()
    .reindex(pd.RangeIndex(24, name='hours'), fill_value=0)
    .reset_index()
    .sort_values(by='hours')
)

In [None]:
matplotlib.rcParams['font.size']=15
matplotlib.rcParams['figure.figsize']=(20,10)
sns.set_style('darkgrid')
plt.title('Most activity hour')
time_plot=sns.barplot(x="hours",y="message",data=times_df,dodge=False)
labels=["12 AM","1 AM","2 AM","3 AM","4 AM","5 AM","6 AM","7 AM","8 AM","9 AM","10 AM","11 AM","12 PM","1 PM","2 PM","3 PM","4 PM","5 PM","6 PM","7 PM","8 PM","9 PM","10 PM","11 PM"]
# The bar plot draws one bar per hour present in times_df at positions
# 0..k-1, so label each position with the hour it actually represents rather
# than assuming all 24 hours are present.
present_hours=sorted(times_df["hours"].unique())
plt.xticks(range(len(present_hours)),labels=[labels[int(h)] for h in present_hours],rotation=30)
plt.show()

In [None]:
# most used emoji
whatsapp_df_copy_2=whatsapp_df.copy()
emoji_count=Counter()
# emoji<2.0 exposes UNICODE_EMOJI_ENGLISH; emoji>=2.0 replaced it with
# EMOJI_DATA. Both are dicts keyed by the emoji string.
emoji_table=getattr(emoji,"UNICODE_EMOJI_ENGLISH",None) or emoji.EMOJI_DATA
# Regex alternation takes the first alternative that matches, so list the
# longest sequences first; otherwise a multi-codepoint emoji (skin tone, flag,
# ZWJ sequence) can be counted as its shorter prefix.
emoji_list=sorted(emoji_table.keys(),key=len,reverse=True)
r=re.compile('|'.join(re.escape(p) for p in emoji_list))
for text in whatsapp_df_copy_2["message"]:
    emoji_count.update(r.findall(text))

In [None]:
# Table of the ten most frequent emojis. Build the frame in one step rather
# than with chained assignment (emoji_df.emoji[i] = ...), which raises
# SettingWithCopyWarning and does not write through under pandas copy-on-write.
top_emojis=emoji_count.most_common(10)
# Keep the table at ten rows, padding with ('', 0) when fewer emojis exist.
padding=[('',0)]*(10-len(top_emojis))
emoji_df=pd.DataFrame(top_emojis+padding,columns=['emoji','count_emojis'])

In [None]:
# Display the table of most frequent emojis.
emoji_df

In [None]:
# Per-author statistics: message count and mean words per message
whatsapp_data = whatsapp_df.copy()
l = whatsapp_data.author.unique()
emoji_dict = {}
for name in l:
    # All rows written by this author
    req_df = whatsapp_data.loc[whatsapp_data["author"] == name]
    message_total = len(req_df)
    print(f'Stats of {name} -')
    print('Messages Sent', message_total)
    words_per_message = req_df['Word_Count'].sum() / message_total
    print('Words per message', words_per_message)

In [None]:
#wordcloud
from wordcloud import WordCloud, STOPWORDS

In [None]:
whatsapp_df_copy_3 = whatsapp_df.copy()
stopwords = set(STOPWORDS)
# One long string holding every message, separated by single spaces
word = " ".join(whatsapp_df_copy_3["message"])

# Configure the cloud first, then feed it the text (generate returns the same object).
cloud_builder = WordCloud(
    width=500,
    height=500,
    stopwords=stopwords,
    background_color="white",
    min_font_size=10,
)
wordcloud = cloud_builder.generate(word)

plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
whatsapp_df_copy_4=whatsapp_df.copy()
# The parsed frame uses lower-case column names; asking pd.DataFrame for
# "DateTime"/"Author"/"Message" directly yields all-NaN columns. Select the
# real columns, then rename to the capitalised names.
df = whatsapp_df_copy_4[['datetime', 'author', 'message']].rename(
    columns={'datetime': 'DateTime', 'author': 'Author', 'message': 'Message'}
)
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# SentimentIntensityAnalyzer needs the VADER lexicon; fetch it only if missing.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon', quiet=True)
sentiments = SentimentIntensityAnalyzer()
# Score each message once and reuse the result for all three columns.
scores = pd.DataFrame(
    [sentiments.polarity_scores(text) for text in whatsapp_df_copy_4["message"]],
    columns=["neg", "neu", "pos", "compound"],
    index=whatsapp_df_copy_4.index,
)
whatsapp_df_copy_4["Positive"] = scores["pos"]
whatsapp_df_copy_4["Negative"] = scores["neg"]
whatsapp_df_copy_4["Neutral"] = scores["neu"]
print(whatsapp_df_copy_4.head())

In [None]:
# Chat-wide totals of the per-message sentiment scores:
# x = positive, y = negative, z = neutral.
x=sum(whatsapp_df_copy_4["Positive"])
y=sum(whatsapp_df_copy_4["Negative"])
z=sum(whatsapp_df_copy_4["Neutral"])

def sentiment_score(a,b,c):
    """Print which of three totals is strictly the largest.

    Prints "Positive" when ``a`` exceeds both others, "Negative" when ``b``
    exceeds both others, and "Neutral" in every remaining case (including
    ties for the top value).
    """
    if a > b and a > c:
        print("Positive")
        return
    if b > a and b > c:
        print("Negative")
        return
    print("Neutral")
# Report the dominant sentiment across the whole chat from the three totals.
print("Overall sentiment is :")
sentiment_score(x, y, z)