
# WhatsApp and Telegram Chat Analysis.

This notebook visualizes the words the other party uses most, the hours at which they text, and the overall sentiment of their messages.

# Files needed:

1. Exported Telegram Chat JSON file. 
	[How to export Telegram chat](https://www.maketecheasier.com/export-telegram-chat-history/)

2. Exported WhatsApp Chat text file.
	[How to export WhatsApp chat](https://www.marca.com/en/lifestyle/how-to/2021/11/10/618bbe63ca4741b2138b4608.html)


In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from happytransformer import HappyTextClassification
import re
from json import load
import matplotlib.pyplot as plt

In [None]:
# Regex that strips emoji, pictographs and other non-text symbols from a
# message before it is tokenised or classified.
#
# The character class is intentionally left as broad as it was; the only
# change to the pattern is that a range which was listed twice now appears
# once, which does not alter what is matched.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    # NOTE: this range is very wide. Besides enclosed characters it also
    # covers CJK ideographs, kana and Hangul, so text written in those
    # scripts is removed as well.
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"  # gesture / person emoji
    "\U00010000-\U0010ffff"  # every code point outside the BMP
    "\u2640-\u2642"  # female sign .. male sign
    "\u2600-\u2B55"  # misc. symbols through the heavy large circle
    "\u200d"  # zero-width joiner (glues emoji sequences together)
    "\u23cf"  # eject symbol
    "\u23e9"  # fast-forward symbol
    "\u231a"  # watch
    "\ufe0f"  # variation selector-16 (emoji presentation)
    "\u3030"  # wavy dash
    "]+",
    re.UNICODE,
)

In [None]:
# User settings: fill these in before running the cells below.

# WhatsApp: the other party's name as saved in your contacts, and the path
# to the exported chat text file.
contact_on_wa = ""
exported_wa_file = ""

# Telegram: the other party's name as saved in your contacts, and the path
# to the exported chat JSON file.
contact_on_telegram = ""
exported_tg_file = ""

In [None]:
# Accumulators filled while the chat exports are parsed. Each one comes as a
# WhatsApp ("wa") / Telegram ("tg") pair of independent lists.

# Whole message texts, used for sentiment analysis.
texts_wa, texts_tg = [], []

# Individual words, used for word-frequency analysis.
corpus_wa, corpus_tg = [], []

# Hour of each message, used for the activity-by-hour charts.
hours_wa, hours_tg = [], []



# Text Processing

In [None]:
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def handleContent(text, pltfm):
    """Record one message for sentiment and word-frequency analysis.

    Emoji and symbol characters matched by ``emoji_pattern`` are removed and
    the text is lower-cased. The whole message is appended to the platform's
    text list, and every purely alphabetic word that is not an English stop
    word is appended to the platform's corpus list.

    Args:
        text: The raw message body.
        pltfm: ``"wa"`` selects the WhatsApp lists; any other value selects
            the Telegram lists.
    """
    lowered = emoji_pattern.sub(r'', text).lower()
    is_whatsapp = pltfm == "wa"

    (texts_wa if is_whatsapp else texts_tg).append(lowered)

    stop_words = _english_stopwords()
    target_corpus = corpus_wa if is_whatsapp else corpus_tg

    # Split on any whitespace, not just " ": a token with a newline attached
    # (e.g. the last word of a line read from a file) would otherwise fail
    # isalpha() and be silently dropped.
    target_corpus.extend(
        word
        for word in lowered.split()
        if word.isalpha() and word not in stop_words
    )
                    

In [None]:
# Parse WhatsApp Content
#
# Every line of the export that contains "<contact>:" is treated as a message
# from the contact. The hour is taken from the part of the line before that
# marker, and the message body is whatever follows it.
#
# NOTE(review): only the digits before the first ":" of the time are kept, so
# a 12-hour export (AM/PM) would fold morning and evening hours together.
# Confirm the export uses 24-hour time.
sender_marker = f"{contact_on_wa}:"

# WhatsApp exports contain emoji and other non-ASCII text; read them as UTF-8
# explicitly rather than relying on the platform's default encoding.
with open(exported_wa_file, encoding="utf-8") as chat_txt:
    for ctx in chat_txt:
        # partition() keeps everything after the first marker, so a message
        # that itself contains "<contact>:" is not truncated. Lines that only
        # mention the contact's name (no marker) are skipped instead of
        # raising IndexError.
        header, marker_found, text = ctx.partition(sender_marker)
        if not marker_found:
            continue

        # The header is expected to look like "<date>, <time> - ". Skip
        # lines whose prefix has no "<date>, <time>" part, such as
        # continuation lines of a multi-line message.
        timestamp_fields = header.split("-")[0].split(",")
        if len(timestamp_fields) < 2:
            continue

        hour = timestamp_fields[1].strip().split(":")[0]
        hours_wa.append(hour)

        # Media placeholders still count towards activity by hour, but
        # carry no text worth analysing.
        if "<Media omitted>" not in text:
            handleContent(text, "wa")
                

In [None]:
# Parse Telegram Content
#
# Loads the exported JSON and, for every message sent by the contact, records
# the hour it was sent and passes its text on for analysis.

# Read the export as UTF-8 explicitly rather than relying on the platform's
# default encoding.
with open(exported_tg_file, encoding="utf-8") as chat_json:
    chat = load(chat_json)

for message in chat['messages']:
    if message.get("from") != contact_on_telegram:
        continue

    text = message.get('text')

    # Only plain-string texts are analysed.
    # NOTE(review): Telegram appears to export formatted messages (links,
    # bold, ...) as a list of fragments; those are skipped here. Confirm
    # whether they should be flattened and included.
    if not isinstance(text, str):
        continue

    # The date is split on "T" and ":" to pull out the hour field.
    hours_tg.append(message['date'].split("T")[1].split(":")[0])

    # Messages with no text still count towards activity by hour, but an
    # empty string would only add noise to the sentiment results.
    if text.strip():
        handleContent(text, "tg")


# Data Visualization

In [None]:
# Ten most frequent words per platform, drawn side by side.
fig, axes = plt.subplots(nrows=1, ncols=2)
fig.tight_layout(pad=2.0)
ax_wa, ax_tg = axes

# WhatsApp on the left.
corpus_df_wa = pd.DataFrame(corpus_wa, columns=["word"])
top_words_wa = corpus_df_wa.value_counts()[:10]
top_words_wa.plot(kind="barh", label="WhatsApp", ax=ax_wa, color="green")

# Telegram on the right.
corpus_df_tg = pd.DataFrame(corpus_tg, columns=["word"])
top_words_tg = corpus_df_tg.value_counts()[:10]
top_words_tg.plot(kind="barh", label="Telegram", ax=ax_tg)

fig.legend()


In [None]:
# Ten most frequent words across both platforms combined.
corpus_df_all = pd.concat([corpus_df_wa, corpus_df_tg])
top_words_all = corpus_df_all.value_counts()[:10]
top_words_all.plot(kind="barh", title="Total Words", color="red")

In [None]:
# Messages per hour for each platform, drawn side by side.
# Bars follow value_counts() order, i.e. busiest hour first.
hours_df_wa = pd.DataFrame(hours_wa, columns=["hour"])
hours_df_tg = pd.DataFrame(hours_tg, columns=["hour"])

fig, axes = plt.subplots(nrows=1, ncols=2)
fig.tight_layout(pad=2.0)
ax_wa, ax_tg = axes

hour_counts_wa = hours_df_wa.value_counts()
hour_counts_wa.plot(kind="barh", color="green", label="WhatsApp", ax=ax_wa)

hour_counts_tg = hours_df_tg.value_counts()
hour_counts_tg.plot(kind="barh", label="Telegram", ax=ax_tg)

fig.legend()


In [None]:
# Ten busiest hours across both platforms combined.
hours_df_all = pd.concat([hours_df_wa, hours_df_tg])
busiest_hours = hours_df_all.value_counts()[:10]
busiest_hours.plot(kind="barh", title="Total Hours", color="red")

In [None]:
# Text classifier used for the sentiment analysis below.
happy_tc = HappyTextClassification(
    model_type="DISTILBERT",
    model_name="distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
# Classify every recorded message and keep only the predicted label.
# WhatsApp messages are processed first, then Telegram.
sentiments_wa = [happy_tc.classify_text(message).label for message in texts_wa]
sentiments_tg = [happy_tc.classify_text(message).label for message in texts_tg]


In [None]:
# Share of each sentiment label per platform, as side-by-side pie charts.
sentiments_df_wa = pd.DataFrame(sentiments_wa)
sentiments_df_tg = pd.DataFrame(sentiments_tg)

fig, axes = plt.subplots(nrows=1, ncols=2)
fig.tight_layout(pad=2.0)
ax_wa, ax_tg = axes

# Options shared by both pies; autopct prints each slice's percentage.
pie_options = dict(kind="pie", autopct='%.2f%%')

sentiments_df_wa.value_counts().plot(label="WhatsApp", ax=ax_wa, **pie_options)
sentiments_df_tg.value_counts().plot(label="Telegram", ax=ax_tg, **pie_options)



In [None]:
# Share of each sentiment label across both platforms combined.
sentiments_df_all = pd.concat([sentiments_df_wa, sentiments_df_tg])
# The chart title previously read "Total Sentitment" (typo).
sentiments_df_all.value_counts()[:10].plot(kind="pie", title="Total Sentiment", autopct='%.2f%%')