# Exploratory Data Analysis of Telegram Messages

Do not forget to enter the path to your data and your telegram ID. Also, any question section can be customized in any way to reach appropriate results.

In [None]:
# install dependencies
# %pip install polyglot pyicu pycld2
# %pip install emoji==1.6.3
# %pip install text2emotion
# %pip install googletrans==3.1.0a0

In [None]:
import pandas as pd

# import and setup polyglot (lang detector)
from polyglot.detect import Detector
from polyglot.detect.base import logger as polyglot_logger
polyglot_logger.setLevel("ERROR")

# import translator
from googletrans import Translator

# emotion analyzer
import text2emotion as te

# import nltk lib
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# standard libs
import json
import ast
import string
from datetime import datetime, date, time

In [None]:
# CSV with the merged messages; replace <PATH_TO_DATA> with your own directory.
DIALOGS_MERGED_DATA_PATH = "<PATH_TO_DATA>/dialogs_data_all.csv"
# CSV with the merged dialog/member metadata; same placeholder applies.
DIALOGS_META_MERGED_DATA_PATH = "<PATH_TO_DATA>/dialogs_users_all.csv"

In [None]:
# Load the messages export and the dialog metadata export, in that order.
df, df_meta = (
    pd.read_csv(csv_path)
    for csv_path in (DIALOGS_MERGED_DATA_PATH, DIALOGS_META_MERGED_DATA_PATH)
)

# General Info

Total number of pulled messages:

In [None]:
# Number of rows that carry a (non-null) message id.
df['id'].notna().sum()

Get number of dialogs:

In [None]:
# Distinct, non-null dialog ids present in the metadata.
df_meta['dialog_id'].dropna().unique().size

Telegram ID:

In [None]:
# Your own Telegram ID as a string; the cells below select "sent" messages
# as the rows whose `from_id` contains this value.
user_id = '<telegram ID>'

# Data pre-processing

In [None]:
# Parse the whole `date` column in one vectorised call instead of invoking
# pd.to_datetime once per row via .apply (same result, far faster on large exports).
# NOTE(review): assumes all rows share one timestamp format — confirm for your export.
df['date'] = pd.to_datetime(df['date'])
# Dialog ids are kept as strings here; later cells cast them back to int
# when they need the sign of the id.
df['dialog_id'] = df['dialog_id'].astype(str)

# Meta info analysis

## 1. What are the most common languages of chat names?

In [None]:
def detect_lang(text: str):
    """Return the name of the first language polyglot reports for *text*.

    Any failure during detection (including input the detector cannot
    handle) is swallowed on purpose and reported as ``'undefined'``.
    """
    try:
        detector = Detector(text)
        best_guess = detector.languages[0]
        lang_name = best_guess.name
    except Exception:
        lang_name = 'undefined'
    return lang_name

# Keep one metadata row per dialog so every chat name is counted once.
unique_dialogs = df_meta.drop_duplicates(subset=['dialog_id'])
name_languages = unique_dialogs['name'].apply(detect_lang)

# Bar chart of the 7 most frequent detected languages.
name_languages.value_counts().head(7).plot.bar(
    title='What language is the nickname written in?',
    xlabel='Language',
    ylabel='Number',
)

## 2. Total number of group and private chats

In [None]:
# Take the first row of every dialog and read its chat type.
chat_types = df_meta.groupby('dialog_id').first()['type']

chat_types.value_counts().plot.pie(
    autopct='%1.0f%%',
    ylabel=None,
    title="Proportion of chat types",
)

## 3. Which chats have the largest number of "familiar" people?

In [None]:
# All distinct dialog ids found in the metadata.
chat_ids = df_meta['dialog_id'].unique()
# Positive ids only; the cells below treat negative ids as group chats.
condition = chat_ids > 0
# Ids of "familiar" people: dialogs with a positive id.
familiar_ppl = chat_ids[condition]

In [None]:
# `users` holds a Python-literal dict as text; parse it and keep its 'user_id'.
df_meta['user_id'] = [ast.literal_eval(raw_user)['user_id'] for raw_user in df_meta['users']]

In [None]:
# Vectorised membership test: `isin` replaces the per-row `x in ndarray`
# check, which rescanned the whole id array for every member.
df_meta['is_familiar'] = df_meta['user_id'].isin(familiar_ppl)

# Negative dialog ids are the group chats; count familiar vs. total members
# per chat and keep the 10 chats with the most familiar members.
group_members = df_meta[df_meta['dialog_id'] < 0]
member_data = (
    group_members.groupby('dialog_id')['is_familiar']
    .agg(['sum', 'count'])
    .sort_values(by='sum', ascending=False)
    .head(10)
    .rename(columns={'sum': 'Familiar members', 'count': 'Total members'})
)

# First name seen per dialog, used as the bar label.
chat_names = df_meta.groupby('dialog_id')['name'].first()
graph = member_data.join(chat_names, on='dialog_id', how='left')

graph.plot.bar(x='name', title='Number of Familiar members in group chats', ylabel='members')

## 4. Which group chats have the most members?

In [None]:
top_n = 5

# One row per group chat (negative dialog id): its first name and the number
# of metadata rows with a non-null `type`, used here as the member count.
group_rows = df_meta.loc[df_meta['dialog_id'] < 0]
chat_sizes = group_rows.groupby('dialog_id').agg(
    name=('name', 'first'),
    type=('type', 'count'),
)
largest_chats = chat_sizes.sort_values(by='type', ascending=False).head(top_n)

largest_chats.plot.bar(
    title=f'Top {top_n} chats by number of members', x='name', ylabel='Number of members', legend=False
)

## 5. How many people are in all the group chats?

In [None]:
# count every membership row that belongs to a chat of type 'Group'
int((df_meta['type'] == 'Group').sum())

In [None]:
# number of distinct user ids among the 'Group' membership rows
group_member_ids = df_meta.loc[df_meta['type'] == 'Group', 'user_id']
group_member_ids.drop_duplicates().shape[0]

# Messages analysis

## 1. What is the distribution of language usage in the sent messages?

In [None]:
# Rows sent by the notebook owner (from_id contains the configured user id).
# .copy() makes this an independent frame so adding the `lang` column below
# does not write into a slice of `df` (avoids SettingWithCopyWarning).
sent_data = df[df['from_id'].str.contains(user_id, na=False)].copy()
# Detected language of each sent message ('undefined' when detection fails).
sent_data['lang'] = sent_data['message'].apply(detect_lang)

In [None]:
# pass your own languages to analyze their distribution
russian_rows = sent_data.loc[sent_data['lang'] == 'Russian']
ukrainian_rows = sent_data.loc[sent_data['lang'] == 'Ukrainian']

# Messages per calendar day for each language.
ru_sent = russian_rows.groupby(russian_rows['date'].dt.date)['id'].count()
ukr_sent = ukrainian_rows.groupby(ukrainian_rows['date'].dt.date)['id'].count()

ru_sent.plot(
    figsize=(10,5),
    label='russian',
    title='Language usage distribution in sent messages',
    xlabel='Year',
    ylabel='Number of messages',
)
# The second series is drawn on the same axes; keep the handle for the legend.
ax = ukr_sent.plot(label='Ukrainian')
ax.legend(loc='upper left')

## 2. What are the proportions of personal pronouns in sent messages?

In [None]:
# Text of every message whose from_id contains the configured user id.
is_sent_by_me = df['from_id'].str.contains(user_id, na=False)
sent_messages = df[is_sent_by_me]['message']
# Tokenize each non-null message and flatten to one token per row.
sent_tokens = sent_messages.dropna().map(word_tokenize).explode()

In [None]:
# Pronouns to report, in their Ukrainian spelling.
pronouns = ['я', 'ти', 'ми', 'він', 'вони', 'вона', 'воно']
# Russian spellings folded onto the Ukrainian ones above.
map_pronouns = {
     'мы': 'ми', 'она': 'вона', 'он': 'він', 'они': 'вони', 'оно': 'воно', 'ты': 'ти'
}

# Lower-case first so capitalised forms (e.g. a sentence-initial "Я") are
# counted as well, then apply the whole mapping in a single replace.
sent_tokens = sent_tokens.str.lower().replace(map_pronouns)

In [None]:
# reindex instead of [...] so a pronoun that never occurs is reported as 0
# rather than raising KeyError.
pronoun_counts = sent_tokens.value_counts().reindex(pronouns, fill_value=0)

pronoun_counts.plot.pie(
    autopct='%1.0f%%',
    title='Proportions of personal pronouns in sent messages',
    ylabel=None
)

## 3. Usage distribution of the words 'war' and 'invasion'

In [None]:
# Regex alternatives for "war"/"invasion" in Ukrainian, Russian and English.
words = ['війн.{1,2}', 'войн.{1,2}', 'war', 'вторгненн.{1,2}', 'вторжени.{1,2}']

# Case-insensitive so capitalised occurrences are matched too.
war_messages = df[df['message'].str.contains('|'.join(words), case=False, na=False)]

# Group the matching rows by THEIR OWN dates. The previous version grouped by
# `sent_data.date`, which silently dropped every row not present in
# `sent_data` and made this cell depend on an unrelated earlier cell.
ax = war_messages.groupby(war_messages['date'].dt.date)['id'].count().plot(
    figsize=(10,5), title='Distribution of using of words war, invasion etc.',
    xlabel='Year', ylabel='Number of messages', label='War-like word usage'
)
# Marker for 24 February 2022.
ax.scatter('2022-02-24', 1, color='red', zorder=100, label='24 February 2022')
ax.legend(loc='upper left')

## 4. Distribution of number of words in sent messages

In [None]:
sent_messages = df[df['from_id'].str.contains(user_id, na=False)]['message']
print(f'Total number of sent messages: {len(sent_messages)}')

# A token is a word unless it consists solely of punctuation characters.
# The previous `t not in string.punctuation` was a substring test, so
# multi-character punctuation tokens such as '...' were counted as words.
punctuation_chars = set(string.punctuation)

tokenized_messages = sent_messages.dropna().apply(word_tokenize)
words_per_message = tokenized_messages.apply(
    lambda tokens: sum(
        1 for token in tokens
        if not all(ch in punctuation_chars for ch in token)
    )
)

# How many messages have 0, 1, 2, ... words (first 20 distinct counts).
words_per_message.value_counts().sort_index().head(20).plot.bar(
    title='Distribution of number of words in sent messages',
    xlabel='Word count', ylabel='Number of messages'
)


## 5. The most used words in SENT messages

In [None]:
import re

# Negated character class: matches a run of characters that are NOT emoji or
# pictographic symbols. A token with no match therefore consists only of
# such symbols. Note that the broad \U000024C2-\U0001F251 range also spans
# non-emoji scripts (e.g. CJK), so tokens in those scripts are dropped too.
emoji_pattern = re.compile(
    "[^"
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002500-\U00002BEF"
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"
    "\u3030"
    # These two were written as "\u1F32F" / "\u1F44D", which Python reads as a
    # 4-digit escape followed by a literal 'F' / 'D'; 8-digit escapes are required.
    "\U0001F32F"
    "\U0001F44D"
    "]+",
    re.UNICODE,
)


def process_tokens(tokens):
    """Normalise tokens and drop the ones made up only of emoji/symbols.

    Tokens are lower-cased and 'ё' is folded to 'е'. Missing values are
    removed first (an empty token list explodes to NaN). A token is kept
    when it contains at least one character matched by ``emoji_pattern``.

    The previous implementation used ``Series.filter(regex=...)``, which
    matches *index labels*, not values, so no token was ever filtered out.
    """
    normalized = tokens.dropna().str.lower().str.replace('ё', 'е', regex=False)
    has_text = normalized.map(lambda token: emoji_pattern.search(token) is not None)
    return normalized[has_text.astype(bool)]


def most_used_words(messages, filter_tokens, n=10):
    """Return the *n* most frequent tokens in *messages*.

    messages: Series of message texts; missing values are ignored.
    filter_tokens: collection of tokens (stop words, punctuation, ...) to exclude.
    n: number of top tokens to return.

    Returns a Series of counts indexed by token, most frequent first.
    """
    tokens = messages.dropna().apply(word_tokenize).explode()

    processed = process_tokens(tokens)

    return processed[~processed.isin(filter_tokens)].value_counts().head(n)

In [None]:
# Messages written by the notebook owner, then one token per row.
sent_mask = df['from_id'].str.contains(user_id, na=False)
sent_messages = df[sent_mask]['message']
sent_tokens = sent_messages.dropna().map(word_tokenize).explode()

In [None]:
# tokens to exclude from the "most used words" charts
extra = ['...', "''", "``", '—', 'это', '«', '»', 'https', 'http', '👍']
stop_words_ukr = ['це', 'з', 'і', 'та', 'що', 'шо', 'як', 'але', 'чи', 'бо', 'й']
# punctuation characters + NLTK English/Russian stop words + the two lists above
stop_tokens = [
    *string.punctuation,
    *stopwords.words('english'),
    *stopwords.words('russian'),
    *stop_words_ukr,
    *extra,
]

In [None]:
total_tokens = len(sent_tokens)
distinct_tokens = len(sent_tokens.unique())
print(f'Total number of sent tokens: {total_tokens}')
print(f'Number of unique sent tokens: {distinct_tokens}')

# 20 most frequent tokens in sent messages after stop-token removal.
top_sent_words = most_used_words(sent_messages, stop_tokens, 20)
top_sent_words.plot.bar(
    title='The most used words in sent messages',
    ylabel='Number of words',
)

## 6. The most used words in all chats

In [None]:
# Same ranking as above, but over every message in the export.
all_messages = df['message'].dropna()
top_words_overall = most_used_words(all_messages, stop_tokens, 20)
top_words_overall.plot.bar(
    title='The most used words in all chats',
    ylabel='Number of words',
)

## 7. The most used words in SENT messages after 24 February

In [None]:
# Sent messages whose timestamp is later than 2022-02-24 (compared as written).
is_sent_by_me = df['from_id'].str.contains(user_id, na=False)
is_after_cutoff = df['date'] > '2022-02-24'
sent_messages_after_24 = df.loc[is_sent_by_me & is_after_cutoff, 'message']
print(f'Number of sent messages after 24.02.2022: {len(sent_messages_after_24)}')

In [None]:
# 20 most frequent tokens in the sent messages selected above.
top_words_after_24 = most_used_words(sent_messages_after_24, stop_tokens, 20)
top_words_after_24.plot.bar(
    title='The most used words after 24.02.2022 in sent messages',
    ylabel='Number of words',
)

## 8. Proportion of different message types (will be used in h/w #6)

In [None]:
# Share of each message type across ALL messages. The title previously said
# "in sent messages", which did not match the unfiltered data plotted here.
df['type'].value_counts().plot.pie(
    autopct='%1.0f%%',
    title='Proportion of different message types',
    ylabel=None
)

## 9. Proportion of different message types in SENT messages

In [None]:
# Share of each message type among messages sent by the notebook owner.
# The title now states the "sent messages" restriction applied by the filter.
sent_types = df[df['from_id'].str.contains(user_id, na=False)]['type']
sent_types.value_counts().plot.pie(
    autopct='%1.0f%%',
    title='Proportion of different message types in sent messages',
    ylabel=None
)

## 10. What emotions were in the messages on February 24, 2022?

In [None]:
# googletrans client used below to translate messages to English before
# they are passed to text2emotion.
translator = Translator()

In [None]:
# Messages from dialogs with a positive id.
chats = df[df['dialog_id'].astype(int) > 0]

# Non-null message texts dated 23 February 2022.
is_feb_23 = chats['date'].dt.date == date(2022, 2, 23)
texts_feb_23 = chats.loc[is_feb_23, 'message'].dropna()

# Translate each text (source 'uk', target 'en') and score its emotions.
emotions = texts_feb_23.apply(
    lambda msg: te.get_emotion(translator.translate(msg, src='uk', dest='en').text)
)

In [None]:
# Expand the per-message emotion dicts into columns and average each emotion.
emotion_table = emotions.apply(pd.Series)
emotion_table.mean().plot.pie(
    autopct='%1.0f%%',
    title='Proportion of emotions in messages on February 23, 2022',
    ylabel=None,
)

In [None]:
# Messages from dialogs with a positive id.
chats = df[df['dialog_id'].astype(int) > 0]

# Non-null message texts dated 24 February 2022.
is_feb_24 = chats['date'].dt.date == date(2022, 2, 24)
texts_feb_24 = chats.loc[is_feb_24, 'message'].dropna()

# Translate each text (source 'uk', target 'en') and score its emotions.
emotions = texts_feb_24.apply(
    lambda msg: te.get_emotion(translator.translate(msg, src='uk', dest='en').text)
)

In [None]:
# Expand the per-message emotion dicts into columns and average each emotion.
emotion_table = emotions.apply(pd.Series)
emotion_table.mean().plot.pie(
    autopct='%1.0f%%',
    title='Proportion of emotions in messages on February 24, 2022',
    ylabel=None,
)

## 11. How are my sent messages distributed throughout the day?

In [None]:
# Timestamps of the messages sent by the notebook owner.
my_timestamps = df.loc[df['from_id'].str.contains(user_id, na=False), 'date']
# Keep only the clock time, with the seconds field zeroed.
clock_times = my_timestamps.apply(lambda d: d.time().replace(second=0))

# Ticks every two hours plus a final tick at 23:59.
two_hour_ticks = [time(hour, 0, 0) for hour in range(0, 24, 2)]

clock_times.value_counts().sort_index().plot(
    figsize=(10,5),
    title='Message distribution during the day',
    xlabel='Time',
    ylabel='Number of messages',
    xticks=two_hour_ticks + [time(23, 59, 0)],
)

## 12. On which days were the most messages in private chats?

In [None]:
# Calendar date of every message in a dialog with a positive id. The .dt
# accessor replaces the row-wise apply(axis=1), which built a Series object
# per row only to read one field.
private_dates = df.loc[df['dialog_id'].astype(int) > 0, 'date'].dt.date

# The 10 days with the most messages, busiest first.
msg_per_day = private_dates.value_counts().sort_values(ascending=False).head(10)
msg_per_day

## 13. What emotions did messages have on the most active days?

In [None]:
# Messages from dialogs with a positive id.
chats = df.loc[df['dialog_id'].astype(int).gt(0)]

In [None]:
# Re-create the googletrans client so this section can be run on its own.
from googletrans import Translator
translator = Translator()

In [None]:
# Emotion scores for the three busiest days, keyed by date.
emotions_on_day = {}
for busy_day in msg_per_day.index[:3]:
    day_texts = chats.loc[chats['date'].dt.date == busy_day, 'message'].dropna()
    emotions_on_day[busy_day] = day_texts.apply(
        lambda msg: te.get_emotion(translator.translate(msg, src='uk', dest='en').text)
    )
    print(f'Emotions per {busy_day}') # for checking progress

In [None]:
# Busiest day: take the date from the dictionary key so the title always
# matches the data (it used to be hard-coded to one particular export).
day, day_emotions = list(emotions_on_day.items())[0]
day_emotions.apply(pd.Series).mean().plot.pie(
    autopct='%1.0f%%',
    title=f'Proportion of emotions in messages on {day:%d %B, %Y}',
    ylabel=None
)

In [None]:
# Second busiest day: take the date from the dictionary key so the title
# always matches the data (it used to be hard-coded to one particular export).
day, day_emotions = list(emotions_on_day.items())[1]
day_emotions.apply(pd.Series).mean().plot.pie(
    autopct='%1.0f%%',
    title=f'Proportion of emotions in messages on {day:%d %B, %Y}',
    ylabel=None
)

In [None]:
# Third busiest day: take the date from the dictionary key so the title
# always matches the data (it used to be hard-coded to one particular export).
day, day_emotions = list(emotions_on_day.items())[2]
day_emotions.apply(pd.Series).mean().plot.pie(
    autopct='%1.0f%%',
    title=f'Proportion of emotions in messages on {day:%d %B, %Y}',
    ylabel=None
)

# Merged data analysis

In [None]:
# Distinct (dialog_id, name) pairs used to attach a chat name to each message.
chats_meta = df_meta[['dialog_id', 'name']].drop_duplicates()

# Integer view of dialog_id, used both for the sign test and for the merge.
dialog_id_int = df['dialog_id'].astype(int)

# Positive ids are treated as personal chats, negative ids as group chats.
personal_chats = df[dialog_id_int > 0].astype({'dialog_id': 'int'}).merge(chats_meta)
group_chats = df[dialog_id_int < 0].astype({'dialog_id': 'int'}).merge(chats_meta)
all_chats = df.astype({'dialog_id': 'int'}).merge(chats_meta)

## 1. Chats with which PEOPLE have the most text messages?

In [None]:
# Five personal chats with the most messages of type 'text'.
text_rows = personal_chats.loc[personal_chats['type'] == 'text']
text_per_chat = text_rows.groupby('name')['type'].count()
text_per_chat.sort_values(ascending=False).head(5).plot.bar(
    title='Number of text messages in chat',
    ylabel='Number of text messages',
)

## 2. Chats with which PEOPLE have the most audio messages?

In [None]:
# Five personal chats with the most messages of type 'voice'.
voice_rows = personal_chats.loc[personal_chats['type'] == 'voice']
voice_per_chat = voice_rows.groupby('name')['type'].count()
voice_per_chat.sort_values(ascending=False).head(5).plot.bar(
    title='Number of audio messages in chat',
    ylabel='Number of audio',
)

## 3. Chats with which PEOPLE have the most video messages?

In [None]:
# Five personal chats with the most messages of type 'video'.
video_rows = personal_chats.loc[personal_chats['type'] == 'video']
video_per_chat = video_rows.groupby('name')['type'].count()
video_per_chat.sort_values(ascending=False).head(5).plot.bar(
    title='Number of video messages in chats',
    ylabel='Number of videos',
)

## 4. Which GROUP chats have the most sticker messages?

In [None]:
# Five group chats with the most messages of type 'sticker'.
sticker_rows = group_chats.loc[group_chats['type'] == 'sticker']
stickers_per_chat = sticker_rows.groupby('name')['type'].count()
stickers_per_chat.sort_values(ascending=False).head(5).plot.bar(
    title='Number of sticker messages in group chats',
    ylabel='Number of stickers',
)

## 5. In what proportion do my interlocutors use pronouns?

In [None]:
from collections import Counter

# Pronouns to report, in their Ukrainian spelling.
pronouns = ['я', 'ти', 'ми', 'він', 'вони', 'вона', 'воно']
# Russian spellings folded onto the Ukrainian ones above.
map_pronouns = {
     'мы': 'ми', 'она': 'вона', 'он': 'він', 'они': 'вони', 'оно': 'воно', 'ты': 'ти'
}

def count_pronouns(tokens):
    """Count how often each pronoun in ``pronouns`` occurs in *tokens*.

    Tokens are lower-cased before matching so capitalised forms (e.g. a
    sentence-initial "Я") are counted, and Russian spellings are mapped to
    their Ukrainian equivalents through ``map_pronouns``.

    Returns a Series indexed by ``pronouns`` (in that order) holding the
    counts; pronouns that do not occur are reported as 0.
    """
    lowered = (token.lower() for token in tokens)
    # Single pass over the tokens instead of one list.count() per pronoun.
    tally = Counter(map_pronouns.get(token, token) for token in lowered)
    return pd.Series([tally[pronoun] for pronoun in pronouns], index=pronouns)

# Non-null texts of the messages addressed to the notebook owner.
# NOTE(review): `to_id` is compared for exact equality while every other cell
# matches `from_id` with str.contains(user_id) — confirm both columns use the
# same format, otherwise this selection is empty.
received_texts = all_chats.loc[all_chats['to_id'] == user_id, 'message'].dropna()

# One row of pronoun counts per message, indexed like `all_chats`. Building the
# frame explicitly (instead of a row-wise apply) guarantees the pronoun columns
# exist even when no message is selected, so the next cell does not fail on
# missing columns.
pronouns_count = pd.DataFrame(
    [count_pronouns(word_tokenize(text)) for text in received_texts],
    index=received_texts.index,
    columns=pronouns,
    dtype='int64',
)

In [None]:
# Attach the per-message pronoun counts to the messages by row index,
# then add them up per chat name.
messages_with_counts = pd.merge(all_chats, pronouns_count, left_index=True, right_index=True)
pronouns_data = messages_with_counts.groupby('name')[pronouns].sum()
pronouns_data['total'] = pronouns_data.sum(axis=1)

# Stacked bars for the 10 chats with the highest total pronoun count.
busiest_chats = pronouns_data.sort_values(by='total', ascending=False).head(10)
busiest_chats[pronouns].plot.bar(
    stacked=True,
    title='Proportion of pronouns in my interlocutors messages',
    ylabel='Pronoun frequency',
    figsize=(5, 7),
)