## Telegram group chat data analysis

For downloading chat data, go to Settings > Advanced > Export Telegram data > Machine-readable JSON

For downloading for a chat individually, JSON format is not available (but there is a github issue on that, so hopefully it will be available soon)

In [2]:
import json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [93]:
file = 'result.json' # You may change this if you changed the name of the file
with open(file) as file:
    dict_data = json.load(file)
dict_data # Data loaded to dictionary

{'about': "Here is the data you requested. Remember: Telegram is ad free, it doesn't use your data for ad targeting and doesn't sell it to others. Telegram only keeps the information it needs to function as a secure and feature-rich cloud service.\n\nCheck out Settings > Privacy & Security on Telegram's mobile apps for the relevant settings.",
 'chats': {'about': 'This page lists all chats from this export.',
  'list': [{'name': 'Cafetería VIP',
    'type': 'private_group',
    'id': 8962178252,
    'messages': [{'id': 701586,
      'type': 'service',
      'date': '2019-01-18T17:33:31',
      'edited': '1970-01-01T01:00:00',
      'actor': 'Alejandro',
      'actor_id': 339383956,
      'action': 'create_group',
      'title': 'Cafetería de emergencias',
      'members': ['Alejandro',
       'Angel',
       'Raquel',
       'Pink Lady',
       'Nuria',
       'nil',
       'DGM',
       'Carmen'],
      'text': ''},
     {'id': 701587,
      'type': 'service',
      'date': '2019-01-1

In [127]:
# Messages are extracted for the selected group
chat_name='test_name' # the name of the group chat
chat_data_list=dict_data['chats']['list']
for chat in chat_data_list:
    if chat['name']==chat_name:
        chat_data=chat
        break
chat_data_fil=[] # filter events like picture change or photos
# chat_members=[chat_data['messages'][0]['members']] # Initial members
# print("Chat members", chat_members)
for m in chat_data['messages']:
    if (m['type'] == 'message' and m['text']!=''):
        chat_data_fil.append(m)

In [126]:
# Convert to dataframe
df_chat = pd.DataFrame.from_dict(chat_data_fil)
# Only want user and message, although more info like date could be kept
col_sel=['from','text']
df_chat=df_chat[col_sel]

In [120]:
usernames=[]
messages=[]
for user, mes in tele.groupby(['from']):
    usernames.append(user)
    messages.append(mes['text'])
print("Users: ", usernames)

Users:  ['Alejandro', 'Alex Martínez', 'Angel', 'Carmen', 'Chat Against Humanity', 'Combot', 'DGM', 'Edu', 'Jesús Tamayo', 'Juan', 'Maria', 'Matemáticas UNED', 'MathNugget', 'Miguel J.', 'Naruto Pérez Reverte', 'Nelson', 'Nuria', 'Pink Lady', 'Raquel', 'Seila', 'Transcriber Bot', 'Ulises Botija', 'Victor', 'nil', 'nyar23', 'Álex y Timia']


In [128]:
# Stopwords are words commonly used in a language that do not provide much information
# For english, sklearn has a defined list, so stopwords='english' may be used
# For other languages or custom words, fill in the list
# E.g for Spanish
stopwords=['de', 'que', 'la', 'es', 'el', 'en','me', 'lo', 'un', 'los', 'una', 'han', 'las', 'al', 'se']

In [121]:
# Function
# Get top words from user 
# Function adapted from Cristhian Boujon at Medium
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
def get_top_n_words(user, stopwords, n=None):
    user_index=usernames.index(user)
    corpus=[str (item) for item in messages[user_index]]
    vec = CountVectorizer(stop_words=stopwords).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [124]:
# Now we can get the top words for a user
get_top_n_words('Angel', lista, n=100)

[('yo', 584),
 ('pero', 471),
 ('si', 426),
 ('eso', 323),
 ('como', 323),
 ('hay', 276),
 ('mi', 260),
 ('ya', 258),
 ('te', 255),
 ('más', 250),
 ('del', 241),
 ('type', 225),
 ('text', 225),
 ('muy', 205),
 ('jajaja', 194),
 ('tengo', 176),
 ('todo', 176),
 ('ver', 156),
 ('creo', 152),
 ('algo', 147),
 ('poco', 142),
 ('os', 141),
 ('sí', 139),
 ('soy', 137),
 ('le', 136),
 ('qué', 133),
 ('pues', 128),
 ('tu', 127),
 ('así', 127),
 ('mention', 127),
 ('porque', 127),
 ('nada', 118),
 ('cuando', 117),
 ('he', 116),
 ('este', 115),
 ('ni', 114),
 ('estoy', 114),
 ('son', 112),
 ('eh', 112),
 ('ser', 108),
 ('tiene', 103),
 ('grupo', 102),
 ('bueno', 102),
 ('sin', 100),
 ('era', 97),
 ('voy', 96),
 ('cosas', 96),
 ('bien', 94),
 ('su', 94),
 ('ok', 94),
 ('está', 94),
 ('claro', 94),
 ('hacer', 94),
 ('hace', 90),
 ('solo', 89),
 ('ese', 88),
 ('gente', 86),
 ('esto', 85),
 ('ahora', 85),
 ('tema', 84),
 ('esa', 84),
 ('link', 83),
 ('mucho', 81),
 ('aquí', 81),
 ('todos', 79),
 ('h