# Facebook messenger statistics
Paste the path to the directory of the chat you'd like to analyze. It should contain one or more JSON files with names like 'message_1.json', 'message_2.json', etc.<br> The path should be enclosed in single or double quotes, with an 'r' in front of the opening quote (a Python raw string, so backslashes are taken literally), like this:<br>
`PATH = r'C:\mypath\mychat'`<br>
When that's ready, click on Cell in the menu above and select 'Run All'.

In [1]:
%matplotlib inline
import stats_pandas as stats
import pandas as pd
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px

# Initialise plotly's offline notebook mode (plotly.offline.init_notebook_mode).
init_notebook_mode(connected=True)
 
# Directory of the chat to analyse - replace this with your own path.
# The same PATH is passed again later to stats.most_reacted_msgs.
PATH = r'C:\messenger data\facebook-100002163210723\messages\inbox\gadzetyinspektora_9ZWCplP1iw'
    
# Load the chat; `chat` is the object every later cell hands to the stats helpers.
chat = stats.load_from_path(PATH)

## General chat statistics

In [2]:
# Per-sender message statistics; both names are reused by later cells.
msg_stats_dict = stats.get_msg_stats(chat)
msg_stats = pd.DataFrame.from_dict(
    msg_stats_dict,
    orient='index',
    columns=['Total msgs sent', 'Avg msg length', 'Total chars sent'],
)

# Bar chart of senders ranked by the number of messages they sent.
metric = 'Total msgs sent'
ranked = msg_stats.sort_values(by=metric, ascending=False)
bar_fig = px.bar(
    ranked,
    x=ranked.index,
    y=metric,
    text=metric,
    color=metric,
    labels={'x': 'Sender'},
    title='Total messages sent',
    color_continuous_scale=px.colors.sequential.RdPu,
)
# Hide the colour bar: the colour only mirrors the bar height.
bar_fig.update(layout_coloraxis_showscale=False)
bar_fig

In [3]:
# Bar chart of senders ranked by their average message length.
metric = 'Avg msg length'
ranked = msg_stats.sort_values(by=metric, ascending=False)
bar_fig = px.bar(
    ranked,
    x=ranked.index,
    y=metric,
    text=metric,
    color=metric,
    labels={'x': 'Sender'},
    title='Average characters per message',
    color_continuous_scale=px.colors.sequential.Sunset,
)
# Show the bar labels with one decimal place.
bar_fig.update_traces(texttemplate='%{text:.1f}')
# Hide the colour bar: the colour only mirrors the bar height.
bar_fig.update(layout_coloraxis_showscale=False)
bar_fig

In [4]:
# Bar chart of senders ranked by the total number of characters they sent.
metric = 'Total chars sent'
ranked = msg_stats.sort_values(by=metric, ascending=False)
bar_fig = px.bar(
    ranked,
    x=ranked.index,
    y=metric,
    text=metric,
    color=metric,
    labels={'x': 'Sender'},
    title='Total characters sent',
    color_continuous_scale=px.colors.sequential.PuBu,
)
# Put the value labels above the bars.
bar_fig.update_traces(textposition='outside')
# Hide the colour bar: the colour only mirrors the bar height.
bar_fig.update(layout_coloraxis_showscale=False)
bar_fig

## Does average message length correlate with the number of messages sent?

In [5]:
# Scatter plot of average message length against message count, with an OLS trend line.
fig = px.scatter(msg_stats, y = 'Avg msg length', x = 'Total msgs sent', trendline = 'ols', trendline_color_override = 'red')
fig.show()

# Look the coefficient up by column name rather than by position, and compute
# the correlation matrix only once (the earlier positional lookup was dead code
# whose result was overwritten immediately).
cor = msg_stats.corr().loc['Total msgs sent', 'Avg msg length']
print('The correlation is', cor)

The correlation is 0.03882538924421278


## Top 20 words most frequently used by each chat participant
You can change the minimum word length and the list of excluded words below.

In [6]:
# Words to leave out of the ranking - edit this list, e.g. ['example', 'example2'].
excluded = ['example', 'example2']

# Minimum word length passed to the word counter - edit this value.
min_word_length = 6

# Filtered per-sender word counts used by the top-20 charts in the next cell.
word_counts = stats.get_word_counts(
    chat,
    filter_participants_names=True,
    min_len=min_word_length,
    exclude_words=excluded,
)

In [7]:
# One subplot per sender, each showing that sender's first 20 entries of word_counts.
# NOTE(review): this presumably relies on stats.get_word_counts returning the words
# already ordered by frequency - confirm against the stats module.
plt_titles = [f'{sender} - min. word length = {min_word_length}' for sender in word_counts]
fig = make_subplots(
    rows=len(word_counts),
    cols=1,
    subplot_titles=plt_titles,
)
for i, sender in enumerate(word_counts):
    # Slice before building the frame so only 20 rows are ever materialised, and
    # name the columns up front: assigning df.columns afterwards fails with a
    # length mismatch when a sender has no qualifying words.
    top_words = list(word_counts[sender].items())[:20]
    df = pd.DataFrame(top_words, columns=['word', 'count'])
    if df.empty:
        # Nothing to draw for this sender; the titled subplot is left blank.
        continue
    f = px.bar(df, y='count', x='word', color=df.index)
    fig.add_trace(f.data[0], row=i + 1, col=1)
fig.update_layout(height=len(word_counts) * 350, width=900)
fig.update(layout_coloraxis_showscale=False)
fig.show()

## How often has _word1_ been used?
Set _word1_ below - it can be a single word or a list of words.
If regex=True is specified, _word1_ can also be a regular-expression pattern string, or a list of such patterns.

In [8]:
# Recompute the word counts for the word-usage cells below, overwriting the filtered
# `word_counts` that the top-20 charts used.
# NOTE(review): the positional arguments (False, None, 1) presumably map to
# filter_participants_names, exclude_words and min_len - confirm against
# stats.get_word_counts and consider passing them by keyword.
word_counts = stats.get_word_counts(chat, False, None, 1) #get unfiltered word_counts


In [9]:



word1 = ['prosze', 'proszƒô'] ## set word1 here

# Column label shared with the later correlation cells (they rebuild the same f-string).
w1_label = f'{word1} per 100 msgs'

# Usage coefficient of word1 per sender, ranked from highest to lowest and
# rescaled to "per 100 messages".
w1_raw = stats.word_usage_coefficients(word1, word_counts, msg_stats_dict, regex=False)
w1_ranked = sorted(w1_raw.items(), key=lambda pair: pair[1], reverse=True)
w1_coeffs = {sender: coeff * 100 for sender, coeff in w1_ranked}

w1_df = pd.DataFrame(w1_coeffs.items(), columns=['Sender', w1_label])
fig = px.bar(w1_df, x='Sender', y=w1_label)
fig.show()

## How often has every (Polish) swear word been used?

In [10]:
# Profanity coefficient per sender, as returned by the stats helper.
raw_pcoeffs = stats.get_profanity_coefficients(word_counts, msg_stats_dict, ignore_kurwas=False)

# Rank senders from highest to lowest and rescale to "per 100 messages".
ranked_pcoeffs = sorted(raw_pcoeffs.items(), key=lambda pair: pair[1], reverse=True)
pcoeffs = {sender: coeff * 100 for sender, coeff in ranked_pcoeffs}

df = pd.DataFrame(pcoeffs.items(), columns=['Sender', 'Profanity per 100 msgs'])
fig = px.bar(df, x='Sender', y='Profanity per 100 msgs')
fig.show()
# Also display the ranked table underneath the chart.
df

Unnamed: 0,Sender,Profanity per 100 msgs
0,Maciek G√≥rski,6.312522
1,Bartek Kr√≥lak,6.145893
2,Filip Perzanowski,6.139315
3,Pawe≈Ç Budniak,6.076067
4,Piotrek Jaworski,4.323212
5,Jakub GƒÖsior,4.103479


## Correlation of _word1_ usage with (Polish) swear word usage

In [11]:
# Join the per-sender profanity rate with the per-sender word1 rate and correlate them.
profanity_col = 'Profanity per 100 msgs'
w1_col = f'{word1} per 100 msgs'

df = pd.DataFrame(pcoeffs.items(), columns = ['Sender', profanity_col])
# Merge explicitly on the sender so the join key does not depend on which
# column names the two frames happen to share.
df = df.merge(w1_df, on = 'Sender', how = 'inner')

# Correlate only the two numeric columns: calling corr() on the whole frame
# includes the string column 'Sender', which raises on pandas >= 2.0, and the
# positional iloc[0, 1] lookup silently depended on the column order.
cor = df[[profanity_col, w1_col]].corr().iloc[0, 1]

fig = px.scatter(df, y = profanity_col, x = w1_col, trendline="ols", trendline_color_override = 'pink')
fig.show()
print('The correlation is', cor)

The correlation is 0.8152080978289258


## Correlation of _word1_ (set above) usage with _word2_ (set below) usage

In [12]:

word2 = ['dziƒôkujƒô', 'dziekuje'] # set word2 here

if word1 == word2:
    # Previously this only printed the warning and then carried on, correlating
    # word1 with itself; now the analysis is skipped instead.
    print('word 2 has to be different from word1')
else:
    w1_col = f'{word1} per 100 msgs'
    w2_col = f'{word2} per 100 msgs'

    # Usage coefficient of word2 per sender, rescaled to "per 100 messages".
    w2_coeffs = stats.word_usage_coefficients(word2, word_counts, msg_stats_dict, regex = False)
    w2_coeffs = {k: v * 100 for k, v in w2_coeffs.items()}
    w2_df = pd.DataFrame(w2_coeffs.items(), columns = ['Sender', w2_col])

    # Merge explicitly on the sender rather than on whatever columns coincide.
    nat_join = w2_df.merge(w1_df, on = 'Sender', how = 'inner')

    # Correlate only the two numeric columns: corr() on the whole frame includes
    # the string column 'Sender', which raises on pandas >= 2.0.
    cor = nat_join[[w2_col, w1_col]].corr().iloc[0, 1]

    fig = px.scatter(nat_join, y = w2_col, x = w1_col, trendline="ols", trendline_color_override = 'pink')
    fig.show()
    print('The correlation is', cor)

The correlation is -0.6367774639845696


In [13]:
# Substring to look for; count_word reads this module-level name.
word = 'kurwa'


def count_word(series):
    """Return how many times `word` occurs across the messages in `series`.

    Missing entries (anything `pd.isna` reports as NA) are skipped. Occurrences
    are counted with `str.count`, i.e. as non-overlapping substring matches, so
    the word is also counted when it appears inside a longer word.
    """
    return sum(msg.count(word) for msg in series if not pd.isna(msg))

# Grouping frequency handed to pd.Grouper.
frequency = 'M'

# Work on a copy so that `chat` keeps its original index.
dated_chat = chat.copy()
dated_chat.index = stats.epoch_to_date(dated_chat['timestamp_ms'])

# Occurrences of `word` per period, counted over the 'content' column.
period_grouper = pd.Grouper(freq=frequency)
df = dated_chat.groupby(period_grouper)['content'].apply(count_word)
df

timestamp_ms
2020-01-31 00:00:00+01:00    55
2020-02-29 00:00:00+01:00    37
2020-03-31 00:00:00+02:00    43
2020-04-30 00:00:00+02:00    40
2020-05-31 00:00:00+02:00    33
2020-06-30 00:00:00+02:00    25
2020-07-31 00:00:00+02:00    29
2020-08-31 00:00:00+02:00     4
Freq: M, Name: content, dtype: int64

## Types of messages sent

In [14]:
import math

# Set include_txt = True if the basic text message type should be included in the pie charts
msg_types = stats.get_msg_types(chat, include_txt=False)

# Two pies per row, so half as many rows as senders (rounded up).
ceiling = math.ceil(len(msg_types) / 2)

fig = make_subplots(
    rows=ceiling,
    cols=2,
    subplot_titles=tuple(msg_types.keys()),
    # Every cell of the grid holds a pie chart ('domain' subplot type).
    specs=[[{'type': 'domain'} for _ in range(2)] for _ in range(ceiling)],
)

for position, sender in enumerate(msg_types):
    # Fill the grid left to right, top to bottom (plotly rows/cols are 1-based).
    grid_row, grid_col = divmod(position, 2)
    mydf = pd.DataFrame.from_dict(msg_types[sender], orient='index', columns=['count'])
    f = px.pie(mydf, values='count', names=mydf.index)
    fig.add_trace(f.data[0], row=grid_row + 1, col=grid_col + 1)

fig.update_layout(height=500 * ceiling, width=900)
fig.show()

In [15]:
# Table of message-type counts with one column per sender, this time including plain
# text; combinations missing from the result are shown as 0.
pd.DataFrame(stats.get_msg_types(chat, include_txt = True)).fillna(0)

Unnamed: 0,Bartek Kr√≥lak,Piotrek Jaworski,Filip Perzanowski,Jakub GƒÖsior,Pawe≈Ç Budniak,Maciek G√≥rski
photos,75.0,105.0,105.0,46.0,140.0,368.0
videos,4.0,12.0,25.0,0.0,3.0,35.0
sticker,4.0,0.0,0.0,4.0,0.0,5.0
share,6.0,10.0,21.0,4.0,25.0,61.0
txt,1651.0,1815.0,3233.0,1066.0,2001.0,5357.0
audio_files,0.0,0.0,3.0,0.0,0.0,0.0
gifs,0.0,0.0,5.0,5.0,3.0,15.0
files,0.0,0.0,0.0,0.0,0.0,1.0


## Messages over time

In [26]:
# Message counts grouped with the 'M' frequency, with the index formatted as e.g. "Jan 2020".
dates = stats.groupby_date(chat, 'M')
dates.index = dates.index.strftime("%b %Y")

# (A stray bare `dates` expression used to sit here; it displayed nothing because
# it was not the last statement of the cell, so it has been removed.)
f = px.line(dates, y = dates, x = dates.index, labels = {'y': 'Message count', 'x': 'Date'})
# Cap the number of x-axis ticks for long chats.
if len(dates) > 20:
    f.update_xaxes(nticks = 20)
f.show()

## Messages by time intervals

In [17]:
# Interval codes understood by stats.groupby_time and their display names.
intervals = ['Y', 'M', 'W', 'H']
fullnames = {'Y': 'Year', 'M': 'Month', 'W': 'Weekday', 'H': 'Hour'}

# One stacked subplot per interval, titled with its display name.
plt_titles = [fullnames[code] for code in intervals]
fig = make_subplots(rows=len(intervals), cols=1, subplot_titles=plt_titles)

for row, interval in enumerate(intervals, start=1):
    series = stats.groupby_time(chat, interval, interval_names=True)
    f = px.bar(
        series,
        x=series.index,
        y=series,
        labels={'x': fullnames[interval], 'y': 'Message count'},
    )
    fig.add_trace(f.data[0], row=row, col=1)

# Last expression of the cell: update_layout's return value is what gets displayed.
fig.update_layout(height=len(intervals) * 500, bargap=0.4)

## Total reactions used in the chat

In [18]:
# Reaction totals for the whole chat, most used first.
reaction_totals = stats.total_reacts(chat)
total_reacts = pd.Series(reaction_totals).sort_values(ascending=False)

# Left: absolute counts as bars. Right: shares as a pie ('domain' subplot type).
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Total reactions flat', 'Total reactions percentages'),
    specs=[[{'type': 'bar'}, {'type': 'domain'}]],
)

bar_fig = px.bar(x=total_reacts.index, y=total_reacts)
pie_fig = px.pie(values=total_reacts, names=total_reacts.index)
pie_fig.update_traces(textposition='inside', textinfo='percent+label')

# Copy the single trace of each express figure into its subplot column.
for column, source_fig in enumerate((bar_fig, pie_fig), start=1):
    fig.add_trace(source_fig.data[0], row=1, col=column)
fig.show()

## Reactions used by participant
Seaborn/matplotlib can't display emojis on labels, so for pretty emoji graphs try the plotly notebook version

In [19]:
# stats.reaction_stats returns two tables: reactions used and reactions received.
# `received` is displayed and reused by the following cells.
used, received = stats.reaction_stats(chat)
# Display the reactions each participant used.
used

Unnamed: 0,üò¢,‚ù§,üòÆ,üëé,üëç,üòÜ,üòç,üò†,üíó,total,most_used
Bartek Kr√≥lak,5.0,17.0,16.0,4.0,27.0,16.0,7.0,0.0,0.0,92.0,üëç
Piotrek Jaworski,18.0,30.0,15.0,5.0,14.0,28.0,11.0,3.0,0.0,124.0,‚ù§
Filip Perzanowski,7.0,11.0,8.0,11.0,14.0,10.0,1.0,3.0,0.0,65.0,üëç
Jakub GƒÖsior,9.0,21.0,22.0,3.0,11.0,7.0,3.0,1.0,0.0,77.0,üòÆ
Pawe≈Ç Budniak,2.0,0.0,20.0,3.0,17.0,6.0,2.0,1.0,1.0,52.0,üòÆ
Maciek G√≥rski,2.0,140.0,1.0,38.0,28.0,2.0,11.0,3.0,0.0,225.0,‚ù§


## Reactions received by participant

In [20]:
# Display the reactions each participant received (computed together with `used` above).
received

Unnamed: 0,‚ù§,üëé,üëç,üòÆ,üòÜ,üòç,üò¢,üò†,üíó,total,most_received
Bartek Kr√≥lak,8.0,5.0,3.0,8.0,4.0,3.0,0.0,0.0,0.0,31.0,‚ù§
Piotrek Jaworski,21.0,12.0,17.0,16.0,16.0,3.0,6.0,1.0,0.0,92.0,‚ù§
Filip Perzanowski,16.0,9.0,16.0,13.0,10.0,8.0,5.0,1.0,0.0,78.0,‚ù§
Jakub GƒÖsior,12.0,3.0,16.0,14.0,3.0,11.0,8.0,0.0,1.0,68.0,üëç
Pawe≈Ç Budniak,18.0,33.0,23.0,11.0,9.0,4.0,9.0,5.0,0.0,112.0,üëé
Maciek G√≥rski,144.0,2.0,36.0,20.0,27.0,6.0,15.0,4.0,0.0,254.0,‚ù§


## Ratio of each reaction received to total messages sent by each person

In [21]:
# Relate the received-reaction table to the per-sender message statistics;
# `percents` is reused by the relative ranking cell further down.
percents = stats.react_percents(received, msg_stats_dict)
# Display the resulting table.
percents

Unnamed: 0,‚ù§,üëé,üëç,üòÆ,üòÜ,üòç,üò¢,üò†,üíó,total,most_received
Bartek Kr√≥lak,0.46%,0.29%,0.17%,0.46%,0.23%,0.17%,0.00%,0.00%,0.00%,1.78%,‚ù§
Piotrek Jaworski,1.08%,0.62%,0.87%,0.82%,0.82%,0.15%,0.31%,0.05%,0.00%,4.73%,‚ù§
Filip Perzanowski,0.47%,0.27%,0.47%,0.38%,0.30%,0.24%,0.15%,0.03%,0.00%,2.30%,‚ù§
Jakub GƒÖsior,1.07%,0.27%,1.43%,1.25%,0.27%,0.98%,0.71%,0.00%,0.09%,6.07%,üëç
Pawe≈Ç Budniak,0.83%,1.53%,1.07%,0.51%,0.42%,0.19%,0.42%,0.23%,0.00%,5.19%,üëé
Maciek G√≥rski,2.48%,0.03%,0.62%,0.34%,0.47%,0.10%,0.26%,0.07%,0.00%,4.38%,‚ù§


## People who receive the most reactions

In [22]:
# (title, reaction) pairs: the label used in the printed sentence and the reaction
# it is ranked by. `titles` is reused by the relative ranking cell below.
titles = [ ('funniest', 'üòÜ'), ('most hated','üëé'), ('most infuriating', 'üò†'),
           ('most beloved', 'üòç'), ('most shocking', 'üòÆ'), ('saddest', 'üò¢'), ('most liked', 'üëç'), ('beloved v3', 'üíó'), ('beloved v2', '‚ù§')]

# Report the top receiver of each reaction using the absolute counts in `received`.
for title, emoji in titles:
    stats.most_reactions(received, title, emoji, percent = False)

The funniest person is Maciek G√≥rski: 27.0 of his messages received 'üòÜ', 2nd place: Piotrek Jaworski (16.0)

The most hated person is Pawe≈Ç Budniak: 33.0 of his messages received 'üëé', 2nd place: Piotrek Jaworski (12.0)

The most infuriating person is Pawe≈Ç Budniak: 5.0 of his messages received 'üò†', 2nd place: Maciek G√≥rski (4.0)

The most beloved person is Jakub GƒÖsior: 11.0 of his messages received 'üòç', 2nd place: Filip Perzanowski (8.0)

The most shocking person is Maciek G√≥rski: 20.0 of his messages received 'üòÆ', 2nd place: Piotrek Jaworski (16.0)

The saddest person is Maciek G√≥rski: 15.0 of his messages received 'üò¢', 2nd place: Pawe≈Ç Budniak (9.0)

The most liked person is Maciek G√≥rski: 36.0 of his messages received 'üëç', 2nd place: Pawe≈Ç Budniak (23.0)

The beloved v3 person is Jakub GƒÖsior: 1.0 of his messages received 'üíó', 2nd place: Bartek Kr√≥lak (0.0)

The beloved v2 person is Maciek G√≥rski: 144.0 of his messages received '‚ù§', 2nd place:

## People who receive the most reactions relative to the number of messages they send

In [23]:
# Same ranking as above, but on the `percents` table, with percent = True.
for title, emoji in titles:
    stats.most_reactions(percents, title, emoji, percent = True)

The funniest person is Piotrek Jaworski: 0.82% of his messages received 'üòÜ', 2nd place: Maciek G√≥rski (0.47%)

The most hated person is Pawe≈Ç Budniak: 1.53% of his messages received 'üëé', 2nd place: Piotrek Jaworski (0.62%)

The most infuriating person is Pawe≈Ç Budniak: 0.23% of his messages received 'üò†', 2nd place: Maciek G√≥rski (0.07%)

The most beloved person is Jakub GƒÖsior: 0.98% of his messages received 'üòç', 2nd place: Filip Perzanowski (0.24%)

The most shocking person is Jakub GƒÖsior: 1.25% of his messages received 'üòÆ', 2nd place: Piotrek Jaworski (0.82%)

The saddest person is Jakub GƒÖsior: 0.71% of his messages received 'üò¢', 2nd place: Pawe≈Ç Budniak (0.42%)

The most liked person is Jakub GƒÖsior: 1.43% of his messages received 'üëç', 2nd place: Pawe≈Ç Budniak (1.07%)

The beloved v3 person is Jakub GƒÖsior: 0.09% of his messages received 'üíó', 2nd place: Bartek Kr√≥lak (0.00%)

The beloved v2 person is Maciek G√≥rski: 2.48% of his messages receive

## Messages that received the biggest number of each reaction

In [24]:
# Print the record-holding message for each reaction (output is produced by the stats helper).
stats.print_reaction_records(chat)


 Jakub GƒÖsior : üëç
Kto nie idzie jutro do szko≈Çy ≈Çapka w g√≥rƒô

 Piotrek Jaworski : üëé
Chce kto≈õ na p√≥j≈õƒá na jakie≈õ wpierdalanie?

 Maciek G√≥rski : üòÜ
<media>

 Maciek G√≥rski : ‚ù§
<media>

 Jakub GƒÖsior : üòç
Cipa penis zawsze ≈õmieszne

 Jakub GƒÖsior : üòÆ
Bƒôdƒô solo

 Pawe≈Ç Budniak : üò†
a to jednak jebac was

 Pawe≈Ç Budniak : üò¢
niestety maja na mnie totalnie wypierdolone

 Jakub GƒÖsior : üíó
<media>


## Top *n* record-holders of reactions received, with more details and *m* adjacent messages for context

In [25]:
# List the reaction names that can be used as keys of stats.emojis in the next cell.
stats.emojis.keys()

dict_keys(['LIKE_EMOJI', 'DISLIKE_EMOJI', 'LAUGHING_EMOJI', 'HEART_EMOJI', 'HEART_EYES_EMOJI', 'SHOCKED_EMOJI', 'ANGRY_EMOJI', 'SAD_EMOJI', 'HEART_EMOJI_2'])

In [26]:
# n is passed as how_many (the *n* of the heading above).
n = 3
# m is passed as context (the *m* adjacent messages of the heading above).
m = 2
reaction_emoji = stats.emojis['LIKE_EMOJI'] # pick react type from keys displayed above

# PATH is the chat directory set in the first cell.
stats.most_reacted_msgs(chat, PATH, reaction_emoji, how_many = n, context = m)

	 sender: Pawe≈Ç Budniak  2020-06-07 20:00:08.644000
	content:  przykre ze maciek nigdy nie spelnil swoich marzen 

	 sender: Pawe≈Ç Budniak  2020-06-07 20:00:11.379000
	content:  o lapkach w gure 

üëç: 6,  sender: Jakub GƒÖsior  2020-06-07 20:01:39.096000
content:  Kto nie idzie jutro do szko≈Çy ≈Çapka w g√≥rƒô 

	 sender: Pawe≈Ç Budniak  2020-06-07 20:06:39.854000
	content:  a na grupie ursynowskiej to chyba nie wiedz ze mozna dawac reakcje 

	 sender: Pawe≈Ç Budniak  2020-06-07 20:06:43.960000
	content:  wszystkie wiadomosci maja max 2 



	 sender: Piotrek Jaworski  2020-05-28 15:16:34.506000
	content:  Zaliczy≈Çem komplet 

	 sender: Maciek G√≥rski  2020-05-28 15:47:02.029000
	content:  uzupe≈Çnijmy te statystki 

üëç: 4,  sender: Maciek G√≥rski  2020-05-28 15:47:16.477000
content:  je≈õli tw√≥j penis ma
wiƒôcej ni≈º 3cm daj ≈Çapkƒô w g√≥rƒô 

	 sender: Maciek G√≥rski  2020-05-28 15:47:35.507000
	content:  no ja my≈õle 

	 sender: Maciek G√≥rski  2020-05-28 15:48:25.786000
	con