In [2]:
#importing packages
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

In [3]:
messenger_df = pd.read_csv('/Users/seb_coding/Documents/Facebook dataframes/Complete_Facebook_messages.csv', header=None, names=['Date', 'Sender', 'Message', 'Chat', 'Message_type', 'Status', 'Participants'])
messenger_df.head()

In [4]:
#by setting the data types to category the df size is reduced
dtypes = {'Message_type':'category', 'Sender':'category', 'Chat':'category', 'Status':'category', 'Participants':'category'} #reducing memory usage of dataframe

In [5]:
#removing ID's from Chat name

clean_chat_name_regex = r"_.+$"

messenger_df['Chat'] = messenger_df['Chat'].str.replace(clean_chat_name_regex, '')

  messenger_df['Chat'] = messenger_df['Chat'].str.replace(clean_chat_name_regex, '')


In [6]:
#creating new column defining the chat type as a group chat or personal chat
messenger_df['Chat_type'] = np.where(messenger_df['Participants'] == 2, 'Personal_Chat', 'Group_Chat')
messenger_df.head()

In [7]:
#number of people I have chatted with
print(messenger_df.Sender.nunique())

#number of chats that I have been in
print(messenger_df.Chat.nunique())

In [9]:
#number of messages sent in group chat vs personal chat
Groupchat_count_df = messenger_df.groupby(['Chat_type']).size().reset_index(name='Counts')

In [10]:
#Number of group chats and personal chats

Groupchat_count_df2 = messenger_df[['Chat', 'Participants', 'Chat_type']]
Groupchat_count_df3 = Groupchat_count_df2.drop_duplicates()
Groupchat_count_df3.sort_values('Chat')
Groupchat_count_df4 = Groupchat_count_df3.groupby(['Chat_type']).size().reset_index(name='Counts')
Groupchat_count_df5 = pd.merge(Groupchat_count_df, Groupchat_count_df4, on='Chat_type')
Groupchat_count_df5.head()

In [None]:
# Create subplots: use 'domain' type for Pie subplot
labels = ["Group_Chat", "Personal_Chat"]

facebook_colors = ['rgb(47, 71, 122)', 'rgb(78, 113, 186)']

fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=labels, values= Groupchat_count_df5.Counts_x, name="number of chats", marker_colors=facebook_colors),
              1, 1)
fig.add_trace(go.Pie(labels=labels, values= Groupchat_count_df5.Counts_y, name="Messages sent", marker_colors=facebook_colors),
              1, 2)

    
# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4, hoverinfo="label+percent+name")

fig.update_layout(
    title_text="Do I have more group chats or personal chats?",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='Number of Chats', x=0.135, y=0.5, font_size=15, showarrow=False),
                 dict(text='Messages sent', x=0.85, y=0.5, font_size=15, showarrow=False)])
fig.show()

In [None]:
# the number of group chats and participants 
Groupchat_count_by_participants_df = Groupchat_count_df3[Groupchat_count_df3['Participants'] > 2]
Groupchat_count_by_participants_df2 = Groupchat_count_by_participants_df.groupby(['Participants']).size().reset_index(name='Counts')
fig = px.bar(Groupchat_count_by_participants_df2, x="Participants", y="Counts", color = 'Counts', color_continuous_scale='ice')
fig.update_layout(xaxis_categoryorder = 'total descending', title = 'How many participants are there in my group chats?')
fig.show();

## Top 10 chats

In [None]:
top10_chats_df = messenger_df.groupby(['Chat', 'Chat_type']).size().reset_index(name='Counts')
top10_chats_df = top10_chats_df.sort_values('Counts', ascending = False).head(10)
top10_chats_df['Chat'] = top10_chats_df['Chat'].\
# If you want to annonymize some names use the following lines of code
#replace(['name1','name2', ...],['anonymized name 1','anonymized name 2', ...])
top10_chats_df

In [None]:
fig = px.bar(top10_chats_df, x="Chat", y="Counts", color = "Chat_type", color_discrete_map={
        'Group_Chat': 'rgb(47, 71, 122)',
        'Personal_Chat': 'rgb(78, 113, 186)'})
fig.update_layout(xaxis_categoryorder = 'total descending', title = 'What were my most popular chats?')
fig.show();

In [13]:
BBB_df = messenger_df[messenger_df['Chat'] == 'bbbandsbb']
BBB_df.head()

In [None]:
BBB_counts_df = BBB_df.groupby(['Sender']).size().reset_index(name='Counts').sort_values('Counts', ascending = False)

# If you want to annonymize some names use the following lines of code

#BBB_counts_df['Sender'] = BBB_counts_df['Sender'].replace(['name1','name2', ...],['anonymized name 1','anonymized name 2', ...])

fig = px.bar(BBB_counts_df, x="Sender", y="Counts", color = 'Sender', color_discrete_map={
        'Joe': 'rgb(47, 71, 122)',
        'Anna': 'rgb(78, 113, 186)',
        'me': 'rgb(70,130,180)',
        'Josephine': 'rgb(135,206,235)',
        'Charly': 'rgb(135,206,250)',
        })
fig.update_layout(xaxis_categoryorder = 'total descending', title = 'Who is the most active in my most active group chat?')
fig.show();

# Messages over time

In [14]:
messenger_df['date_time'] = messenger_df['Date']
messenger_df.date_time = pd.to_datetime(messenger_df.date_time)
messenger_df["quarter"]  = messenger_df.date_time.dt.quarter
messenger_df["year"]  = messenger_df.date_time.dt.year
messenger_df['year_quarter'] = messenger_df["year"].astype(str) + "." + messenger_df["quarter"].astype(str)

messenger_df.head()

In [15]:
messenger_count_df = messenger_df.groupby(['year', 'quarter']).size().reset_index(name='Counts')
messenger_count_df['year_quarter'] = messenger_count_df["year"].astype(str) + "." + messenger_count_df["quarter"].astype(str)

messenger_count_df2 = messenger_df.groupby(['year_quarter', 'Chat']).size().reset_index(name='Counts')

In [16]:
Messages_sent_by_me_df = messenger_df[messenger_df['Sender'] == 'add Sender Name']

Messages_sent_by_me_df2 = Messages_sent_by_me_df.groupby(['year', 'Chat']).size().reset_index(name='Counts')
Messages_sent_by_me_df3 = Messages_sent_by_me_df2[Messages_sent_by_me_df2.Counts > 1] 
Messages_sent_by_me_df3.loc[(Messages_sent_by_me_df3.Counts < 180),'Chat']='Others'
Messages_sent_by_me_df3.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
#Anonymizing names
#Messages_sent_by_me_df3['Chat'] = Messages_sent_by_me_df3['Chat'].replace(['name1','name2', ...],['anonymized name 1','anonymized name 2', ...])

#How many message did I send each year and who did I write with?
fig = px.bar(Messages_sent_by_me_df3, x="year", y="Counts", color='Chat')
fig.update_layout(xaxis_categoryorder = 'total descending', title = 'How many message did I send each year?')
fig.show();

# Most common times to write

In [None]:
## preparing the data for the heatmap

#filtering for only messages sent by myself
weekday_df = messenger_df[messenger_df['Sender'] == 'add Sender name']

#transformting the date type to 'datetime'
weekday_df['Date'] = pd.to_datetime(messenger_df['Date'])

#selecting only the needed columns
weekday_df = weekday_df[['Date', 'Sender', 'Message', 'Chat']]

#creating the day of the week column 'Weekday'
weekday_df['Weekday'] = weekday_df['Date'].dt.day_name()
weekday_hour_df = weekday_df

#creating the hour of the week column 'Hour'
weekday_hour_df['Hour'] = weekday_hour_df['Date'].dt.hour

#count the messages per day and hour
heatmap_total_df = weekday_hour_df.groupby(['Weekday', 'Hour']).size().reset_index(name='Counts')

#pivoting the table to match the heatmap
heatmap_df = heatmap_total_df.pivot_table(index='Hour',columns='Weekday',values='Counts', aggfunc=lambda x:x)
#ordering the days of the week
heatmap_df = heatmap_df[['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']]
#set missing numbers to 0
heatmap_df = heatmap_df.fillna(0)
heatmap_df.head()

In [None]:
#creating the heatmap figure

fig = px.imshow(heatmap_df, zmin=0, color_continuous_scale='RdBu_r',
               title="What are the most common times for texting?")

fig.update_layout(
    yaxis = dict(
        tickmode = 'linear',
        tick0 = 1,
        dtick = 1),
    coloraxis_colorbar=dict(
    title="Number of Messages")
)

fig.show()

# Most common words

In [18]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from os import path
from PIL import Image

In [None]:
#filter for only message sent my myself
my_common_words_df = messenger_df[messenger_df['Sender'] == 'add Sender Name']

#converting all letters to lower case for more accurate word counts
lower_case_my_common_words_df = my_common_words_df
lower_case_my_common_words_df['Message'] = my_common_words_df['Message'].str.lower()
#lower_case_my_common_words_df.head(5)

In [None]:
#removing unwanted words from the word cloud
banned2 = ['ð\x9f\x98', 'fuck', 'die', 'ð']
f = lambda x: ' '.join([item for item in x.split() if item not in banned2])
lower_case_my_common_words_df["Message"] = lower_case_my_common_words_df["Message"].apply(f)

In [26]:
#limiting the date range as spacey can only analyze data sets with less than 1.000.000 words
lower_case_my_common_words_df2 = lower_case_my_common_words_df.loc[lower_case_my_common_words_df['Date'] >= '2020-06-03 20:33:51']
text = " ".join(review for review in lower_case_my_common_words_df2.Message)

In [27]:
import spacy
from spacy.lang.en.examples import sentences 
from collections import Counter

#text_file = open("common_words.txt", "w")
#n = text_file.write(text)
#text_file.close()

In [28]:
nlp = spacy.load("en_core_web_sm")

doc = nlp(u"displaCy uses JavaScript, SVG and CSS.")

from collections import Counter

docx = nlp(open('common_words.txt').read())

In [None]:
docx

In [None]:
# Remove Punct,Stop 
# Nouns
nouns = [ token.text for token in docx if token.is_stop != True and token.is_punct !=True and token.pos_ == 'NOUN']
word_freq = Counter(nouns)
common_nouns = word_freq.most_common(22)
print(common_nouns)

In [32]:
# function to convert list to string
def dict_from_asslist(ass_list):
    return_dict = {}
    for (key, value) in ass_list:
        return_dict[key] = value
    return return_dict
dict_from_asslist.__doc__ = "Convert association list, i.e. [(x, y)] list of tuples of key and value, into a dict"

In [None]:
# creating the wordcloud

wordcloud = WordCloud(width = 1000, height = 500, background_color="white").generate_from_frequencies(dict_from_asslist(common_nouns))

plt.figure( figsize=(20,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=10)
plt.show()