# Discord InfoVis

This small project visualizes the Discord messages exchanged between two individuals. I used [DiscordChatExporter](https://github.com/Tyrrrz/DiscordChatExporter) to export a dataset that captures all messages sent between them. The dataset contains the following columns: author, date, content, attachments, and reactions.

## Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import calendar

from matplotlib import rcParams
from matplotlib.dates import (YEARLY, DateFormatter,
                              rrulewrapper, RRuleLocator, drange)
from datetime import datetime, timedelta
from adjustText import adjust_text

## Configuration

In [None]:
# Path of the CSV export that the cleaning step loads with pd.read_csv (';'-separated)
DATASET_PATH = 'datasets/example.csv'
# Values of the export's author column that identify the two participants
USERNAME_MYSELF = 'EXAMPLE#6968'
USERNAME_OTHER = 'EXAMPLE#4988'
# Display names that replace the usernames above after cleaning.
# NOTE: the per-person statistics filter on these names, so the two values
# have to differ for the split between both people to be meaningful.
NAME_MYSELF = 'Example'
NAME_OTHER = 'Example'
# Two-colour palette used by the plots
COLOR1 = '#73455F'
COLOR2 = '#E6706E'
COLORS = [COLOR1, COLOR2]

## Cleaning

In [None]:
# Import the data (the export uses ';' as the column separator and every
# column is read as text)
df = pd.read_csv(DATASET_PATH, sep=';', dtype={"Author": str, "Date": str, "Content": str, "Attachments": str, "Unnamed": str})

# Rename the columns to the lower-case names used in the rest of the notebook
df = df.rename(columns={'Author': 'author', 'Date': 'time', 'Content': 'content',
                        'Attachments': 'attachments', 'Reactions': 'reactions'})

# Drop the nameless sixth column that pandas labels 'Unnamed: 5'; exports
# that do not contain it are tolerated instead of raising a KeyError
df = df.drop(columns=['Unnamed: 5'], errors='ignore')

# Final day that contains incomplete data
# incomplete = 198
# df.drop(df.tail(incomplete).index,inplace=True)

# Convert date and time to standard datetime object
df['time'] = df['time'].astype('datetime64[ns]')

# Create date column that only captures the date (datetime.date objects)
df['date'] = df['time'].dt.date

# Fill the NaNs with empty strings. The columns are assigned back instead of
# calling fillna(inplace=True) on a column selection: that chained form is
# deprecated and does not modify df when pandas Copy-on-Write is active.
text_columns = ['content', 'attachments', 'reactions']
df[text_columns] = df[text_columns].fillna('')

# Replace the usernames by the real names for convenience; any author that
# is not one of the two configured usernames becomes NaN
df['author'] = df['author'].map({USERNAME_MYSELF: NAME_MYSELF, USERNAME_OTHER: NAME_OTHER})

## Visualization

In [None]:
# Message totals: all rows, and the rows written by each participant
number_messages_total = df.shape[0]
number_messages_myself = int((df['author'] == NAME_MYSELF).sum())
number_messages_other = int((df['author'] == NAME_OTHER).sum())

In [None]:
def make_autopct(values):
    """Build a label formatter for the ``autopct`` argument of ``plt.pie``.

    ``values`` holds the wedge sizes of the pie. The returned function takes
    the percentage of one wedge and renders it together with the absolute
    count that percentage corresponds to, e.g. ``'25.00%  (10)'``.
    """
    label_template = '{p:.2f}%  ({v:d})'

    def my_autopct(pct):
        # Convert the percentage back into the absolute count it stands for
        absolute = int(round(pct * sum(values) / 100.0))
        return label_template.format(p=pct, v=absolute)

    return my_autopct

In [None]:
# Pie chart showing how the messages are split between the two participants.
# Every row counts as a message, so images, emotes, videos, links, etc. are
# included as well.
labels = [NAME_MYSELF, NAME_OTHER]
share = [number_messages_myself, number_messages_other]
pie_options = dict(explode=(0.025, 0), colors=COLORS, shadow=True, startangle=90)
patches, texts, _ = plt.pie(share, autopct=make_autopct(share), **pie_options)
plt.title('Number of messages sent per person')
plt.axis('equal')
plt.legend(patches, labels, loc="best")
# plt.savefig('output/total_messages_per_person.svg')

In [None]:
# Time period covered by the data: timestamps of the first and the last row.
# Positional access (.iloc) is used so this keeps working when the index is
# not the default 0..n-1 range (e.g. after rows have been dropped).
# NOTE(review): assumes the rows are in chronological order — confirm for the export.
start_date = df['time'].iloc[0]
end_date = df['time'].iloc[-1]
difference = end_date - start_date

print('Started talking on {0}'.format(start_date))
print('This data has been captured until {0}'.format(end_date))
print('That is a total of {0}'.format(difference))

In [None]:
# Count the rows per calendar day. After the reset, 'date' is an ordinary
# column again and every other column holds the number of non-null values
# for that day.
df_groupedby_date = df.groupby('date').count().reset_index()

In [None]:
# Tick rule for the x-axis so that labels are only shown per month.
# NOTE(review): the rule is YEARLY with bymonthday=1, which presumably expands
# to the first day of every month — confirm the tick spacing on real data.
rule = rrulewrapper(YEARLY, bymonthday=1)
loc = RRuleLocator(rule)
formatter = DateFormatter('%m/%y')

fig, ax = plt.subplots()
# One marker per day. Axes.plot accepts date values directly; plt.plot_date
# is deprecated in recent Matplotlib releases.
ax.plot(df_groupedby_date['date'], df_groupedby_date['content'], 'o', label='Data', color=COLOR2, alpha=0.5)

# Dashed reference line at the mean number of messages per day, taken over
# the days that occur in the data
y = df_groupedby_date['content'].mean()
y_mean = [y] * len(df_groupedby_date)
mean_line = ax.plot(df_groupedby_date['date'], y_mean, label='Mean', linestyle='--', color=COLOR1)

ax.set_title('Number of messages sent per day')
ax.set_xlabel('Day')
ax.set_ylabel('Number of messages')
legend = ax.legend(loc='upper right')
ax.xaxis.set_major_locator(loc)
ax.xaxis.set_major_formatter(formatter)
ax.xaxis.set_tick_params(rotation=75)
# plt.savefig('output/total_messages_per_day.svg')

In [None]:
# Messages per month: tag every message with its 'MM/YY' label and count
# the rows per label
df['month_year'] = df['time'].dt.strftime('%m/%y')
g = sns.catplot(data=df, x='month_year', kind='count', color=COLOR2)
plt.title('Number of messages sent per month')
plt.margins(x=0)
g.set_xticklabels(rotation=75)
g.set(xlabel='Month/year combination', ylabel='Number of messages')
# g.savefig('output/total_messages_per_month.svg')

In [None]:
# Messages per month, split by author.
# Time zones are disregarded here: both people are plotted on the same clock.
# hue_order pins which person is drawn first. Without it seaborn orders the
# hue levels by first appearance in the data, so the hard-coded legend labels
# (and the palette colours) could end up attached to the wrong person.
g = sns.catplot(x='month_year', kind='count', hue='author', hue_order=[NAME_MYSELF, NAME_OTHER],
                data=df, legend=False, palette=sns.color_palette(COLORS))
plt.margins(x=0)
plt.legend([NAME_MYSELF, NAME_OTHER])
plt.title('Number of messages sent per month')
g.set(xlabel='Month/year combination', ylabel='Number of messages')
g.set_xticklabels(rotation=75)
# g.savefig('output/individual_messages_per_month.svg')

In [None]:
# Messages per day of the week
df['weekday'] = df['time'].dt.day_name()

# Fix the bar order to the calendar week. Without an explicit order the
# categories are drawn in the order in which they first occur in the data.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
g = sns.catplot(x='weekday', kind='count', order=weekday_order, data=df, color=COLOR2)
plt.margins(x=0)
plt.title('Number of messages sent per day of the week')
g.set(xlabel='Day of the week', ylabel='Number of messages')
g.set_xticklabels(rotation=75)
# g.savefig('output/total_messages_per_weekday.svg')

In [None]:
# Messages per day of the week, split by author (me and other).
# Time zones are disregarded here because handling them is complicated.
# order pins the bars to the calendar week; hue_order pins which person is
# drawn first so the hard-coded legend labels and the palette colours cannot
# end up attached to the wrong person.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
g = sns.catplot(x='weekday', kind='count', hue='author', order=weekday_order,
                hue_order=[NAME_MYSELF, NAME_OTHER], data=df, legend=False,
                palette=sns.color_palette(COLORS))
plt.margins(x=0)
plt.title('Number of messages sent per day of the week')
plt.legend([NAME_MYSELF, NAME_OTHER])
g.set(xlabel='Day of the week', ylabel='Number of messages')
g.set_xticklabels(rotation=75)
# g.savefig('output/individual_messages_per_weekday.svg')

In [None]:
# Messages per hour of the day.
# Every message is binned by the hour component of its timestamp, i.e. the
# minutes are dropped (14:59 counts as hour 14); nothing is rounded.
# The former df.sort_values(by=['hour']) call was removed: its result was
# discarded, so it never had any effect.
df['hour'] = df['time'].dt.hour
g = sns.catplot(x='hour', kind='count', data=df, color=COLOR1)
plt.margins(x=0)
plt.title('Number of messages sent per hour (GMT+2)')
g.set(xlabel='Hour of the day', ylabel='Number of messages')
# g.savefig('output/total_messages_per_hour.svg')

In [None]:
# Messages per hour of the day, split by author.
# Every message is binned by the hour component of its timestamp, i.e. the
# minutes are dropped (14:59 counts as hour 14); nothing is rounded.
# The former df.sort_values(by=['hour']) call was removed: its result was
# discarded, so it never had any effect.
df['hour'] = df['time'].dt.hour
# hue_order pins which person is drawn first so the hard-coded legend labels
# and the palette colours cannot end up attached to the wrong person.
g = sns.catplot(x='hour', kind='count', hue='author', hue_order=[NAME_MYSELF, NAME_OTHER],
                data=df, legend=False, palette=sns.color_palette(COLORS))
plt.margins(x=0)
plt.legend(loc='upper left', labels=[NAME_MYSELF, NAME_OTHER])
plt.title('Number of messages sent per hour (GMT+2)')
g.set(xlabel='Hour of the day', ylabel='Number of messages')
# g.savefig('output/individual_messages_per_hour_2.svg')

In [None]:
# Same per-hour breakdown as the GMT+2 chart, on a clock that is 10 hours
# ahead (GMT+12 according to the title). The shift is applied to a copy so
# df itself stays untouched.
df_nz = df.copy()
df_nz['time'] = df_nz['time'] + timedelta(hours=10)
df_nz['hour'] = df_nz['time'].dt.hour
# The former df_nz.sort_values(by=['hour']) call was removed: its result was
# discarded, so it never had any effect.
# hue_order pins which person is drawn first so the hard-coded legend labels
# and the palette colours cannot end up attached to the wrong person.
g = sns.catplot(x='hour', kind='count', hue='author', hue_order=[NAME_MYSELF, NAME_OTHER],
                data=df_nz, legend=False, palette=sns.color_palette(COLORS))
plt.margins(x=0)
plt.legend(loc='upper left', labels=[NAME_MYSELF, NAME_OTHER])
plt.title('Number of messages sent per hour (GMT+12)')
g.set(xlabel='Hour of the day', ylabel='Number of messages')
# g.savefig('output/individual_messages_per_hour_12.svg')

In [None]:
# Length of every message with its space characters left out, stored on a
# copy of df so the original frame does not gain the extra column
df_text_length = df.copy()
message_text = df_text_length['content']
df_text_length['content_length'] = message_text.str.len() - message_text.str.count(' ')

In [None]:
# Distribution of the message length per person, restricted to messages
# whose length (spaces excluded) is at most 50
short_messages = df_text_length[df_text_length['content_length'] <= 50]
ax = sns.violinplot(data=short_messages, x='content_length', y='author', palette=sns.color_palette(COLORS))
plt.title('Message length sent by person (<50 characters)')
plt.ylabel('Person')
plt.xlabel('Message length')
# plt.savefig('output/individual_length_of_messages.svg')

In [None]:
# Scatter plot of every message's length against the day it was sent,
# coloured per author
rule = rrulewrapper(YEARLY, bymonthday=1)
loc = RRuleLocator(rule)
formatter = DateFormatter('%m/%y')

fig = plt.figure(figsize=(12,6))
ax = sns.scatterplot(data=df_text_length, x='date', y='content_length', hue='author',
                     alpha=0.8, legend='full', palette=sns.color_palette(COLORS))
plt.title('Message length per date')
plt.xlabel('Month/year combination')
plt.ylabel('Message length')

# Leave five days of breathing room on either side of the data
padding = timedelta(days=5)
first_day = df_text_length['date'].min()
last_day = df_text_length['date'].max()
ax.set_xlim(first_day - padding, last_day + padding)

ax.xaxis.set_major_locator(loc)
ax.xaxis.set_major_formatter(formatter)
ax.xaxis.set_tick_params(rotation=75, labelsize=10)
# plt.savefig('output/individual_message_length_per_day.png', dpi=500)

In [None]:
# Most commonly used words (50) - https://www.wordclouds.com/
# I like this more because it provides easy ways to configure the cloud
# from collections import Counter 

# most_common_words = df
# most_common_words['content'] = [row + ' , ' for row in most_common_words['content']]
# most_common_words = most_common_words['content'].sum()

# text_file = open("Output.txt", "w")
# text_file.write(most_common_words)
# text_file.close()

# Output file (after putting it in wordclouds) can be found in the output folder

In [None]:
# Share of messages that carry an attachment (image, video, gif) versus
# messages without one; links are not attachments and count as text
has_attachment = df['attachments'] != ''
share = [int(has_attachment.sum()), int((~has_attachment).sum())]
labels = ['Media message', 'Text message']
pie_options = dict(explode=(0.025, 0), shadow=True, startangle=90, colors=COLORS)
patches, texts, _ = plt.pie(share, autopct=make_autopct(share), **pie_options)
plt.title('Type of messages sent')
plt.axis('equal')
plt.legend(patches, labels, loc='best')
# plt.savefig('output/individual_messages_media.svg')

In [None]:
# Share of messages that received at least one reaction versus messages
# without any reaction
has_reaction = df['reactions'] != ''
share = [int(has_reaction.sum()), int((~has_reaction).sum())]
labels = ['Messages with at least one reaction', 'Messages with no reaction']
pie_options = dict(explode=(0.025, 0), shadow=True, startangle=90, colors=COLORS)
patches, texts, _ = plt.pie(share, autopct=make_autopct(share), **pie_options)
plt.title('Number of messages with and without at least one reaction')
plt.axis('equal')
plt.legend(patches, labels, loc="best")
# plt.savefig('output/individual_messages_reaction.svg')

In [None]:
# One row per word: split every message on single spaces and give each
# resulting token its own row (the other columns are repeated per token)
tokens = df['content'].str.split(' ')
df_word = df.assign(content=tokens).explode('content').reset_index(drop=True)

# Discard empty tokens (attachment-only messages, repeated spaces) and
# tokens that consist of a single comma
df_word = df_word[~df_word['content'].isin(['', ','])]

In [None]:
# Word frequencies for NAME_MYSELF: one row per distinct word, most used
# first. After the count every remaining column holds the same number, so
# only 'time' is kept as the frequency column.
columns = ['author', 'attachments', 'reactions', 'date', 'month_year', 'weekday', 'hour']
df_word_me = (
    df_word[df_word['author'] == NAME_MYSELF]
    .groupby('content')
    .count()
    .sort_values(by=['author'], ascending=False)
    .reset_index()
)
df_word_me['person'] = NAME_MYSELF
df_word_me = df_word_me.drop(columns=columns)
df_word_me.head()

In [None]:
# Word frequencies for NAME_OTHER: one row per distinct word, most used
# first. The helper columns listed in `columns` (defined together with the
# frequencies for NAME_MYSELF) are dropped, leaving 'time' as the frequency.
df_word_them = (
    df_word[df_word['author'] == NAME_OTHER]
    .groupby('content')
    .count()
    .sort_values(by=['author'], ascending=False)
    .reset_index()
)
df_word_them['person'] = NAME_OTHER
df_word_them = df_word_them.drop(columns=columns)
df_word_them.head()

In [None]:
# Words used by both people: time_x is the count for NAME_MYSELF and time_y
# the count for NAME_OTHER
result = pd.merge(df_word_me, df_word_them, on='content')

# Compute the total number of occurrences (vectorised instead of row by row)
result['total'] = result['time_x'] + result['time_y']

# Keep the 1000 most common words, sorted on the number of times the word
# has been used (maybe a bit unfair since I talk more than them).
# The index is renumbered so that label i is the word ranked i.
result = result.sort_values(by=['total'], ascending=False).head(1000).reset_index(drop=True)
result.head()

In [None]:
# Absolute word usage: NAME_OTHER's count on the x-axis against
# NAME_MYSELF's count on the y-axis, one point per word in result
g, ax = plt.subplots(figsize=(20,10))
ax.scatter(x=result['time_y'], y=result['time_x'], alpha=0.5, color=COLOR2)
# Guide line from the lower-left to the upper-right corner of the axes; it
# coincides with y = x only when both axes share the same limits
diag_line = ax.plot(ax.get_xlim(), ax.get_ylim(), ls='--', color=COLOR1)

# Label the first 100 rows of result. The rows are taken by position:
# looking them up as result[col][i] would select by index label, which after
# sorting no longer matches the rank and may not exist at all. head() also
# copes with fewer than 100 rows.
labelled = result.head(100)
texts = [ax.text(x, y, word, ha='center', va='center')
         for x, y, word in zip(labelled['time_y'], labelled['time_x'], labelled['content'])]
adjust_text(texts)

plt.xlabel(NAME_OTHER)
plt.ylabel(NAME_MYSELF)
plt.title('Word usage by absolute frequency of use')
# plt.savefig('output/word_usage_frequency_absolute.svg')

In [None]:
# Remove some of the most used words in order to get a better visual on the words less commonly used
result = result[(result['time_x'] < 4000) & (result['time_y'] < 4000)]
# drop=True keeps the old labels out of the columns, so the cell can be run
# again without failing on an already existing 'index' column
result = result.reset_index(drop=True)

g, ax = plt.subplots(figsize=(20,10))
ax.scatter(x=result['time_y'], y=result['time_x'], alpha=0.5, color=COLOR2)
# Guide line from the lower-left to the upper-right corner of the axes; it
# coincides with y = x only when both axes share the same limits
diag_line = ax.plot(ax.get_xlim(), ax.get_ylim(), ls='--', color=COLOR1)

# Label at most the first 150 rows; head() copes with frames that hold
# fewer rows, where a fixed range(0, 150) would raise a KeyError
labelled = result.head(150)
texts = [ax.text(x, y, word, ha='center', va='center')
         for x, y, word in zip(labelled['time_y'], labelled['time_x'], labelled['content'])]
adjust_text(texts)

plt.xlabel(NAME_OTHER)
plt.ylabel(NAME_MYSELF)
plt.title('Word usage by absolute frequency (<4000 occurrences) of use')
# NOTE: unlike the other cells this one does write the figure; savefig
# needs the output/ directory to exist
plt.savefig('output/word_usage_frequency_absolute_less_4000.svg')

In [None]:
# Rebuild the merged word table from scratch: time_x is the count for
# NAME_MYSELF and time_y the count for NAME_OTHER
result = pd.merge(df_word_me, df_word_them, on='content')
# Compute the total number of occurrences (vectorised instead of row by row)
result['total'] = result['time_x'] + result['time_y']
# Keep the 1000 most common words, sorted on the number of times the word
# has been used (maybe a bit unfair since I talk more than them).
# The index is renumbered so that label i is the word ranked i.
result = result.sort_values(by=['total'], ascending=False).head(1000).reset_index(drop=True)
result.head()

In [None]:
# Absolute word usage: NAME_OTHER's count on the x-axis against
# NAME_MYSELF's count on the y-axis, one point per word in result.
# NOTE(review): the title mentions '<1000 occurrences', but no such filter is
# applied to result before plotting — confirm whether a filter step is missing.
g, ax = plt.subplots(figsize=(20,10))
ax.scatter(x=result['time_y'], y=result['time_x'], alpha=0.5, color=COLOR2)
# Guide line from the lower-left to the upper-right corner of the axes; it
# coincides with y = x only when both axes share the same limits
diag_line = ax.plot(ax.get_xlim(), ax.get_ylim(), ls='--', color=COLOR1)

# Label the first 100 rows of result. The rows are taken by position:
# looking them up as result[col][i] would select by index label, which after
# sorting no longer matches the rank and may not exist at all. head() also
# copes with fewer than 100 rows.
labelled = result.head(100)
texts = [ax.text(x, y, word, ha='center', va='center')
         for x, y, word in zip(labelled['time_y'], labelled['time_x'], labelled['content'])]
adjust_text(texts)

plt.xlabel(NAME_OTHER)
plt.ylabel(NAME_MYSELF)
plt.title('Word usage by absolute frequency (<1000 occurrences) of use')
# plt.savefig('output/word_usage_frequency_less_1000.svg')

In [None]:
# Words that NAME_MYSELF used fewer than x_bound times while NAME_OTHER used
# them more than y_bound times
x_bound = 20
y_bound = 200

# Taken from the complete merged word list, not only from the top 1000 words
merged = pd.merge(df_word_me, df_word_them, on='content')
rare_for_me = merged['time_x'] < x_bound
frequent_for_them = merged['time_y'] > y_bound
result = merged[rare_for_me & frequent_for_them].reset_index()

g, ax = plt.subplots(figsize=(20,10))
ax.scatter(x=result['time_y'], y=result['time_x'], alpha=0.5, color=COLOR2)
diag_line = ax.plot(ax.get_xlim(), ax.get_ylim(), ls='--', color=COLOR1)

# Every remaining word gets a label
texts = [ax.text(x, y, word, ha='center', va='center')
         for x, y, word in zip(result['time_y'], result['time_x'], result['content'])]
adjust_text(texts)

plt.title('Word usage by absolute frequency (less than {0} times by {1}, more than {2} times by {3}) of use'.format(x_bound, NAME_MYSELF, y_bound, NAME_OTHER))
plt.ylabel(NAME_MYSELF)
plt.xlabel(NAME_OTHER)
# plt.savefig('output/word_usage_frequency_absolute_less_20_more_200.svg')

In [None]:
# Words used by both people: time_x is the count for NAME_MYSELF and time_y
# the count for NAME_OTHER
result = pd.merge(df_word_me, df_word_them, on='content')

# Compute the total number of occurrences (vectorised instead of row by row)
result['total'] = result['time_x'] + result['time_y']

# Keep the 1000 most common words, sorted on the number of times the word
# has been used (maybe a bit unfair since I talk more than them).
# .copy() makes the selection an independent frame, so the columns added
# below are not written to a slice of the merged table.
result = result.sort_values(by=['total'], ascending=False).head(1000).copy()

# More interesting to compare it (times wise): how many times more often one
# person used the word than the other
# Tried percentage, but you only get a diagonal line then, not as interesting
result['times_self'] = result['time_x'] / result['time_y']
result['times_other'] = result['time_y'] / result['time_x']

# Share of the word's total usage per person, in percent
result['perc_self'] = result['time_x'] / result['total'] * 100
result['perc_other'] = result['time_y'] / result['total'] * 100

# 'total' and the sort order are already in place from above, so they are
# not computed a second time

result.describe()

In [None]:
# Relative usage of the first 100 words in result: how many times more often
# each person used a word than the other; marker size follows 'total'
result_copy = result.head(100).reset_index()

g, ax = plt.subplots(figsize=(25,15))
sns.scatterplot(data=result_copy, x='times_other', y='times_self', size='total', color=COLOR2)
diag_line = ax.plot(ax.get_xlim(), ax.get_ylim(), ls='--', color=COLOR1)

# Put the word next to each point and let adjust_text untangle the labels
texts = [ax.text(x, y, word)
         for x, y, word in zip(result_copy['times_other'], result_copy['times_self'], result_copy['content'])]
adjust_text(texts)

plt.title('Word usage by relative frequency (times) of use')
plt.ylabel(NAME_MYSELF)
plt.xlabel(NAME_OTHER)
# plt.savefig('output/word_usage_frequency_times.svg')

In [None]:
# Zoomed-in version of the relative usage plot: of the first 100 words in
# result, keep those with times_self below 4.5 and times_other below 10
top_words = result.head(100)
in_view = (top_words['times_self'] < 4.5) & (top_words['times_other'] < 10)
result_copy = top_words[in_view].reset_index()

g, ax = plt.subplots(figsize=(20,15))
sns.scatterplot(data=result_copy, x='times_other', y='times_self', size='total', color=COLOR2)
diag_line = ax.plot(ax.get_xlim(), ax.get_ylim(), ls='--', color=COLOR1)

# Put the word next to each point and let adjust_text untangle the labels
texts = [ax.text(x, y, word)
         for x, y, word in zip(result_copy['times_other'], result_copy['times_self'], result_copy['content'])]
adjust_text(texts)

plt.title('Word usage by relative frequency of use (<5 times)')
plt.ylabel(NAME_MYSELF)
plt.xlabel(NAME_OTHER)
# plt.savefig('output/word_usage_frequency_times_less_5.svg')

In [None]:
# Share of each word's usage per person (in percent) for the first 100 words
# in result; marker size follows 'total'
result_copy = result.head(100).reset_index()

g, ax = plt.subplots(figsize=(10,10))
sns.scatterplot(data=result_copy, x='perc_other', y='perc_self', size='total', sizes=(20, 200), color=COLOR2)
diag_line = ax.plot(ax.get_xlim(), ax.get_ylim(), ls='--', color=COLOR1)

# Put the word next to each point and let adjust_text untangle the labels
texts = [ax.text(x, y, word)
         for x, y, word in zip(result_copy['perc_other'], result_copy['perc_self'], result_copy['content'])]
adjust_text(texts)

plt.title('Word usage by frequency in percentages of use')
plt.ylabel(NAME_MYSELF)
plt.xlabel(NAME_OTHER)
# plt.savefig('output/word_usage_frequency_relative.svg')

In [None]:
# Zoomed-in version of the percentage plot: of the first 100 words in
# result, keep those where both shares are below 55 percent
top_words = result.head(100)
balanced = (top_words['perc_other'] < 55) & (top_words['perc_self'] < 55)
result_copy = top_words[balanced].reset_index()

g, ax = plt.subplots(figsize=(10,10))
sns.scatterplot(data=result_copy, x='perc_other', y='perc_self', size='total', sizes=(20, 200), color=COLOR2)
diag_line = ax.plot(ax.get_xlim(), ax.get_ylim(), ls='--', color=COLOR1)

# Put the word next to each point and let adjust_text untangle the labels
texts = [ax.text(x, y, word)
         for x, y, word in zip(result_copy['perc_other'], result_copy['perc_self'], result_copy['content'])]
adjust_text(texts)

plt.title('Word usage by frequency in percentages of use (<60%)')
plt.ylabel(NAME_MYSELF)
plt.xlabel(NAME_OTHER)
# plt.savefig('output/word_usage_frequency_relative_less_60%.svg')