### Let's create a visualisation for the group chat of a field hockey team :)

Import the required packages.

In [None]:
from pathlib import Path
from loguru import logger
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tomllib
import numpy as np
import openpyxl

Load the data in the same way as the notebook in the course folder.

In [None]:
# Resolve the TOML configuration that sits one level above the working
# directory and parse it into a dict.
configfile = Path("../config.toml").resolve()
with open(configfile, mode="rb") as toml_stream:  # tomllib.load requires a binary handle
    config = tomllib.load(toml_stream)
# Bare expression so the notebook echoes the parsed settings.
config

In [None]:
# Build the data locations from the config entries, anchored at the parent of
# the working directory.
root = Path("..").resolve()
processed = root.joinpath(config["processed"])
raw = root.joinpath(config["raw"])
# Parquet file with the chat messages, located inside the processed folder.
wife_file = processed.joinpath(config["wife_file"])


In [None]:
# Load the chat messages and add a 0/1 `has_image` column.
wife_df = pd.read_parquet(wife_file)

# A message counts as an image when its text contains this marker
# (presumably the start of WhatsApp's Dutch "<Media weggelaten>" placeholder
# -- confirm against the raw export). Missing messages are treated as empty.
media_marker = "<Media weggela"
message_text = wife_df["message"].fillna("")
wife_df["has_image"] = message_text.str.contains(media_marker).astype(int)

# Preview the first rows in the notebook output.
wife_df.head()

Visualise, per month, the share of messages that contain a photo.

In [None]:
# Line chart of the monthly share of messages that contain an image, with two
# family events annotated on the line and every November shaded.

# Not used by the plot below; kept because other cells may rely on these names.
filtered_emoji_df = wife_df[wife_df["has_emoji"] == True]
filtered_images_df = wife_df[wife_df["has_image"] == 1]

# Group once by calendar month; both means share the same grouping key.
month = wife_df["timestamp"].dt.to_period("M")
emojis_mean = wife_df.groupby(month)["has_emoji"].mean()
photo_mean = wife_df.groupby(month)["has_image"].mean()

plt.figure(figsize=(20, 6))
ax = plt.gca()

photo_mean.plot(kind="line", ax=ax)

# Events to mark on the line: (label, date). The y-position is the plotted
# value of the month the date falls in; a month missing from the data raises
# KeyError, as before.
events = (
    ("Huwelijk", pd.Timestamp("2024-06-07")),
    ("Geboorte Dochter", pd.Timestamp("2022-11-17")),
)
for label, highlight_date in events:
    highlight_value = photo_mean.loc[highlight_date.to_period("M")]
    ax.annotate(
        label,
        xy=(highlight_date, highlight_value),
        # Place the text above the data point; with xytext equal to xy the
        # arrow had zero length and the label sat on top of the line.
        xytext=(0, 40),
        textcoords="offset points",
        ha="center",
        arrowprops=dict(facecolor="black", shrink=0.05),
    )

# Shade every November grey. The line lives on a monthly period axis, where
# 1 November and 30 November resolve to the same x-position, so a band between
# those two dates has no width. Ending on 1 December gives it one month.
for year in wife_df["timestamp"].dt.year.unique():
    ax.axvspan(
        pd.Timestamp(f"{year}-11-01"),
        pd.Timestamp(f"{year}-12-01"),
        color="gray",
        alpha=0.3,
    )

# The series is the mean of the 0/1 image flag, i.e. a share of messages,
# not a message count -- label it as such.
plt.title("Aandeel berichten met foto per maand")
plt.xlabel("Maand")
plt.ylabel("Aandeel berichten met foto")
plt.show()

In [None]:
# Bar chart of the percentage of messages per quarter that contain an image.
# Despite its name, `photos_percentage_per_month` holds one value per quarter.
quarter_of_message = wife_df["timestamp"].dt.to_period("Q")
photos_percentage_per_month = wife_df.groupby(quarter_of_message)["has_image"].mean() * 100

# Date range of the chat and the average number of messages per day.
dates = wife_df["timestamp"].dt.date
min_date, max_date = dates.min(), dates.max()
number_of_days = (max_date - min_date).days
average = len(wife_df) / number_of_days

plt.figure(figsize=(20, 10))
ax = plt.gca()
# Show the y ticks as whole percentages.
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f"{value:.0f}%"))

# Quarters from this one onwards are highlighted in pink, earlier ones in grey.
# NOTE(review): the title below refers to the birth -- confirm 2022-Q3 is the
# intended first highlighted quarter.
highlight_from = pd.Period("2022-Q3", freq="Q")
colors = [
    "hotpink" if period >= highlight_from else "silver"
    for period in photos_percentage_per_month.index
]
photos_percentage_per_month.plot(kind="bar", ax=ax, color=colors)

# Write each bar's value just above its top edge.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(
        f"{height:.1f}%",
        (bar.get_x() + bar.get_width() / 2.0, height),
        ha="center",
        va="center",
        xytext=(0, 10),
        textcoords="offset points",
    )

plt.suptitle('" Photoboom "', size=30, fontweight="bold")
plt.title(
    "The percentage of photos increased drastically after the birth of our daughter",
    fontstyle="italic",
    pad=25,
)
plt.xlabel("")
plt.xticks(rotation=0)

# Caption: message count with "." as thousands separator, plus the date range.
date_format = "%d-%m-%Y"
message_count = f"{wife_df.shape[0]:,}".replace(",", ".")
caption = (
    f"Gebaseerd op {message_count}"
    f" berichten tussen {min_date.strftime(date_format)} en {max_date.strftime(date_format)}."
)
plt.figtext(0.1, 0.05, caption)
plt.show()

In [None]:
# Stacked bar chart: per quarter, the percentage of all messages that are
# images, split by the author who sent them.
quarter = wife_df["timestamp"].dt.to_period("Q")

# Each author's image messages as a percentage of ALL messages in the quarter.
# Stacking the per-author means instead (each with its own denominator) makes
# the bar totals meaningless -- they can exceed the quarter's real image share.
# With a common denominator the stack adds up to the overall quarterly share.
images_per_author = (
    wife_df.groupby([quarter, "author"])["has_image"].sum().unstack(fill_value=0)
)
messages_per_quarter = wife_df.groupby(quarter)["has_image"].size()
# Periods as index, one column per author.
photos_percentage_per_month = images_per_author.div(messages_per_quarter, axis=0) * 100

# Within-author image percentages per quarter. Not used by the chart below;
# kept because other cells may rely on these names.
sudsy_mole_df = wife_df[wife_df["author"] == "sudsy-mole"]
photos_percentage_per_man = sudsy_mole_df.groupby(sudsy_mole_df["timestamp"].dt.to_period("Q"))["has_image"].mean() * 100

kaleidoscopic_ferret_df = wife_df[wife_df["author"] == "kaleidoscopic-ferret"]
photos_percentage_per_wife = kaleidoscopic_ferret_df.groupby(kaleidoscopic_ferret_df["timestamp"].dt.to_period("Q"))["has_image"].mean() * 100

# Date range of the chat and the average number of messages per day.
dates = wife_df['timestamp'].dt.date
min_date = dates.min()
max_date = dates.max()
number_of_days = (max_date - min_date).days
# A chat confined to a single day spans 0 days; treat it as one day so the
# average does not divide by zero.
average = len(wife_df) / max(number_of_days, 1)

# Create the figure.
plt.figure(figsize=(20, 10))
ax = plt.gca()

# One stacked bar per quarter; the figure size is already set above.
photos_percentage_per_month.plot(kind="bar", stacked=True, ax=ax)

# Title and axis formatting.
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.0f}%'))
plt.suptitle('" Photoboom "', size=30, fontweight='bold')
plt.title('The percentage of photos increased drastically after the birth of our daughter', fontstyle='italic', pad=25)
plt.xlabel("")
plt.xticks(rotation=0)

# Caption at the bottom of the figure.
plt.figtext(0.1, 0.05, f'Gebaseerd op {wife_df.shape[0]:,}'.replace(',','.') + f' berichten tussen {min_date.strftime("%d-%m-%Y")} en {max_date.strftime("%d-%m-%Y")}.', fontsize=16)

# tight_layout does not account for figure-level text, so reserve a strip at
# the bottom to keep the axes from running into the caption.
plt.tight_layout(rect=(0, 0.08, 1, 1))
plt.show()

# Print the statistics.
print(number_of_days)
print(average)
