### Let's create a visualisation for the group chat of a field hockey team :)

Import the used packages.

In [None]:
from pathlib import Path
from loguru import logger
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tomllib
import numpy as np
import openpyxl

Load the data in the same way as the notebook in the course folder: read the paths from `config.toml`.

In [None]:
# Resolve the project-level config file, one directory above this notebook.
configfile = Path("..", "config.toml").resolve()

# tomllib only accepts a binary file handle.
with open(configfile, mode="rb") as f:
    config = tomllib.load(f)

# Echo the parsed settings as the cell output.
config

In [None]:
# All data locations are derived from the project root plus entries in `config`.
root = Path("..").resolve()

# Folders for processed and raw data.
processed = root.joinpath(config["processed"])
raw = root.joinpath(config["raw"])

# Concrete input files: the current processed dataset and the player-role file.
datafile = processed.joinpath(config["current"])
role_file = raw.joinpath(config["player_file"])


In [None]:
# Load the preprocessed chat messages from the parquet file at `datafile`.
merged_df = pd.read_parquet(datafile)

# Only the last expression of a cell is rendered automatically, so the dtype
# overview is shown explicitly instead of being evaluated and discarded.
display(merged_df.dtypes)
merged_df.head()

Inladen van de Player Role json

In [None]:
# Read the player-role JSON from `role_file`, the path built from
# config["player_file"] in the path-setup cell. The name `player_file` is never
# defined in this notebook, so using it raised a NameError on a fresh kernel.
player_roles = pd.read_json(role_file, encoding="latin")
player_roles.head()

Merge dataframes

In [None]:
# Attach the player-role columns to every message by matching the message
# author against the `Author` column of the role table (default inner join).
merged_df = merged_df.merge(
    player_roles,
    left_on="author",
    right_on="Author",
)

# Preview the first 100 merged rows.
merged_df.head(100)

Toevoegen van specificatie of iemand een speler is of onderdeel van de staf. Bij de spelers voegen we toe waar ze staan.

In [None]:
# Author names that belong to staff members.
namen_staff = [
    "bellowing-tarsier",
    "carefree-kouprey",
    "frolicsome-whistling duck",
    "glittering-eland",
    "groovy-ostrich",
    "loony-penguin",
    "mirthful-louse",
    "motley-fox",
    "patchwork-gerbil",
    "rib-tickling-hamster",
    "roaring-cassowary",
    "sparkling-sand dollar",
    "vivacious-dogfish",
    "wacky-hummingbird",
]


def _function_for(author):
    """Return "Staff" when any staff name occurs inside `author`, else "Player"."""
    for staff_name in namen_staff:
        # Substring match: the staff name only has to appear somewhere in the author string.
        if staff_name in author:
            return "Staff"
    return "Player"


# Label every message with the function of its author.
merged_df["Function"] = merged_df["author"].apply(_function_for)
merged_df.head()

Aantal berichten per maand visualiseren.

In [None]:
# Bucket every message into its calendar month and count the rows per bucket.
month_periods = merged_df["timestamp"].dt.to_period("M")
messages_per_month = merged_df.groupby(month_periods).size()

# Wide line chart of the monthly message volume.
fig, ax = plt.subplots(figsize=(20, 6))
messages_per_month.plot.line(ax=ax)
ax.set_title("Aantal berichten per maand")
ax.set_xlabel("Maand")
ax.set_ylabel("Aantal berichten")
plt.show()

Voeg verschillende kolommen toe aan de data, met name kolommen gerelateerd aan datetime, maar ook een kolom die kijkt of er media is gestuurd met het bericht.

In [None]:
# Derive calendar features from the message timestamp.
timestamps = merged_df["timestamp"].dt
merged_df["day_of_month"] = timestamps.day
merged_df["day"] = timestamps.day_name()
merged_df["month_number"] = timestamps.month
merged_df["month_name"] = timestamps.month_name()
merged_df["year"] = timestamps.year

# Flag messages that carry the media placeholder text.
# regex=False: the marker is a literal string, not a pattern.
# na=False: a missing message counts as "no media" instead of producing NaN,
# which would make the integer cast below fail.
merged_df["has_image"] = (
    merged_df["message"]
    .str.contains("<Media weggelaten>", regex=False, na=False)
    .astype(int)
)
merged_df.head()

Maak een visualisatie van de gemiddelde lengte van de berichten, afhankelijk van de functie.

In [None]:
# Character length of every message.
merged_df["message_length"] = merged_df["message"].str.len()

# Number of non-null messages per function, used in the figure footnote.
is_player = merged_df["Function"] == "Player"
is_staff = merged_df["Function"] == "Staff"
player_message_count = merged_df.loc[is_player, "message"].count()
staff_message_count = merged_df.loc[is_staff, "message"].count()

# Mean message length per function, longest first.
p1 = (
    merged_df.groupby("Function")[["message_length"]]
    .mean()
    .sort_values("message_length", ascending=False)
)

# First bar (highest mean) is drawn in red, the second in light grey.
sns.barplot(x=p1.index, y=p1["message_length"], palette=["red", "lightgrey"])

# Write each mean just below the top edge of its bar.
for bar_position, mean_length in enumerate(p1["message_length"]):
    plt.text(
        bar_position,
        mean_length * 0.98,
        f"{mean_length:.1f}",
        ha="center",
        va="top",
        fontsize=12,
    )

plt.xlabel("Function within team")
plt.ylabel("Average Message length")
plt.title("Staff members sending longer messages")

# Footnote with the sample sizes, using "." as the thousands separator.
players_text = f"{player_message_count:,}".replace(",", ".")
staff_text = f"{staff_message_count:,}".replace(",", ".")
footnote = (
    f"Gebaseerd op {players_text} berichten van de players en "
    f"{staff_text} berichten van de staff."
)
plt.figtext(
    0.05,
    -0.05,
    footnote,
    ha="left",
    va="center",
    fontsize=8,
    fontstyle="italic",
)

# tight_layout runs first; the bottom margin is then set explicitly.
plt.tight_layout()
plt.subplots_adjust(bottom=0.1)

Show the number of messages and the share of messages containing media, per function.

In [None]:
# Per function: how many messages were sent, how many of them were flagged in
# `has_image`, and which percentage of the messages that represents.
no_of_messages = (
    merged_df.groupby("Function")
    .agg(
        no_of_messages=("message", "count"),
        no_of_images=("has_image", "sum"),
    )
    .sort_values("Function", ascending=True)
    .assign(
        percentage_with_images=lambda counts: (
            counts["no_of_images"] / counts["no_of_messages"] * 100
        )
    )
)

no_of_messages

Count the number of messages per day and group them.

In [None]:
# Messages per (month, day of month) over all years: one row per month,
# one column per day of the month, 0 where no message was sent.
month_day_count = (
    merged_df.groupby(["month_name", "month_number", "day_of_month"])
    .size()
    .unstack(fill_value=0)
)

# Same table, additionally split by year.
year_month_day_count = (
    merged_df.groupby(["year", "month_name", "month_number", "day_of_month"])
    .size()
    .unstack(fill_value=0)
)

# Only the last expression of a cell is rendered automatically, so the first
# preview is displayed explicitly instead of being evaluated and discarded.
display(month_day_count.head())
year_month_day_count.head()

In [None]:
# Draw one heatmap of message counts (month x day of month) per year.
years = year_month_day_count.index.get_level_values("year").unique()

for year in years:
    # The grouped index is ordered alphabetically by month name (April, August, ...).
    # Sort on the month number to get calendar order, then keep only the month
    # name so it is used as the row label.
    data_for_year = (
        year_month_day_count.loc[year]
        .sort_index(level="month_number")
        .droplevel("month_number")
    )

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(data_for_year, cmap="YlGnBu", annot=True, fmt="d", ax=ax)
    ax.set_title(f"Heatmap van tellingen voor jaar {year}")
    ax.set_xlabel("Dag van de maand")
    ax.set_ylabel("Maand")
    plt.show()
    # Release the figure so a long range of years does not accumulate open figures.
    plt.close(fig)

In [None]:
# Heatmap of message counts per day of the month (columns) and month (rows).
#
# The rows of `month_day_count` are ordered alphabetically by month name
# (April, August, ...), so hard-coding January..December as tick labels would
# put the wrong month next to each row. Sort on the month number instead and
# let the month name from the index label each row.
heatmap_data = (
    month_day_count
    .sort_index(level="month_number")
    .droplevel("month_number")
)

fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt="d",
    linewidths=0.5,
    cmap="GnBu",
    xticklabels=True,  # label every day column, taken from the data itself
    yticklabels=True,  # label every month row, taken from the data itself
    ax=ax,
)
# Keep the month names horizontal for readability.
ax.tick_params(axis="y", labelrotation=0)

ax.set_title("Heatmap van verzonden berichten per dag per maand")
ax.set_xlabel("Dag van de maand")
ax.set_ylabel("Maand")
plt.show()