### Let's create a visualisation for the group chat of a field hockey team :)

Import the required packages.

In [None]:
from pathlib import Path
from loguru import logger
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tomllib
import numpy as np
import openpyxl
import re


Load the data the same way as the notebook in the course folder does.

In [None]:
# Locate config.toml one directory above the working directory and parse it.
configfile = Path("..", "config.toml").resolve()
with open(configfile, mode="rb") as f:  # tomllib.load needs a binary file handle
    config = tomllib.load(f)
config  # bare expression: displayed as the cell output when run in a notebook

In [None]:
# Build the data locations from the config entries, anchored at the parent
# directory of the working directory.
root = Path("..").resolve()
processed = root.joinpath(config["processed"])
raw = root.joinpath(config["raw"])
# Parquet file with the chat messages and the JSON file with the player roles.
datafile = processed.joinpath(config["current"])
role_file = raw.joinpath(config["role_file"])


In [None]:
# Load the processed chat messages (the parquet file at `datafile`) into a DataFrame.
merged_df = pd.read_parquet(datafile)

# Character class with the Unicode ranges that this notebook treats as emoji.
# The trailing "+" makes every match a *run* of consecutive emoji characters.
# NOTE(review): the last range (U+24C2-U+1F251) is very broad and also covers
# non-emoji characters such as CJK ideographs; fine as long as the chat does
# not contain such text -- confirm.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)

# Code points inside the ranges above that only modify the preceding emoji
# (variation selector-16 and the five skin-tone modifiers). They must not be
# counted as emojis of their own.
_EMOJI_MODIFIERS = frozenset(
    "\uFE0F\U0001F3FB\U0001F3FC\U0001F3FD\U0001F3FE\U0001F3FF"
)


def count_emojis(text):
    """Return the number of emoji characters in *text*.

    Every character matched by ``emoji_pattern`` counts once, so three
    consecutive emojis give 3 (counting the regex matches directly would give
    1, because each match is a whole run). Variation selectors and skin-tone
    modifiers are skipped so that e.g. a red heart or a thumbs-up with a skin
    tone still counts as a single emoji.

    Args:
        text: The message text. Anything that is not a ``str`` (for example
            ``None`` or ``NaN`` for a missing message) is treated as empty.

    Returns:
        The emoji count as an ``int``; 0 when there are none.
    """
    if not isinstance(text, str):
        return 0
    return sum(
        1
        for run in emoji_pattern.findall(text)
        for char in run
        if char not in _EMOJI_MODIFIERS
    )

# Per-message features: number of emojis and length of the text in characters.
merged_df["emoji_count"] = merged_df["message"].apply(count_emojis)
merged_df["message_length"] = merged_df["message"].str.len()

# Author and timestamp of the row directly above; the first row has no
# predecessor and therefore gets a missing value.
# NOTE(review): this assumes the rows are already in chronological order -- confirm.
merged_df["prev_author"] = merged_df["author"].shift(1)
merged_df["prev_timestamp"] = merged_df["timestamp"].shift(1)

# Time elapsed since the previous message. The difference is computed once and
# the minutes are derived from the seconds instead of repeating the subtraction.
merged_df["time_since_prev_sec"] = (
    merged_df["timestamp"] - merged_df["prev_timestamp"]
).dt.total_seconds()
merged_df["time_since_prev_min"] = merged_df["time_since_prev_sec"] / 60

# Show which columns are available now. (The former bare `merged_df.head()` in
# the middle of the cell produced no output and has been dropped.)
print(merged_df.columns)

In [None]:
# author_matrix = merged_df[merged_df['author'] != merged_df['prev_author']].copy()
# author_matrix = merged_df.pivot_table(index='author', columns='prev_author', values='message', aggfunc='count', fill_value=0)

# plt.figure(figsize=(10, 8))
# sns.heatmap(author_matrix, annot=True, fmt="d", cmap="YlGnBu")
# plt.title("Number of Messages from One Author to Previous Author")
# plt.xlabel("Previous Author")
# plt.ylabel("Author")
# plt.show()

In [None]:
# Read the role file (JSON) into a DataFrame.
# NOTE(review): the file is decoded with the "latin" (Latin-1) codec, while JSON
# is normally UTF-8 -- confirm this matches the file's real encoding.
player_roles = pd.read_json(role_file, encoding = "latin")
# Preview the first rows (displayed as the cell output when run in a notebook).
player_roles.head()

In [None]:
# Attach the role information from `player_roles` to every message.
# A left join keeps every message row, in its original order, even when the
# author is missing from the role file (those rows get NaN role columns).
# The default inner join would silently drop such messages, after which the
# shift-based columns (`prev_author`, `time_since_prev_*`, `prev_position`)
# would no longer describe the same neighbouring rows.
merged_df = pd.merge(
    merged_df,
    player_roles,
    left_on="author",
    right_on="Author",
    how="left",
)
# The join key coming from the role table is redundant next to `author`.
merged_df = merged_df.drop(columns=["Author"])
# Position of the row directly above (missing for the first row).
merged_df["prev_position"] = merged_df["Position"].shift(1)
print(merged_df.columns)

In [None]:
# Use the full merged frame; `df` is an alias, not a copy.
# Alternative kept for reference (only rows where the author changes):
# df = merged_df[merged_df['author'] != merged_df['prev_author']].copy()
df = merged_df

# One row per (author, Position): mean message length, mean gap to the
# previous message, share of messages with an emoji, and number of messages.
# NOTE(review): `has_emoji` is not created in the visible cells; presumably it
# is already a column of the parquet file -- confirm.
p = df.groupby(["author", "Position"]).agg(
    {
        "message_length": "mean",
        "time_since_prev_min": "mean",
        "has_emoji": "mean",
        "author": "count",
    }
)
p = p.rename(columns={"author": "count"})

# Keep only groups with more than 10 messages.
p = p.loc[p["count"] > 10]

sns.scatterplot(
    data=p,
    x="message_length",
    y="has_emoji",
    hue="Position",
    alpha=0.5,
)
print(p.columns)

In [None]:
# Scatter plot of the per-author statistics: marker size encodes the message
# count, colour encodes the Position.
scatter_plot = sns.scatterplot(
    data=p,
    x="message_length",
    y="has_emoji",
    hue="Position",
    palette="bright",
    size="count",
    sizes=(10, 500),
    alpha=0.3,
)

# Move the legend outside the plot area, to the right of the axes.
scatter_plot.legend(title="Position", loc='center left', bbox_to_anchor=(1, 0.5))
print(p.columns)

In [None]:
# Reset the index so that author and Position become ordinary columns.
p = p.reset_index()

# Scatter plot: marker size encodes the message count, colour the Position.
scatter_plot = sns.scatterplot(
    data=p,
    x="message_length",
    y="has_emoji",
    size="count",
    sizes=(10, 500),
    alpha=0.5,
    hue="Position",
    palette="Dark2",  # more clearly distinguishable colours
)

# Remove the default legend (it mixes the Position and count entries).
scatter_plot.legend_.remove()

# Build a new legend that shows only the Position entries.
# seaborn lists the hue entries first (after a "Position" subtitle entry) and
# the size entries after them, so the Position handles are the ones directly
# following that subtitle. Slicing from the end of the list would instead pick
# the `count` size markers.
handles, labels = scatter_plot.get_legend_handles_labels()
num_positions = len(p["Position"].unique())
# Without a subtitle entry the hue levels start at the beginning of the list.
first_position = labels.index("Position") + 1 if "Position" in labels else 0
position_handles = handles[first_position:first_position + num_positions]
position_labels = labels[first_position:first_position + num_positions]

# Add the new legend outside the plot area, to the right of the axes.
scatter_plot.legend(
    position_handles,
    position_labels,
    loc='center left',
    bbox_to_anchor=(1, 0.5),
    title="Position",
)