Start

In [None]:
import requests
from pathlib import Path
import pandas as pd
import seaborn as sns
from loguru import logger
import numpy as np


In [None]:
import tomllib

# Load the project settings from the TOML file one directory above the notebook.
configfile = Path("../config.toml").resolve()
with configfile.open("rb") as config_handle:
    config = tomllib.load(config_handle)

# Absolute path of the preprocessed chat export named in the config.
processed_dir = Path("..") / config["processed"]
datafile = (processed_dir / config["current"]).resolve()
if not datafile.exists():
    # Only a hint for the reader: the read below is still attempted.
    logger.warning(
        "Datafile does not exist. First run src/preprocess.py, and check the timestamp!"
    )

wa_df = pd.read_parquet(datafile)
# Optional filter, currently disabled (keep only authors with more than 50 messages):
# wa_df = wa_df.groupby('author').filter(lambda x: len(x) > 50)

# Messages per author, and the number of distinct authors.
author_message_count = wa_df["author"].value_counts()
distinct_authors_count = wa_df["author"].nunique()
print(distinct_authors_count)

wa_df.head()

In [None]:
# Resolve the project root once and derive every data location from it.
root = Path("..").resolve()
processed, raw = (root / config[key] for key in ("processed", "raw"))

datafile = processed / config["current"]
role_file = raw / config["role_file"]

# The roles file is JSON, decoded with the "latin" codec.
player_roles = pd.read_json(role_file, encoding="latin")
player_roles.head()

In [None]:
# `player_roles` was already read from `role_file` in the preceding cell;
# reading the same file a second time is redundant, so only preview it here.
player_roles.head()

In [None]:
# Attach the role information to every message by matching the chat's
# `author` column to the roles table's `Author` column (pandas' default
# join type), then drop the duplicated key column.
merged_df = wa_df.merge(
    player_roles,
    left_on="author",
    right_on="Author",
).drop(columns=["Author"])
merged_df.head()

In [None]:
# Unique author names found in the chat data.
author_column = wa_df["author"]
authors = list(np.unique(author_column))
print(authors)

In [None]:
# Unique values of the `Position` column after merging in the roles table.
position_column = merged_df["Position"]
positions = list(np.unique(position_column))
print(positions)

In [None]:
n = 500  # number of characters per chunk
min_parts = 2  # an author needs MORE than this many chunks to be included

corpus = {}
for author in authors:
    # Concatenate all of this author's messages into one long string
    author_text = " ".join(wa_df.loc[wa_df["author"] == author, "message"])
    # Cut the string into consecutive pieces of at most n characters
    chunks = [author_text[start : start + n] for start in range(0, len(author_text), n)]
    # Skip authors that do not have more than min_parts chunks
    if len(chunks) <= min_parts:
        continue
    corpus[author] = chunks
corpus.keys()

In [None]:
import matplotlib.pyplot as plt
from wa_analysis.data_analysis.model import TextClustering


# Flatten the per-author chunks into one list, with a parallel list that
# repeats each author's name once per chunk.
text = [chunk for chunks in corpus.values() for chunk in chunks]
wa_labels = [author for author, chunks in corpus.items() for _ in chunks]

# batch=False: the text was already split into chunks when `corpus` was built
clustering = TextClustering()
clustering(text=text, k=200, labels=wa_labels, batch=False, method="tSNE")

# Legend placed outside the axes on the right; y ticks hidden
plt.legend(title="author", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.yticks([])
plt.title("Distinct authors in the WhatsApp dataset")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.patches as patches
from wa_analysis.data_analysis.model import TextClustering

# Flatten the per-author chunks, with one label (author name) per chunk.
text = [chunk for chunks in corpus.values() for chunk in chunks]
wa_labels = [author for author, chunks in corpus.items() for _ in chunks]

# Custom palette: every author in silver, except 'motley-fox' in red.
unique_labels = list(set(wa_labels))
custom_palette = dict.fromkeys(unique_labels, 'silver')
if 'motley-fox' in custom_palette:
    custom_palette['motley-fox'] = 'red'

clustering = TextClustering()

# Replacement for the plot method of the clustering object
def plot(self, X: np.ndarray, labels: list) -> None:
    """Draw a scatter plot of the first two columns of ``X``, coloured by label.

    Args:
        self: The object this function is bound to (not used in the body).
        X: Array whose column 0 is used as x and column 1 as y.
        labels: One label per row of ``X``; passed to seaborn as ``hue``.

    Colours are looked up in the notebook-level ``custom_palette`` mapping.
    """
    xs, ys = X[:, 0], X[:, 1]
    sns.scatterplot(
        x=xs,
        y=ys,
        hue=labels,
        palette=custom_palette,
        edgecolor='white',
        linewidth=0.8,
    )

# Bind the custom `plot` function to this instance, replacing its plot method
# (presumably the method the call below uses for drawing — confirm in TextClustering).
clustering.plot = plot.__get__(clustering, TextClustering)
# batch=False because the data was already batched into chunks
clustering(text=text, k=200, labels=wa_labels, batch=False, method="tSNE")

ax = plt.gca()

# Restyle the marker outlines of the first collection on the current axes
scatter = ax.collections[0]
scatter.set_edgecolor('white')
scatter.set_linewidth(0.5)

# Outliers: dashed box spanning x in [-33, -6], y in [-40, -20], with a text
# label placed just above its top-left corner.
# NOTE(review): these coordinates are hard-coded against one particular
# embedding; verify they still match the plot after re-running.
rect = patches.Rectangle((-33, -40), 27, 20, linewidth=2, edgecolor='silver', facecolor='none', linestyle='--')
ax.add_patch(rect)
plt.annotate('Voornamelijk inhoud over het verzamelen', xy=(0, -35), xytext=(-33, -19),
             fontsize=9, color='silver')

# Legend outside the axes on the right, with a bold title
plt.legend(title="Auteur", bbox_to_anchor=(1.05, 1), loc="upper left",
           title_fontproperties={'weight': 'bold'})

plt.suptitle("De coach communiceert duidelijk anders dan de rest", fontsize = 16, fontweight = "bold")
plt.title("en bemoeit zich al helemaal niet met het verzamelen...")
plt.figtext(0.0, -0.05, "Gebaseerd op de top 10 bijdragers aan de WhatsApp groepchat (op basis van aantal berichten).\n Bij het verzamelen wordt meestal één bericht gemaakt, waar de spelers hun eigen naam in zetten en versturen...", wrap=True, horizontalalignment='left', fontsize=10)

# Save BEFORE showing: in a notebook, plt.show() finishes the current figure,
# so a savefig() call placed after it writes an empty image.
# bbox_inches="tight" keeps the legend (outside the axes) and the caption
# (placed at y = -0.05, below the figure) inside the saved file.
plt.savefig("clustering.png", bbox_inches="tight")
plt.show()

In [None]:
import pandas as pd

# Plotted (x, y) position of every point in the scatter collection.
# Assumes the points are in the same order as `text` and `wa_labels` — confirm.
scatter_data = scatter.get_offsets()
x_coords = scatter_data[:, 0]
y_coords = scatter_data[:, 1]

# Indices of the points that fall inside the outlier box
# (x in [-33, -6], y in [-40, -20], the same area as the drawn rectangle)
outlier_indices = [
    idx
    for idx, (px, py) in enumerate(zip(x_coords, y_coords))
    if -33 <= px <= -6 and -40 <= py <= -20
]

# Look up the text chunk and author belonging to each selected point
outlier_messages = [text[idx] for idx in outlier_indices]
outlier_authors = [wa_labels[idx] for idx in outlier_indices]

# Collect everything in one table and write it to CSV
outlier_df = pd.DataFrame(
    {
        'auteur': outlier_authors,
        'bericht': outlier_messages,
        'x': [x_coords[idx] for idx in outlier_indices],
        'y': [y_coords[idx] for idx in outlier_indices],
    }
)
outlier_df.to_csv('outlier_berichten.csv', index=False)
print("Outlier berichten opgeslagen in outlier_berichten.csv")