Start

In [None]:
import requests
from pathlib import Path
import pandas as pd
import seaborn as sns
from loguru import logger
import numpy as np


In [None]:
import tomllib

# Authors must have strictly more than this many messages to be kept.
min_messages = 50

# Read the project configuration from the directory above the working dir.
configfile = Path("../config.toml").resolve()
with configfile.open("rb") as f:
    config = tomllib.load(f)

# The data file is located via the "processed" and "current" config entries.
datafile = (Path("..") / Path(config["processed"]) / config["current"]).resolve()
if not datafile.exists():
    # Fail fast with an actionable message: reading a missing file below
    # would fail anyway, so a mere warning only hid the real cause.
    missing_msg = (
        "Datafile does not exist. First run src/preprocess.py, and check the timestamp!"
    )
    logger.error(missing_msg)
    raise FileNotFoundError(f"{missing_msg} (expected at {datafile})")
wa_df = pd.read_parquet(datafile)

# Drop every author whose message count is at or below the threshold.
wa_df = wa_df.groupby('author').filter(lambda x: len(x) > min_messages)

author_message_count = wa_df['author'].value_counts()
# print(author_message_count)
distinct_authors_count = wa_df['author'].nunique()
print(distinct_authors_count)

wa_df.head()

In [None]:
# Build the directories named in config.toml on top of the resolved parent
# of the current working directory.
root = Path("..").resolve()
processed = root / config["processed"]
raw = root / config["raw"]
datafile = processed / config["current"]

# Parse the JSON file named by the "role_file" config entry, decoding it
# with the "latin" codec.
role_file = raw / config["role_file"]
player_roles = pd.read_json(role_file, encoding="latin")
player_roles.head()

In [None]:
# player_roles is already loaded from role_file in the previous cell;
# parsing the same JSON file a second time was redundant, so only preview it.
player_roles.head()

In [None]:
# Join the columns of player_roles onto the messages by author name. The
# default inner join keeps only rows whose author appears in both frames.
# The duplicate "Author" key column from player_roles is dropped afterwards.
merged_df = wa_df.merge(
    player_roles,
    left_on="author",
    right_on="Author",
).drop(columns=["Author"])
merged_df.head()

In [None]:
# Collect the distinct author names in sorted order (np.unique sorts).
unique_authors = np.unique(wa_df["author"])
authors = list(unique_authors)
print(authors)

In [None]:
# Chunk size in characters, and the number of chunks an author must exceed
# to be included in the corpus.
n = 500
min_parts = 2

# Maps each retained author to a list of n-character slices of their
# space-joined messages; the final slice may be shorter than n.
corpus = {}
for author in authors:
    subset = wa_df.loc[wa_df["author"] == author].reset_index()
    longseq = " ".join(subset["message"])
    # Cut the joined text into consecutive n-sized parts.
    parts = [longseq[start : start + n] for start in range(0, len(longseq), n)]
    # Skip authors that do not have more than min_parts parts.
    if len(parts) <= min_parts:
        continue
    corpus[author] = parts
corpus.keys()

In [None]:
# import matplotlib.pyplot as plt

# text = [part for text in corpus.values() for part in text]
# wa_labels = [k for k, v in corpus.items() for _ in range(len(v))]
# # we set batch to false, because we already batched the data
# # clustering(text=text, k=200, labels=wa_labels, batch=False, method="tSNE")
# plt.legend(title="Position", bbox_to_anchor=(1.05, 1), loc="upper left")
# plt.xticks([])
# # plt.yticks([])
# plt.title("Distinct authors in the WhatsApp dataset")