In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import silhouette_score
from feature_selectors import *

In [None]:
# Load the four semicolon-delimited source tables.
# The paths use forward slashes, which resolve on Windows as well as on
# Linux/macOS; the backslash-separated form only works on Windows.
DATA_DIR = '../../../data'

gebruikers_df = pd.read_csv(f'{DATA_DIR}/gebruikersinformatie.csv', delimiter = ';')
evenement_df = pd.read_csv(f'{DATA_DIR}/evenementinformatie.csv', delimiter = ';')
aanwezigheid_df = pd.read_csv(f'{DATA_DIR}/evenementaanwezigheid.csv', delimiter = ';')
bericht_df = pd.read_csv(f'{DATA_DIR}/berichtinteracties_met_sentiment.csv', delimiter = ';')

In [None]:
# Build the working frame `df` from the attendance, event and user tables.
# NOTE(review): merge_on_aanwezigheid is not defined in this notebook; presumably
# it comes from the star import of feature_selectors -- confirm.
# The commented-out lines are alternative merge variants; exactly one assignment
# to `df` should be active.
# df = merge_on_all(bericht_df, evenement_df, gebruikers_df, aanwezigheid_df)
df = merge_on_aanwezigheid(aanwezigheid_df, evenement_df, gebruikers_df)
# df = merge_on_bericht(bericht_df, evenement_df, gebruikers_df)

# Display the merged frame as the cell output
df

In [None]:
# Remove invalid entries.
# dropna() discards every row that has a missing value in any column -- the same
# rows the previous per-column isna() filter loop removed, but in a single pass.
df = df.dropna()

# Drop rows whose Gebruiker_Lidmaatschapstype equals 'x'.
# The explicit .copy() makes `df` an independent frame, so the column assignments
# in later cells do not emit SettingWithCopyWarning on pandas versions without
# copy-on-write.
df = df[df['Gebruiker_Lidmaatschapstype'] != 'x'].copy()
df

In [None]:
# Turn the string identifiers into integers: drop the first character of every
# value and cast what remains to int. Columns missing from df are skipped.
to_convert = ["BerichtID", "EvenementID", "GebruikerID", "Evenement_OrganisatorID"]
present_id_columns = [name for name in to_convert if name in df.columns]

for col in present_id_columns:
    without_prefix = df[col].str[1:]
    df[col] = without_prefix.astype(int)

df

In [None]:
# Express each date column as a float number of days elapsed since the earliest
# date found in that same column. Columns missing from df are skipped.
date_columns = ['BerichtDatum', 'Evenement_EvenementDatum', 'Gebruiker_RegistratieDatum', 'Gebruiker_LaatsteLogin']

for col in date_columns:
    if col not in df.columns:
        continue

    # Dates are parsed day-first (e.g. 31-12-2024)
    df[col] = pd.to_datetime(df[col], dayfirst = True)

    elapsed = df[col] - df[col].min()
    df[col] = elapsed.dt.total_seconds() / (24 * 3600)

df

In [None]:
# Encode the study year as an ordinal number through an explicit lookup table
# (values that are not in the table become NaN)
studiejaar_codes = {
    "eerstejaars": 1,
    "tweedejaars": 2,
    "derdejaars": 3,
    "vierdejaars": 4
}
df["Gebruiker_Studiejaar"] = df["Gebruiker_Studiejaar"].map(studiejaar_codes)

# Replace every remaining object (string) column by integer codes and print the
# code -> original value table so the encoding can be looked up afterwards.
# pd.factorize numbers the values in order of first appearance.
object_columns = df.select_dtypes(include = ['object']).columns
for col in object_columns:
    codes, unique_values = pd.factorize(df[col])
    df[col] = codes

    print(f"\n{col} numerics:")
    for i, value in enumerate(unique_values):
        print(f"{i} -> {value}")

# Aanwezigheidsstatus becomes a boolean: True only where the value is 0, False for
# every other value. This equals the previous ~replace(2, 1).astype(bool) form.
# NOTE(review): the original note read "1 = afwezig, 0 = aanwezig"; which status
# receives code 0 depends on the row order seen by factorize -- verify against the
# mapping printed above.
df['Aanwezigheidsstatus'] = df['Aanwezigheidsstatus'].eq(0)

# Disabled: change range from -1 (negatief) to 1 (positief)
# df['BerichtSentiment'] = df['BerichtSentiment'].replace(2, -1)

df

In [None]:
# Define predictors (dimensions): every column of df except the identifier columns.
# Filtering by membership instead of calling list.remove() means an identifier
# column that is not present in df is simply skipped rather than raising
# ValueError, so this cell does not need editing when another merge variant is used.
# BerichtID is excluded as well whenever it is present, which is what the
# commented-out remove('BerichtID') line used to be toggled for.
id_columns = {'BerichtID', 'GebruikerID', 'EvenementID', 'Evenement_OrganisatorID'}

# Column order of df is preserved
predictors = [name for name in df.columns if name not in id_columns]

In [None]:
# Standardise the predictor columns with StandardScaler
scaler = StandardScaler()
predictor_frame = df[predictors]

scaled_data = scaler.fit_transform(predictor_frame)

# Wrap the scaled array in a frame again; it gets a fresh default index
scaled_df = pd.DataFrame(data = scaled_data, columns = predictors)

In [None]:
from sklearn.model_selection import train_test_split

# Shuffle the scaled rows with a fixed seed and keep 90% as the training part.
# In the cells visible here only train_x is used afterwards.
x = scaled_df.loc[:, predictors]

train_x, test_x = train_test_split(
    x,
    train_size = 0.9,
    shuffle = True,
    random_state = 0,
)

In [None]:
# Silhouette analysis: fit a Ward agglomerative clustering for every candidate
# cluster count and record the silhouette score of the resulting labelling
n_range = range(2, 50)

sil_scores = [
    silhouette_score(
        train_x,
        AgglomerativeClustering(n_clusters = n, linkage = 'ward').fit_predict(train_x),
    )
    for n in n_range
]

# The candidate with the highest score wins
best_n_clusters = n_range[np.argmax(sil_scores)]
print(f'Optimal number of clusters (based on silhouette score): {best_n_clusters}')

# Score per candidate, with the winning candidate marked by a dashed red line
plt.plot(n_range, sil_scores, marker = 'o')
plt.axvline(x = best_n_clusters, color = 'red', linestyle = '--')
plt.xlabel('N')
plt.ylabel('Silhouette Score')
plt.show()

In [None]:
# Manually read best N clusters
# NOTE: this assignment overrides the best_n_clusters value computed from the
# silhouette scores in the previous cell; the dendrogram cut-off and the final
# model below both use this hard-coded value.
best_n_clusters = 23

In [None]:
# Dendrogram of the Ward hierarchy on the training rows, with a dashed cut-off
# line drawn at the selected merge distance
link = linkage(train_x, method = 'ward')

# Column 2 of the linkage matrix holds the merge distances; take the entry that
# lies (best_n_clusters - 1) rows from the end
best_merge_distance = link[-(best_n_clusters - 1), 2]

fig, ax = plt.subplots(figsize = (13, 5))

# The same distance serves as colour threshold and as the height of the red line
dendrogram(link, color_threshold = best_merge_distance, ax = ax)
ax.axhline(y = best_merge_distance, color = 'r', linestyle = '--')

ax.set_xlabel('Index')
ax.set_ylabel('Distance')
plt.show()

In [None]:
# Final fit: Ward agglomerative clustering on the training rows with the chosen
# number of clusters
model = AgglomerativeClustering(n_clusters = best_n_clusters, linkage = 'ward')
model.fit(train_x)

In [None]:
# Report how many training rows were assigned to each cluster label
unique_clusters, counts = np.unique(model.labels_, return_counts = True)

print("\nCluster Size Distribution:")
print(
    *(f"Cluster {cluster}: {count} entries" for cluster, count in zip(unique_clusters, counts)),
    sep = "\n",
)