In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from kneed import KneeLocator
from feature_selectors import *

# # Suppress the specific warning
# import warnings
# 
# warnings.filterwarnings("ignore", category = UserWarning)

In [None]:
# Load the four semicolon-delimited source tables.
# Forward slashes are used in the paths because they resolve on Windows as well
# as on Linux/macOS (backslash-separated paths only work on Windows).
# The location is relative to the working directory of the notebook kernel.
DATA_DIR = '../../../data'

gebruikers_df = pd.read_csv(f'{DATA_DIR}/gebruikersinformatie.csv', delimiter = ';')
evenement_df = pd.read_csv(f'{DATA_DIR}/evenementinformatie.csv', delimiter = ';')
aanwezigheid_df = pd.read_csv(f'{DATA_DIR}/evenementaanwezigheid.csv', delimiter = ';')
bericht_df = pd.read_csv(f'{DATA_DIR}/berichtinteracties_met_sentiment.csv', delimiter = ';')

In [None]:
# Build the working frame from the raw tables. Exactly one merge variant is
# active at a time; the alternatives are kept (commented out) for switching.
# The merge helpers come from the project module `feature_selectors`.
# df = merge_on_all(bericht_df, evenement_df, gebruikers_df, aanwezigheid_df)
# df = merge_on_bericht(bericht_df, evenement_df, gebruikers_df)
df = merge_on_aanwezigheid(
    aanwezigheid_df,
    evenement_df,
    gebruikers_df,
)

df

In [None]:
# Remove nonvalid entries: drop every row that has a missing value in any column.
# dropna() with its defaults does what the previous per-column isna() loop did.
df = df.dropna()

# Drop rows whose membership type is the literal 'x'
# (presumably a placeholder for an unknown type — TODO confirm).
# .copy() makes df an independent frame, so the column assignments in the
# following cells do not trigger pandas' SettingWithCopyWarning.
df = df[df['Gebruiker_Lidmaatschapstype'] != 'x'].copy()
df

In [None]:
# Convert ID to ints
# Each listed ID column is expected to hold strings; the first character of
# every value is dropped and the remainder is parsed as an integer
# (presumably the IDs carry a one-letter prefix — TODO confirm).
to_convert = ["BerichtID", "EvenementID", "GebruikerID", "Evenement_OrganisatorID"]

# Only the ID columns that exist in the current merge result are converted.
for id_col in (name for name in to_convert if name in df.columns):
    df[id_col] = df[id_col].str.slice(1).astype(int)

df

In [None]:
# Convert columns to days since earliest date
# Every date column is measured against its OWN earliest value, so 0.0 marks
# the first date seen in that particular column.
date_columns = ['BerichtDatum', 'Evenement_EvenementDatum', 'Gebruiker_RegistratieDatum', 'Gebruiker_LaatsteLogin']
seconds_per_day = 24 * 3600

for date_col in date_columns:
    # Skip date columns that the current merge result does not contain.
    if date_col not in df.columns:
        continue

    # Dates are parsed day-first (e.g. 31-12-2023).
    parsed = pd.to_datetime(df[date_col], dayfirst = True)
    elapsed = parsed - parsed.min()
    df[date_col] = elapsed.dt.total_seconds() / seconds_per_day

df

In [None]:
# Factorize studiejaar with map
# The study year is ordinal, so it gets an explicit rank instead of an arbitrary
# factorize code. Values that are not listed here become NaN.
studiejaar_rank = {
    "eerstejaars": 1,
    "tweedejaars": 2,
    "derdejaars": 3,
    "vierdejaars": 4
}
df["Gebruiker_Studiejaar"] = df["Gebruiker_Studiejaar"].map(studiejaar_rank)

# Object (string) columns
# Every remaining object column is replaced by integer codes; codes are handed
# out in order of first appearance and the mapping is printed for reference.
object_columns = df.select_dtypes(include = ['object']).columns
for col in object_columns:
    codes, unique_values = pd.factorize(df[col])
    df[col] = codes

    print(f"\n{col} numerics:")
    for code, label in enumerate(unique_values):
        print(f"{code} -> {label}")

# # Change range from -1 (negatief) to 1 (positief)
# df['BerichtSentiment'] = df['BerichtSentiment'].replace(2, -1)

# Aanwezigheidsstatus (1 = aanwezig, 0 = afwezig)
# Code 2 is folded into code 1, the result is cast to bool and then inverted:
# rows with code 0 end up True, rows with any other code end up False.
# NOTE(review): assuming this column was factorized by the loop above, which
# label owns code 0 depends on the order of appearance in the data — check the
# printed mapping to confirm that True really means "aanwezig".
status_codes = df['Aanwezigheidsstatus'].replace(2, 1)
df['Aanwezigheidsstatus'] = ~status_codes.astype('bool')

df

In [None]:
# Define predictors (dimensions)
# Every column of df is used as a clustering feature except the identifier
# columns. Filtering on membership (instead of calling list.remove per column)
# works for each merge variant: an ID column that is absent is simply ignored
# rather than raising ValueError, and BerichtID is excluded automatically
# whenever a merge that includes the message table is used.
id_columns = {"BerichtID", "EvenementID", "GebruikerID", "Evenement_OrganisatorID"}
predictors = [column for column in df.columns if column not in id_columns]

In [None]:
# Scale data
# Standardise the predictors with StandardScaler so that no single column
# dominates the distance computation purely because of its scale.
scaler = StandardScaler()

feature_values = df[predictors]
scaler.fit(feature_values)
scaled_data = scaler.transform(feature_values)

# Row order matches df, but the frame is built without df's index, so by
# default it gets a fresh 0..n-1 index.
scaled_df = pd.DataFrame(scaled_data, columns = predictors)

In [None]:
# Choose the number of clusters with the elbow method, repeated over several
# random initialisations; the cluster count that is picked most often wins.
states = 100
# Candidate cluster counts are 1 .. max_n - 1 (the upper bound of range() is exclusive).
# The original author noted that the detected best N seems to depend on max_n;
# presumably because the knee is located relative to the evaluated range — TODO confirm.
max_n = 10
best_n_frequency = {}
candidate_ns = range(1, max_n)

# For each randomstate
for state in range(states):
    # Save score (inertia) on each N
    inertias = [KMeans(n_clusters = n, random_state = state).fit(scaled_df).inertia_ for n in candidate_ns]

    # Determine N with best score: the knee of the decreasing, convex inertia curve
    knee_locator = KneeLocator(candidate_ns, inertias, curve = "convex", direction = "decreasing")
    best_n = knee_locator.knee

    # KneeLocator reports None when it cannot find a knee. Such a run must not
    # vote: a None key would break the bar plot and could end up as n_clusters.
    if best_n is None:
        continue

    # Add a frequency point to that N (stored as a plain int)
    best_n = int(best_n)
    best_n_frequency[best_n] = best_n_frequency.get(best_n, 0) + 1

if not best_n_frequency:
    raise ValueError(f"No knee was found for any of the {states} random states; cannot choose a cluster count")

# Determine most frequent best N (on a tie, the N that was recorded first wins)
most_frequent_best_n = max(best_n_frequency, key = best_n_frequency.get)

# Plot results
print(f'Most frequent best K: {most_frequent_best_n}')
plt.bar(best_n_frequency.keys(), best_n_frequency.values())
plt.xlabel('N')
plt.ylabel('Frequency')
plt.show()

# Train model one last time with most frequent best N
kmeans = KMeans(n_clusters = most_frequent_best_n, random_state = 0).fit(scaled_df)

In [None]:
# Showing distribution of cluster sizes
# Count how many rows the fitted model assigned to each cluster label.
unique_clusters, counts = np.unique(kmeans.labels_, return_counts = True)
cluster_sizes = dict(zip(unique_clusters, counts))

print("\nCluster Size Distribution:")
for label, size in cluster_sizes.items():
    print(f"Cluster {label}: {size} entries")