In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from openpyxl.styles.builtins import total
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from kneed import KneeLocator
from feature_selectors import *

# Suppress the specific warning
import warnings

warnings.filterwarnings("ignore", category = UserWarning)

In [None]:
# Location of the raw CSV exports, relative to the notebook's working directory.
# Forward slashes are used on purpose: they resolve on Windows as well as on
# macOS/Linux, whereas backslash-separated paths only work on Windows.
DATA_DIR = '../../../data'

# All four exports are semicolon-delimited
gebruikers_df = pd.read_csv(f'{DATA_DIR}/gebruikersinformatie.csv', delimiter = ';')
evenement_df = pd.read_csv(f'{DATA_DIR}/evenementinformatie.csv', delimiter = ';')
aanwezigheid_df = pd.read_csv(f'{DATA_DIR}/evenementaanwezigheid.csv', delimiter = ';')
bericht_df = pd.read_csv(f'{DATA_DIR}/berichtinteracties_met_sentiment.csv', delimiter = ';')

In [None]:
# Combine the attendance, event and user frames into the single working frame `df`.
# merge_on_aanwezigheid is not defined in this notebook; presumably it is provided by
# the star-import of feature_selectors and prefixes the joined columns with
# 'Evenement_' / 'Gebruiker_' (the later cells rely on such names) — TODO confirm
df = merge_on_aanwezigheid(aanwezigheid_df, evenement_df, gebruikers_df)

df

In [None]:
# Remove invalid entries: rows with a missing value in any column, and rows
# whose membership type is 'x' (treated as invalid here)
df = df.dropna()
df = df[df['Gebruiker_Lidmaatschapstype'] != 'x']

# Renumber the surviving rows 0..n-1. Further down, the scaled feature frame is
# rebuilt without an index (so it is labelled 0..n-1), joined back to the event
# types by index label, and rows of df are looked up by position with iloc.
# Those three steps only address the same rows when df's labels equal its row
# positions, which is no longer true once rows have been filtered out.
df = df.reset_index(drop = True)
df

In [None]:
# Drop all identifier columns so they do not end up among the predictor
# columns that are scaled and clustered later on
to_remove = ["EvenementID", "GebruikerID", "Evenement_OrganisatorID"]
df = df.drop(columns = to_remove)

df

In [None]:
date_columns = ['Evenement_EvenementDatum', 'Gebruiker_RegistratieDatum', 'Gebruiker_LaatsteLogin']

# Month numbers covered by each season flag (winter wraps around the year end)
season_months = {
    'spring': [3, 4, 5],
    'summer': [6, 7, 8],
    'autumn': [9, 10, 11],
    'winter': [12, 1, 2],
}

# Replace every date column by boolean calendar features.
# Other encodings that were tried (days since the earliest date, one dummy
# column per month name) are currently disabled and therefore not produced here.
for col in date_columns:
    # Dates are written day-first in the source files
    parsed = pd.to_datetime(df[col], dayfirst = True)
    months = parsed.dt.month

    # Saturday (5) and Sunday (6) count as weekend
    df[f'{col}_is_weekend'] = parsed.dt.weekday >= 5

    # One flag per season, in the order spring, summer, autumn, winter
    for season, month_numbers in season_months.items():
        df[f'{col}_is_{season}'] = months.isin(month_numbers)

    # The raw date itself is not kept as a feature
    df = df.drop(columns = col)

df

In [None]:
# One-hot encode the study programme; the dummy columns are appended and the
# original string column is removed
studierichting_dummies = pd.get_dummies(df['Gebruiker_Studierichting'], prefix = 'Gebruiker_Studierichting')
df = pd.concat([df.drop(columns = 'Gebruiker_Studierichting'), studierichting_dummies], axis = 1)

# Ordinal encoding of the study year; any label missing from this mapping becomes NaN
studiejaar_codes = {
    "eerstejaars": 1,
    "tweedejaars": 2,
    "derdejaars": 3,
    "vierdejaars": 4
}
df["Gebruiker_Studiejaar"] = df["Gebruiker_Studiejaar"].map(studiejaar_codes)

# Integer-encode the remaining string columns and print the code -> label legend
for col in ['Aanwezigheidsstatus', 'Gebruiker_Lidmaatschapstype']:
    codes, unique_values = pd.factorize(df[col])
    df[col] = codes

    print(f"\n{col} numerics:")
    for i, value in enumerate(unique_values):
        print(f"{i} -> {value}")


# Collapse the attendance status to a boolean that is True only for code 0
# (intended meaning: True = present, False = absent).
# NOTE(review): factorize hands out codes in order of first appearance, so this
# assumes the first status value in the data is the "present" label — confirm
# against the legend printed by the loop above.
df['Aanwezigheidsstatus'] = df['Aanwezigheidsstatus'].eq(0)

df

In [None]:
# # Remove afwezige entries
# df = df[~(df['Aanwezigheidsstatus'] == False)]
# df

In [None]:
# Split the frame into the clustering dimensions (every column except the
# event type) and a one-column frame holding the event type itself
target_column = 'Evenement_EvenementType'
predictor_df = df.drop(columns = target_column)
outcome_df = df[target_column].to_frame()

In [None]:
# Standardise every predictor column so that no single feature dominates the
# distance computations of the clustering step
scaler = StandardScaler()
scaler.fit(predictor_df)
scaled_data = scaler.transform(predictor_df)

# Re-attach the column labels to the scaled values. No index is passed, so
# scaled_df is labelled 0..n-1, i.e. by row position within predictor_df
scaled_df = pd.DataFrame(data = scaled_data, columns = predictor_df.columns)

scaled_df

In [None]:
# Fit on 85% of the rows only; the remaining 15% (test_x) is held back to make
# predictions on and evaluate them afterwards
from sklearn.model_selection import train_test_split

x = scaled_df

# Fixed seed so the shuffled split is the same on every run
split_options = dict(train_size = 0.85, shuffle = True, random_state = 0)
train_x, test_x = train_test_split(x, **split_options)

train_x

In [None]:
states = 100  # number of random seeds the elbow search is repeated for
max_n = 10    # largest number of clusters that is evaluated (inclusive)
best_n_frequency = {}

# Candidate cluster counts 1..max_n. The knee is located on the inertia curve
# over exactly this range, so the reported best N can shift when max_n changes.
candidate_ns = range(1, max_n + 1)

# Repeat the search for every random state
for state in range(states):
    # Inertia (within-cluster sum of squares) for each candidate N
    inertias = [KMeans(n_clusters = n, random_state = state).fit(train_x).inertia_ for n in candidate_ns]

    # Knee of the convex, decreasing inertia curve = best N for this seed
    knee_locator = KneeLocator(candidate_ns, inertias, curve = "convex", direction = "decreasing")
    best_n = knee_locator.knee

    # The locator reports None when it finds no knee; such runs are not counted
    if best_n is not None:
        best_n_frequency[best_n] = best_n_frequency.get(best_n, 0) + 1

# Without a single detected knee there is nothing to choose from; fail with a
# clear message instead of the generic error max() raises on an empty dict
if not best_n_frequency:
    raise ValueError(f"No knee found for any of the {states} random states (N = 1..{max_n})")

# Determine the most frequent best N
most_frequent_best_n = max(best_n_frequency, key = best_n_frequency.get)

# Plot how often each N was selected
print(f'Most frequent best K: {most_frequent_best_n}')
plt.bar(best_n_frequency.keys(), best_n_frequency.values())
plt.xlabel('N')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Final model: fit once more on the training rows, using the cluster count that
# was selected most often and a fixed seed
final_model = KMeans(n_clusters = most_frequent_best_n, random_state = 0)
kmeans = final_model.fit(train_x)

In [None]:
# Count how many training rows were assigned to each cluster
unique_clusters, counts = np.unique(kmeans.labels_, return_counts = True)
cluster_sizes = dict(zip(unique_clusters, counts))

print("\nCluster Size Distribution:")
for cluster, count in cluster_sizes.items():
    print(f"Cluster {cluster}: {count} entries")

In [None]:
# Attach the assigned cluster and the true event type to every training row.
# The frame is rebuilt rather than mutated in place, and columns left over from
# a previous run of this cell are dropped first, so re-running the cell does not
# produce suffixed duplicates of the event-type column.
label_column = 'Evenement_EvenementType'
train_x = (
    train_x
    .drop(columns = ['Cluster', label_column], errors = 'ignore')
    .assign(Cluster = kmeans.labels_)
    # Event types are matched on index label; rows without a match are dropped
    .merge(outcome_df[[label_column]], left_index = True, right_index = True, how = 'inner')
)
# From here on train_x holds two non-feature columns, so it must not be fed to
# KMeans.fit again without re-creating the train/test split first
train_x

In [None]:
# Event types that are normalised and ranked, in the column order used for ranking
event_types = ['bbq', 'lezing', 'hackathon', 'workshop']

# Total number of occurrences of each event type among the training rows
in_training = outcome_df.index.isin(train_x.index)
filtered_outcome_df = outcome_df[in_training]
total_event_counts = filtered_outcome_df['Evenement_EvenementType'].value_counts()

# Per-cluster occurrence counts: one row per cluster, one column per event type
cluster_event_summary = (
    train_x.groupby('Cluster')['Evenement_EvenementType']
    .value_counts()
    .unstack(fill_value = 0)
    .reset_index()
)

# Turn the counts into fractions: the share of each event type's training rows
# that landed in the cluster
cluster_event_summary[event_types] = cluster_event_summary[event_types].div(
    total_event_counts[event_types], axis = 'columns'
)


def _rank_event_types(shares):
    """Return the event type names of one cluster row, largest share first."""
    return shares.sort_values(ascending = False).index.tolist()


# Rank the event types within each cluster
cluster_event_summary['Event_Rankings'] = cluster_event_summary[event_types].apply(_rank_event_types, axis = 1)

print(total_event_counts)
cluster_event_summary

In [None]:
# Assign every held-out row to its nearest cluster in one batched call instead
# of calling predict once per row
predicted_clusters = kmeans.predict(test_x)

# Event ranking (most to least characteristic) of each cluster
rankings_by_cluster = dict(zip(cluster_event_summary['Cluster'], cluster_event_summary['Event_Rankings']))
predicted_rankings = [rankings_by_cluster[cluster] for cluster in predicted_clusters]

# Actual event type of every test row. test_x carries the 0..n-1 labels of
# scaled_df, which are row positions in df, hence the positional iloc lookup
true_types = df['Evenement_EvenementType'].iloc[test_x.index.to_numpy()].to_numpy()

results_df = pd.DataFrame(
    {
        'Cluster': predicted_clusters,
        'Predicted': predicted_rankings,
        'True': true_types,
        # A prediction counts as correct when the top-ranked event type of the
        # predicted cluster equals the actual event type
        'Correct': [ranking[0] == actual for ranking, actual in zip(predicted_rankings, true_types)],
    },
    index = pd.Index(test_x.index, name = 'Index'),
)

correct_count = results_df['Correct'].sum()
print(
    f"Correct predictions: {correct_count}\t({(correct_count / results_df.shape[0]) * 100}%)")

results_df