In [None]:
import pandas as pd
import numpy as np
import warnings
from sklearn.neighbors import NearestNeighbors

# Silence every FutureWarning for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation notices raised by any
# library used in this notebook are hidden as well -- confirm that is intended.
warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [None]:
# Location of the semicolon-separated CSV exports, relative to this notebook.
# Forward slashes are used on purpose: they resolve on Windows, macOS and Linux,
# whereas backslash-separated paths only work on Windows.
DATA_DIR = '../../data'

gebruikers_df = pd.read_csv(f'{DATA_DIR}/gebruikersinformatie.csv', delimiter = ';')
evenement_df = pd.read_csv(f'{DATA_DIR}/evenementinformatie.csv', delimiter = ';')
aanwezigheid_df = pd.read_csv(f'{DATA_DIR}/evenementaanwezigheid.csv', delimiter = ';')

# Collected so the type-conversion step can treat all three frames uniformly
all_df = [gebruikers_df, evenement_df, aanwezigheid_df]

In [None]:
# Convert all ID and datetime columns, in place, on every loaded frame.
datetime_cols = ['RegistratieDatum', 'LaatsteLogin', 'EvenementDatum', 'BerichtDatum']
id_cols = ['GebruikerID', 'EvenementID', 'OrganisatorID', 'BerichtID']

for df in all_df:
    for col in df.columns:
        if col in datetime_cols:
            # Dates are parsed day-first; re-parsing an already converted column is harmless
            df[col] = pd.to_datetime(df[col], dayfirst = True)

        elif col in id_cols:
            # Skip columns that are already integers so the cell can be re-run:
            # the .str accessor raises on a non-string column.
            if pd.api.types.is_integer_dtype(df[col]):
                continue

            # Drop the first character of each ID string and keep the rest as an integer
            df[col] = df[col].str[1:].astype(int)

In [None]:
# Remove invalid users.
# Step 1: discard every record that has a missing value in any column.
complete_users = gebruikers_df.dropna(how = 'any')

# Step 2: discard records whose membership type is the placeholder value 'x'.
has_valid_membership = complete_users['Lidmaatschapstype'] != 'x'
gebruikers_df = complete_users[has_valid_membership]

In [None]:
#------------------------------------------------------------------------------------------------------------------------------------------------------
# Option A (active): only rows with status 'Bijgewoond'
filtered_events = aanwezigheid_df[aanwezigheid_df['Aanwezigheidsstatus'] == 'Bijgewoond'].copy()

# # Option B: everything except 'Afwezig' (i.e. 'Bijgewoond' and 'Geregistreerd')
# filtered_events = aanwezigheid_df.copy()
# filtered_events = filtered_events[~(filtered_events['Aanwezigheidsstatus'] == 'Afwezig')]

#------------------------------------------------------------------------------------------------------------------------------------------------------

# Name of the per-user list column that the later cells read
events_col = 'Geregistreerde evenementen'

# Group event attendance data by GebruikerID: one row per user holding a list of EvenementIDs
events_per_user = (
    filtered_events.groupby('GebruikerID')['EvenementID']
    .apply(list)
    .reset_index()
    .rename(columns = {'EvenementID': events_col})
)

# Merge gebruikers_df with the selected events.
# Any list column left over from an earlier run of this cell is dropped first; otherwise the
# merge would create suffixed '_x'/'_y' columns and the filter below would raise a KeyError.
gebruikers_df = (
    gebruikers_df
    .drop(columns = events_col, errors = 'ignore')
    .merge(events_per_user, on = 'GebruikerID', how = 'left')
)

# Fix typos: unify the capitalisation of this study programme
gebruikers_df['Studierichting'] = gebruikers_df['Studierichting'].replace('Data Engineering', 'Data engineering')

# Delete users that have no events in the selection above
gebruikers_df = gebruikers_df[~(gebruikers_df[events_col].isna())]

gebruikers_df

In [None]:
# Demographic columns used to describe a user
predictors = ['Leeftijd', 'Studierichting', 'Studiejaar', 'Lidmaatschapstype']

# Registration month mapped onto a circle (period 12) so that December and January end up close
registration_angle = 2 * np.pi * gebruikers_df['RegistratieDatum'].dt.month / 12

# Feature matrix: dummy-encoded predictors followed by the two cyclic month features.
# NOTE(review): Leeftijd is passed through unscaled next to 0/1 dummies -- confirm this
# weighting is what the distance metric should see.
training_df = pd.get_dummies(gebruikers_df[predictors]).assign(
    RegistratieDatum_month_sin = np.sin(registration_angle),
    RegistratieDatum_month_cos = np.cos(registration_angle),
)

# Fit the nearest-neighbour index used to look up the 10 most similar users
knn = NearestNeighbors(n_neighbors = 10).fit(training_df)

training_df

In [None]:
# Raw attributes of the user we want recommendations for
new_user_raw = pd.DataFrame({
    'RegistratieDatum': [pd.to_datetime('2023-11-05')],
    'Leeftijd': [25],
    'Studierichting': ['Software engineering'],
    'Studiejaar': [3],
    'Lidmaatschapstype': ['Premium']
})
new_user = new_user_raw.iloc[0]

# Numeric study year -> label used in the dummy column names.
# An explicit mapping (instead of list indexing with `year - 1`) prevents year 0 or a
# negative year from silently selecting a label through negative indexing.
studiejaar_labels = {1: 'Eerstejaars', 2: 'Tweedejaars', 3: 'Derdejaars', 4: 'Vierdejaars'}
studiejaar = int(new_user['Studiejaar'])
if studiejaar not in studiejaar_labels:
    raise ValueError(f"Studiejaar must be one of {sorted(studiejaar_labels)}, got {studiejaar}")

# Dummy columns that must be switched on for this user
active_dummies = [
    f"Studierichting_{new_user['Studierichting']}",
    f"Studiejaar_{studiejaar_labels[studiejaar]}",
    f"Lidmaatschapstype_{new_user['Lidmaatschapstype']}"
]

# Fail early and clearly when a category has no matching training column
unknown_dummies = [col for col in active_dummies if col not in training_df.columns]
if unknown_dummies:
    raise ValueError(f"No matching column in training_df for: {unknown_dummies}")

# Extract month
month = new_user['RegistratieDatum'].month

# Build one all-zero float row with exactly the training columns (same order), then fill it in.
# This avoids growing an empty frame and calling fillna on object-dtype columns.
encoded_user = pd.Series(0.0, index = training_df.columns)
encoded_user['Leeftijd'] = new_user['Leeftijd']
encoded_user[active_dummies] = 1.0
encoded_user['RegistratieDatum_month_sin'] = np.sin(2 * np.pi * month / 12)
encoded_user['RegistratieDatum_month_cos'] = np.cos(2 * np.pi * month / 12)

# Single-row frame (index 0) that can be passed to knn.kneighbors
new_user_df = encoded_user.to_frame().T

new_user_df

In [None]:
# Query the fitted model with the encoded user; one row of results per query row
distances, indices = knn.kneighbors(new_user_df)

# There is a single query row, so its neighbours are in the first row of `indices`.
# They are positional, hence the positional lookup in gebruikers_df.
neighbour_positions = indices[0]
similar_users = gebruikers_df.iloc[neighbour_positions]
similar_users

In [None]:
# Flatten the per-user event lists into one row per (similar user, event).
# explode() replaces list concatenation via Series.sum(), which re-copies the growing list
# for every user and returns the scalar 0 when there is nothing to sum.
recommended_event_id_series = (
    similar_users['Geregistreerde evenementen']
    .explode()
    .dropna()
    .astype('int64')
    .reset_index(drop = True)
    .rename('EvenementID')
)

# Get recommended event ids (duplicates kept on purpose: they drive the counts below)
# and write them into a dataframe
recommended_event_ids = recommended_event_id_series.tolist()
recommended_event_ids_df = recommended_event_id_series.to_frame()

# Get full information on these events by merging with evenementinformatie
recommended_events = recommended_event_ids_df.merge(evenement_df, on = 'EvenementID', how = 'left')

# Count how often each type appears
recommended_events_type_counts = recommended_events['EvenementType'].value_counts()

recommended_events_type_counts