In [None]:
import pandas as pd
import numpy as np
import warnings
from sklearn.neighbors import NearestNeighbors

warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [None]:
# Load the three semicolon-separated source tables.
# Forward slashes keep these relative paths valid on Windows, macOS and Linux alike;
# backslash-separated paths only resolve on Windows.
gebruikers_df = pd.read_csv('../../data/gebruikersinformatie.csv', delimiter = ';')
evenement_df = pd.read_csv('../../data/evenementinformatie.csv', delimiter = ';')
aanwezigheid_df = pd.read_csv('../../data/evenementaanwezigheid.csv', delimiter = ';')

# Collected in one list so shared column conversions can loop over every table
all_df = [gebruikers_df, evenement_df, aanwezigheid_df]

In [None]:
# Convert all ID and datetime columns
datetime_cols = ['RegistratieDatum', 'LaatsteLogin', 'EvenementDatum', 'BerichtDatum']
id_cols = ['GebruikerID', 'EvenementID', 'OrganisatorID', 'BerichtID']

for df in all_df:
    for col in df.columns:
        if col in datetime_cols:
            # Dates are parsed day-first; an already-parsed column passes through unchanged
            df[col] = pd.to_datetime(df[col], dayfirst = True)

        elif col in id_cols and not pd.api.types.is_integer_dtype(df[col]):
            # Drop the first character of the ID and parse the remainder as an integer.
            # The dtype guard makes the cell safe to re-run: an integer column has no
            # `.str` accessor, so converting it a second time would raise.
            df[col] = df[col].str[1:].astype(int)

In [None]:
# Remove invalid gebruikers: first every row that has a missing value in any column ...
gebruikers_df = gebruikers_df.dropna()

# ... then every row whose membership type is the placeholder 'x'
gebruikers_df = gebruikers_df[gebruikers_df['Lidmaatschapstype'] != 'x']

In [None]:
#------------------------------------------------------------------------------------------------------------------------------------------------------
# Choose which attendance records count as interest in an event.
#
# Alternative (stricter) filter, only 'Bijgewoond':
# filtered_events = aanwezigheid_df[aanwezigheid_df['Aanwezigheidsstatus'] == 'Bijgewoond'].copy()
#
# Active filter: everything except 'Afwezig' (i.e. 'Bijgewoond' and 'Geregistreerd')
filtered_events = aanwezigheid_df[aanwezigheid_df['Aanwezigheidsstatus'] != 'Afwezig'].copy()
#------------------------------------------------------------------------------------------------------------------------------------------------------

# Group event attendance data by GebruikerID: one row per user holding the list of unique event IDs
events_per_user = (
    filtered_events.groupby('GebruikerID')['EvenementID']
    .apply(lambda x: list(x.unique()))
    .reset_index()
    .rename(columns = {'EvenementID': 'Geregistreerde evenementen'})
)

# Merge gebruikers_df with the attended events.
# A column left over from an earlier run of this cell is dropped first, otherwise the
# merge would produce '_x'/'_y' suffixed duplicates and the filter below would fail.
gebruikers_df = (
    gebruikers_df
    .drop(columns = 'Geregistreerde evenementen', errors = 'ignore')
    .merge(events_per_user, on = 'GebruikerID', how = 'left')
)

# Fix typos
gebruikers_df['Studierichting'] = gebruikers_df['Studierichting'].replace('Data Engineering', 'Data engineering')

# Delete entries without any registered events.
# The index is reset so that index labels equal row positions: the KNN model reports
# neighbours as row positions, and they are later looked up in this frame.
gebruikers_df = (
    gebruikers_df[gebruikers_df['Geregistreerde evenementen'].notna()]
    .reset_index(drop = True)
)

gebruikers_df

In [None]:
# Demographic columns that describe a user to the similarity model
features = ['Leeftijd', 'Studierichting', 'Studiejaar', 'Lidmaatschapstype']

# One-hot encode the categorical features to obtain the training matrix
training_df = pd.get_dummies(gebruikers_df.loc[:, features])

# Fit a nearest-neighbour model (10 neighbours per query) for finding similar users
knn = NearestNeighbors(n_neighbors = 10).fit(training_df)

training_df

In [None]:
# Empty template df: carries exactly the feature columns the KNN model was fitted on
template_df = training_df.iloc[:0].copy()

# Example of the expected raw input:
# input_df = pd.DataFrame({
#     'Leeftijd': [25],
#     'Studierichting': ['Software engineering'],
#     'Studiejaar': [3],
#     'Lidmaatschapstype': ['Premium']
# })

# Read the new users
input_df = pd.read_csv('../../data/Nieuwe gebruikers.csv')

# Studiejaar arrives as a number while the training columns use these labels.
# An explicit mapping (instead of list indexing) prevents 0 from silently wrapping
# around to the last label through negative indexing.
studiejaar_labels = {1: 'Eerstejaars', 2: 'Tweedejaars', 3: 'Derdejaars', 4: 'Vierdejaars'}
unknown_years = ~input_df['Studiejaar'].isin(list(studiejaar_labels))
if unknown_years.any():
    raise ValueError(
        f"Unsupported Studiejaar values: {input_df.loc[unknown_years, 'Studiejaar'].unique().tolist()}"
    )

# Bring the raw input into the same shape as the training features
decoded_input_df = pd.DataFrame({
    'Leeftijd': input_df['Leeftijd'],
    # Same spelling normalisation that is applied to the training data
    'Studierichting': input_df['Studierichting'].replace('Data Engineering', 'Data engineering'),
    'Studiejaar': input_df['Studiejaar'].map(studiejaar_labels),
    'Lidmaatschapstype': input_df['Lidmaatschapstype'],
})

# One-hot encode every row at once and align with the training columns:
# categories missing from the input become all-False columns, and categories the
# model has never seen are dropped so the feature set always matches the fitted model.
converted_input_df = pd.get_dummies(decoded_input_df).reindex(
    columns = template_df.columns, fill_value = False
)

input_df = converted_input_df.copy()

input_df

In [None]:
# Find similar users.
# Only the model's feature columns are passed in, so this cell can be re-run after
# 'Nearest neighbours' (or any later column) has been added to input_df.
distances, indices = knn.kneighbors(input_df[training_df.columns])

# Each entry is a list of row positions into the frame the model was fitted on
input_df['Nearest neighbours'] = indices.tolist()

input_df

In [None]:
def get_event_type_ranked(neighbours, users = None, events = None):
    """Rank event types by how often the given neighbours registered for them.

    Parameters
    ----------
    neighbours : iterable of int
        Row positions of the neighbouring users, as produced by ``knn.kneighbors``.
        Positions outside the user frame are ignored.
    users : pandas.DataFrame, optional
        Frame with a 'Geregistreerde evenementen' column holding a list of event IDs
        per row. Defaults to the notebook-level ``gebruikers_df``.
    events : pandas.DataFrame, optional
        Frame with 'EvenementID' and 'EvenementType' columns. Defaults to the
        notebook-level ``evenement_df``.

    Returns
    -------
    list
        Event types sorted from most to least frequent; ties keep the order in which
        the types were first encountered.
    """
    # Fall back to the notebook-level frames when none are supplied
    if users is None:
        users = gebruikers_df
    if events is None:
        events = evenement_df

    # Build the ID -> type lookup once instead of scanning the event frame per event.
    # For a duplicated EvenementID the first row wins.
    type_by_event_id = (
        events.drop_duplicates(subset = 'EvenementID')
        .set_index('EvenementID')['EvenementType']
        .to_dict()
    )

    event_type_counter = {}

    for neighbour in neighbours:

        # Make sure neighbour exists
        if not 0 <= neighbour < len(users):
            continue

        # The KNN model reports row positions, not index labels, so the lookup must be
        # positional; a label lookup picks the wrong user once the index has gaps.
        registered_events = users['Geregistreerde evenementen'].iloc[neighbour]

        # Tally the type of every event this neighbour registered for
        for event_id in registered_events:
            if event_id in type_by_event_id:
                event_type = type_by_event_id[event_id]
                event_type_counter[event_type] = event_type_counter.get(event_type, 0) + 1

    # Return the event types sorted from most to least frequent
    return sorted(event_type_counter, key = event_type_counter.get, reverse = True)

In [None]:
# Store the ranked event types in a fixed set of columns.
# Rankings are truncated or padded with NaN to exactly this width, so the assignment
# also works when a user's neighbours cover fewer (or more) event types than columns.
event_cols = ['Evenement 1', 'Evenement 2', 'Evenement 3', 'Evenement 4']

ranked_event_types = input_df['Nearest neighbours'].apply(get_event_type_ranked)

input_df[event_cols] = pd.DataFrame(
    [(ranking + [np.nan] * len(event_cols))[:len(event_cols)] for ranking in ranked_event_types],
    index = input_df.index,
    columns = event_cols,
)

input_df

In [None]:
# Write data to csv: re-read the raw new-user file and attach the ranked event types.
# Forward slashes keep the relative path valid on every OS.
output_path = '../../data/Nieuwe gebruikers.csv'
output_df = pd.read_csv(output_path)

# Rows are matched on index label between output_df and input_df
for event_col in ['Evenement 1', 'Evenement 2', 'Evenement 3', 'Evenement 4']:
    output_df[event_col] = input_df[event_col]

# NOTE: this overwrites the file the new users were read from
output_df.to_csv(output_path, index = False)
output_df