In [None]:
import pandas as pd
import numpy as np
import warnings

from sklearn.neighbors import NearestNeighbors
from scipy.stats import percentileofscore

# Ignore every FutureWarning raised from here on (the filter is not limited to one library).
warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [None]:
# Load the three semicolon-delimited source tables.
# The relative paths use forward slashes: those resolve on Windows as well as on
# Linux/macOS, whereas backslash-separated paths are only understood on Windows.
evenement_df = pd.read_csv('../../data/evenementinformatie.csv', delimiter = ';')
bericht_df = pd.read_csv('../../data/berichtinteracties_met_sentiment.csv', delimiter = ';')
aanwezigheid_df = pd.read_csv('../../data/evenementaanwezigheid.csv', delimiter = ';')

# Keep the frames together so clean-up steps that apply to all of them can loop over one list.
all_df = [evenement_df, bericht_df, aanwezigheid_df]

In [None]:
# Convert all ID and datetime columns
datetime_cols = ['RegistratieDatum', 'LaatsteLogin', 'EvenementDatum', 'BerichtDatum']
id_cols = ['GebruikerID', 'EvenementID', 'OrganisatorID', 'BerichtID']

for df in all_df:
    for col in df.columns:
        if col in datetime_cols:
            # Parsed day-first; running this again on an already converted column is harmless.
            df[col] = pd.to_datetime(df[col], dayfirst = True)

        elif col in id_cols and not pd.api.types.is_integer_dtype(df[col]):
            # Drop the first character of every ID and cast the remainder to int.
            # Columns that are already integers are skipped so that re-running the cell
            # does not fail on `.str`, which is only available for string values.
            df[col] = df[col].str[1:].astype(int)

In [None]:
# Numeric score for every sentiment label
sentiment_map = dict(Negatief = -1, Neutraal = 0, Positief = 1)

# Score each message, then weight that score by its OntvangenLikes value
sentiment_score = bericht_df['BerichtSentiment'].map(sentiment_map)
bericht_df['SentimentScore'] = sentiment_score
bericht_df['WeightedSentiment'] = sentiment_score * bericht_df['OntvangenLikes']

# Total weighted sentiment per event, with EvenementID kept as a regular column
sentiment_per_event_df = (
    bericht_df
    .groupby('EvenementID', as_index = False)['WeightedSentiment']
    .sum()
)

sentiment_per_event_df

In [None]:
# One boolean flag column per attendance status of interest
status = aanwezigheid_df['Aanwezigheidsstatus']
aanwezigheid_df['Geregistreerd'] = status.eq('Geregistreerd')
aanwezigheid_df['Bijgewoond'] = status.eq('Bijgewoond')

# Summing the boolean flags counts the matching rows per event
flag_totals = aanwezigheid_df.groupby('EvenementID').agg(
    Bijgewoond_freq = ('Bijgewoond', 'sum'),
    Geregistreerd_freq = ('Geregistreerd', 'sum'),
)
aanwezigheid_count_df = flag_totals.reset_index()

aanwezigheid_count_df

In [None]:
# Merge with events
# Columns this cell contributes to evenement_df. They are dropped first when already
# present so the cell can be re-run: merging a second time would otherwise create
# `_x`/`_y` suffixed duplicates and the plain column names would disappear.
merged_cols = ['WeightedSentiment', 'Bijgewoond_freq', 'Geregistreerd_freq']

evenement_df = (
    evenement_df
    .drop(columns = merged_cols, errors = 'ignore')
    .merge(sentiment_per_event_df, on = 'EvenementID', how = 'left')
    .merge(aanwezigheid_count_df, on = 'EvenementID', how = 'left')
    # The left joins leave NaN for events without any message or attendance row;
    # those events have a sum/count of 0, so store 0 instead of a missing value.
    .fillna({col: 0 for col in merged_cols})
)

evenement_df

In [None]:
# Event attributes that are one-hot encoded for the similarity search
predictors = ['EvenementType']

# Month of every event expressed as an angle on a 12-month circle
month_angle = 2 * np.pi * evenement_df['EvenementDatum'].dt.month / 12

# Training matrix: dummy columns for the predictors plus the sine/cosine of the month angle
training_df = pd.get_dummies(evenement_df[predictors]).assign(
    EvenementDatum_month_sin = np.sin(month_angle),
    EvenementDatum_month_cos = np.cos(month_angle),
)

# Fit the nearest-neighbour model used to look up similar events
knn = NearestNeighbors(n_neighbors = 5)
knn.fit(training_df)

training_df

In [None]:
# Empty template df: carries the column names and dtypes of the training matrix
template_df = training_df.iloc[:0].copy()

# New event data
new_event_raw_df = pd.DataFrame({
    'EvenementType': ['BBQ'],
    'EvenementDatum': [pd.to_datetime('2023-11-05')],
})

# Extract month
month = new_event_raw_df['EvenementDatum'].dt.month.iloc[0]

# Dummy column that represents the new event's type. It has to exist in the training
# matrix: assigning to an unknown column would add an extra feature and the model
# would then be queried with a different number of columns than it was fitted on.
type_col = f"EvenementType_{new_event_raw_df.loc[0, 'EvenementType']}"
if type_col not in template_df.columns:
    raise ValueError(
        f"Unknown EvenementType {new_event_raw_df.loc[0, 'EvenementType']!r}: "
        f"the training data has no '{type_col}' column")

# Start with every feature switched off, then fill in the type flag and the month encoding
feature_row = {col: False for col in template_df.columns}
feature_row[type_col] = True
feature_row['EvenementDatum_month_sin'] = np.sin(2 * np.pi * month / 12)
feature_row['EvenementDatum_month_cos'] = np.cos(2 * np.pi * month / 12)

# Single-row frame with the same column order and dtypes as the training matrix.
# Building it in one go avoids the object-dtype columns (and the fillna step) that
# result from enlarging an empty frame through `.loc`.
new_event_df = (
    pd.DataFrame([feature_row], columns = template_df.columns)
    .astype(template_df.dtypes.to_dict())
)

new_event_df

In [None]:
# Query the fitted model for the events closest to the new event
distances, indices = knn.kneighbors(new_event_df)

# Only one event was queried, so use the first row of neighbour positions
# and select the matching rows of evenement_df by position
neighbor_positions = indices[0]
similar_events = evenement_df.iloc[neighbor_positions]
similar_events

In [None]:
# --- Expected sentiment ---
# Only events with a known sentiment value form the reference sample: a NaN in the
# sample can turn the percentile into NaN, and `int()` below raises on NaN.
known_sentiment = evenement_df['WeightedSentiment'].dropna()
average_weighted_sentiment = similar_events['WeightedSentiment'].mean()
total_events = known_sentiment.shape[0]

if total_events == 0 or pd.isna(average_weighted_sentiment):
    # Nothing to compare against, or none of the similar events has a sentiment value
    percentile = rank = None
    print('Expected weighted sentiment for this event: unknown (no sentiment data available)')
else:
    # Invert the percentile rank so that a high sentiment reads as "top x%"
    percentile = 100 - percentileofscore(known_sentiment, average_weighted_sentiment)
    rank = int((percentile / 100 * total_events) + 1)

    print(
        f'Expected weighted sentiment for this event: {average_weighted_sentiment} (top {percentile}%, would rank {rank} out of {total_events} events)')

# --- Expected attendance ---
average_bijgewoond = similar_events['Bijgewoond_freq'].mean()
# Denominator of the attendance rate: mean Geregistreerd_freq plus mean Bijgewoond_freq
average_geregistreerd = similar_events['Geregistreerd_freq'].mean() + average_bijgewoond

if pd.isna(average_geregistreerd) or average_geregistreerd == 0:
    # Without any registration the rate is undefined (division by zero / NaN)
    attendance = None
    print('Expected attendance for this event: unknown (similar events have no registrations)')
else:
    attendance = round((average_bijgewoond / average_geregistreerd) * 100, 1)

    print(
        f'Expected attendance for this event: {int(average_bijgewoond)} aanwezig, {int(average_geregistreerd)} geregistreerd ({attendance}% attendance)')