In [None]:
import pandas as pd
import numpy as np
import warnings

from sklearn.neighbors import NearestNeighbors
from scipy.stats import percentileofscore

# Suppress FutureWarning messages for the rest of the session so they do not clutter cell output
warnings.simplefilter('ignore', FutureWarning)

In [None]:
# Load the three raw exports; all of them are semicolon-separated.
# Forward slashes are used in the relative paths so they resolve on Windows, macOS and Linux alike
# (backslash-separated paths are only understood on Windows).
evenement_df = pd.read_csv('../../data/evenementinformatie.csv', delimiter = ';')
bericht_df = pd.read_csv('../../data/berichtinteracties_met_sentiment.csv', delimiter = ';')
aanwezigheid_df = pd.read_csv('../../data/evenementaanwezigheid.csv', delimiter = ';')

# Collected in one list so the same column clean-up can be applied to every frame
all_df = [evenement_df, bericht_df, aanwezigheid_df]

In [None]:
# Convert all ID and datetime columns (in place, on every loaded frame)
datetime_cols = ['RegistratieDatum', 'LaatsteLogin', 'EvenementDatum', 'BerichtDatum']
id_cols = ['GebruikerID', 'EvenementID', 'OrganisatorID', 'BerichtID']

for df in all_df:
    for col in df.columns:
        if col in datetime_cols:
            # Dates are parsed day-first (e.g. 05-11-2023 is 5 November)
            df[col] = pd.to_datetime(df[col], dayfirst = True)

        elif col in id_cols and not pd.api.types.is_integer_dtype(df[col]):
            # Drop the first character of the textual ID and cast the remainder to int.
            # The dtype guard skips columns that are already integers, so re-running
            # this cell does not fail on the `.str` accessor.
            df[col] = df[col].str[1:].astype(int)

In [None]:
# Numeric encoding of the sentiment labels; labels missing from this mapping become NaN
sentiment_map = {'Negatief': -1, 'Neutraal': 0, 'Positief': 1}

sentiment_score = bericht_df['BerichtSentiment'].map(sentiment_map)
bericht_df['SentimentScore'] = sentiment_score

# Weight every message's sentiment by the number of likes it received
bericht_df['WeightedSentiment'] = sentiment_score * bericht_df['OntvangenLikes']

# Total weighted sentiment per event, one row per EvenementID
sentiment_per_event_df = (
    bericht_df
    .groupby('EvenementID')
    .agg(WeightedSentiment = ('WeightedSentiment', 'sum'))
    .reset_index()
)

sentiment_per_event_df

In [None]:
# Turn the attendance status into one boolean flag column per relevant status
status = aanwezigheid_df['Aanwezigheidsstatus']
aanwezigheid_df['Geregistreerd'] = status.eq('Geregistreerd')
aanwezigheid_df['Bijgewoond'] = status.eq('Bijgewoond')

# Summing the boolean flags counts the rows per status for every event
aanwezigheid_count_df = (
    aanwezigheid_df
    .groupby('EvenementID')[['Bijgewoond', 'Geregistreerd']]
    .sum()
    .rename(columns = {'Bijgewoond': 'Bijgewoond_freq', 'Geregistreerd': 'Geregistreerd_freq'})
    .reset_index()
)

# Rows with status 'Bijgewoond' are added to the registered total as well
aanwezigheid_count_df['Geregistreerd_freq'] = (
    aanwezigheid_count_df['Geregistreerd_freq'] + aanwezigheid_count_df['Bijgewoond_freq']
)

aanwezigheid_count_df

In [None]:
# Merge the per-event sentiment and attendance totals into the events frame.
# Events without messages / attendance rows keep NaN in the merged columns (left joins).
merged_cols = [
    col
    for col in list(sentiment_per_event_df.columns) + list(aanwezigheid_count_df.columns)
    if col != 'EvenementID'
]

evenement_df = (
    evenement_df
    # Drop columns left over from a previous run of this cell; without this a re-run
    # would produce '_x' / '_y' suffixed duplicates and break the cells below.
    .drop(columns = merged_cols, errors = 'ignore')
    .merge(sentiment_per_event_df, on = 'EvenementID', how = 'left')
    .merge(aanwezigheid_count_df, on = 'EvenementID', how = 'left')
)

evenement_df

In [None]:
# Categorical event features that get one-hot encoded (currently only the event type)
predictors = ['EvenementType']

event_dates = evenement_df['EvenementDatum']

# Prepare training data: one-hot event type plus date-derived features
training_df = pd.get_dummies(evenement_df[predictors])

# The month is encoded as a point on the unit circle so December and January end up adjacent
training_df['EvenementDatum_month_sin'] = np.sin(2 * np.pi * event_dates.dt.month / 12)
training_df['EvenementDatum_month_cos'] = np.cos(2 * np.pi * event_dates.dt.month / 12)

# weekday() is 0 for Monday, so 5 and 6 are Saturday and Sunday
training_df['EvenementDatum_day_isweekend'] = event_dates.dt.weekday.ge(5)

# Fit a nearest-neighbour index (5 neighbours per query) for finding similar events
knn = NearestNeighbors(n_neighbors = 5).fit(training_df)

training_df

In [None]:
# Read the new events and encode them with exactly the feature columns the KNN model was fitted on
new_events_df = pd.read_csv('../../data/Nieuwe evenementen.csv')
new_event_dates = pd.to_datetime(new_events_df['Evenement Datum'], dayfirst = True)

feature_columns = list(training_df.columns)

# One-hot encode the event type using the same 'EvenementType_<type>' column naming as the training data
type_dummies = pd.get_dummies(new_events_df['Evenement Type'], prefix = 'EvenementType')

# An event type that never occurred in the training data has no matching feature column.
# It cannot be passed to the fitted model, so it is reported and left out of the encoding
# (the event is then matched on its date features only).
unknown_type_cols = [col for col in type_dummies.columns if col not in feature_columns]
if unknown_type_cols:
    warnings.warn(
        f'Event types without a matching training column are ignored: {unknown_type_cols}'
    )

# Same cyclic month encoding and weekend flag as used for the training data
new_event_month = new_event_dates.dt.month
converted_input_df = type_dummies.assign(
    EvenementDatum_month_sin = np.sin(2 * np.pi * new_event_month / 12),
    EvenementDatum_month_cos = np.cos(2 * np.pi * new_event_month / 12),
    EvenementDatum_day_isweekend = new_event_dates.dt.weekday >= 5,
)

# Align with the training layout: event types absent from the input become all-False columns,
# unknown columns are dropped, and column order and dtypes match what the model was fitted on
input_df = (
    converted_input_df
    .reindex(columns = feature_columns, fill_value = False)
    .astype(training_df.dtypes.to_dict())
)

input_df

In [None]:
# Find similar events.
# Only the columns the model was fitted on are passed to the query: this cell adds a
# 'Nearest neighbours' column to input_df, so passing the whole frame would fail on a re-run.
feature_columns = list(training_df.columns)
distances, indices = knn.kneighbors(input_df[feature_columns])

# Each entry is the list of row positions (in the training data) of the nearest events
input_df['Nearest neighbours'] = indices.tolist()

input_df

In [None]:
def get_average(neighbours, column_name, events_df = None):
    """Return the mean of one column over a set of neighbouring events.

    Parameters
    ----------
    neighbours : list of int
        Row positions (not index labels) of the events to average over.
    column_name : str
        Name of the column to average.
    events_df : pandas.DataFrame, optional
        Frame to read the values from. Defaults to the notebook-level
        ``evenement_df`` when omitted.

    Returns
    -------
    float
        Mean of the selected values; missing values are skipped by pandas'
        ``mean``. NaN when there is nothing to average.
    """
    if events_df is None:
        events_df = evenement_df

    # Positional lookup: the neighbour indices are row positions, hence iloc
    return events_df[column_name].iloc[neighbours].mean()

def calculate_statistics(row, sentiment_scores = None):
    """Calculate sentiment statistics for one new event (one row).

    Parameters
    ----------
    row : mapping
        Must provide the key ``'Avg Weighted Sentiment'``.
    sentiment_scores : pandas.Series, optional
        Weighted sentiment of the existing events to compare against.
        Defaults to ``evenement_df['WeightedSentiment']`` when omitted.

    Returns
    -------
    tuple
        ``(average_weighted_sentiment, percentile, total_events, rank)`` where

        * ``percentile`` is the "top x %" position: 100 minus the percentile
          of the score among the existing events that have a score,
        * ``total_events`` is the number of existing events plus the new one,
        * ``rank`` is 1 + the number of existing events with a strictly
          higher score, so it always lies between 1 and ``total_events``.

        ``percentile`` and ``rank`` are NaN when the event's score is missing
        or no existing event has a score.
    """
    if sentiment_scores is None:
        sentiment_scores = evenement_df['WeightedSentiment']

    average_weighted_sentiment = row['Avg Weighted Sentiment']
    total_events = len(sentiment_scores) + 1

    # Events without a score hold NaN; they would turn the percentile into NaN, so leave them out
    known_scores = pd.Series(sentiment_scores).dropna()

    if pd.isna(average_weighted_sentiment) or known_scores.empty:
        return average_weighted_sentiment, np.nan, total_events, np.nan

    percentile = 100 - percentileofscore(known_scores, average_weighted_sentiment)

    # Count the better-scoring events directly instead of scaling the percentile,
    # which could yield a rank larger than the number of events
    rank = int((known_scores > average_weighted_sentiment).sum()) + 1

    return average_weighted_sentiment, percentile, total_events, rank

In [None]:
# Calculate averages for each new event, based on its nearest neighbours
neighbour_lists = input_df['Nearest neighbours']

def mean_over_neighbours(column_name):
    """Average `column_name` over the nearest neighbours of every new event."""
    return neighbour_lists.apply(lambda neighbours: get_average(neighbours, column_name))

input_df['Avg Weighted Sentiment'] = mean_over_neighbours('WeightedSentiment')

# The rank is the fourth element returned by calculate_statistics and
# needs 'Avg Weighted Sentiment' to be present on the row already
input_df['Sentiment Rank'] = input_df.apply(
    lambda row: calculate_statistics(row)[3], axis = 1
)

for target_column, source_column in (
    ('Avg Bijgewoond', 'Bijgewoond_freq'),
    ('Avg Geregistreerd', 'Geregistreerd_freq'),
):
    input_df[target_column] = mean_over_neighbours(source_column)

input_df

In [None]:
# Print a short human-readable report for every new event
for i, row in input_df.iterrows():
    print(f'\nRow: {i + 1}')

    # Sentiment: expected score and where it would place among the existing events
    average_weighted_sentiment, percentile, total_events, rank = calculate_statistics(row)
    print(f'Expected weighted sentiment for this event: {average_weighted_sentiment} (top {percentile}%, would rank {rank} out of {total_events} events)')

    # Attendance: expected attendees as a percentage of expected registrations, one decimal
    average_bijgewoond, average_geregistreerd = row['Avg Bijgewoond'], row['Avg Geregistreerd']
    attendance = round((average_bijgewoond / average_geregistreerd) * 100, 1)
    print(f'Expected attendance for this event: {int(average_bijgewoond)} aanwezig, {int(average_geregistreerd)} geregistreerd ({attendance}% attendance)')

In [None]:
# Write the predictions back to the new-events CSV.
# NOTE: this overwrites the input file, adding (or replacing) the prediction columns.
# A single forward-slash path is used for both read and write so the cell also works outside Windows.
new_events_path = '../../data/Nieuwe evenementen.csv'
output_df = pd.read_csv(new_events_path)

# input_df has one row per new event in file order, so the columns align on the default index
output_df['Sentiment'] = input_df['Avg Weighted Sentiment']
output_df['Rank'] = input_df['Sentiment Rank']

output_df['Aantal geregistreerden'] = input_df['Avg Geregistreerd']
output_df['Aantal aanwezigen'] = input_df['Avg Bijgewoond']

# Attended as a percentage of registered, rounded to one decimal
output_df['Aanwezigheidspercentage'] = (
    input_df['Avg Bijgewoond'] / input_df['Avg Geregistreerd'] * 100
).round(1)

output_df.to_csv(new_events_path, index = False)
output_df