In [None]:
import pandas as pd
from feature_selectors import *
from mlxtend.frequent_patterns import apriori, association_rules

In [None]:
# Load the four semicolon-delimited source tables (Dutch file names: user info,
# event info, event attendance, message interactions with sentiment).
# Forward slashes are used on purpose: they resolve on Windows, Linux and macOS,
# whereas a backslash is only a path separator on Windows.
# The paths are relative to the notebook's working directory.
gebruikers_df = pd.read_csv('../../data/gebruikersinformatie.csv', delimiter = ';')
evenement_df = pd.read_csv('../../data/evenementinformatie.csv', delimiter = ';')
aanwezigheid_df = pd.read_csv('../../data/evenementaanwezigheid.csv', delimiter = ';')
bericht_df = pd.read_csv('../../data/berichtinteracties_met_sentiment.csv', delimiter = ';')

In [None]:
# Combine the four source tables into the single frame used by the rest of the
# notebook. The merge helpers come from the star-import of `feature_selectors`;
# their join logic is not visible in this notebook.
# Alternative merges, kept for switching the scope of the analysis:
# df = merge_on_aanwezigheid(aanwezigheid_df, evenement_df, gebruikers_df)
# df = merge_on_bericht(bericht_df, evenement_df, gebruikers_df)
df = merge_on_all(
    bericht_df,
    evenement_df,
    gebruikers_df,
    aanwezigheid_df,
)

df

In [None]:
# Remove invalid entries:
#   1. rows with a missing value in any column;
#   2. rows whose 'Gebruiker_Lidmaatschapstype' equals 'x' (treated as invalid here).
# `.copy()` turns the filtered result into an independent frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df = df.dropna()
df = df[df['Gebruiker_Lidmaatschapstype'] != 'x'].copy()
df

In [None]:
# Convert ID columns to ints by dropping the first character of every value and
# casting the remainder.
to_convert = ["BerichtID", "EvenementID", "GebruikerID", "Evenement_OrganisatorID"]

for col in to_convert:
    # Skip columns that the chosen merge did not produce, and columns that are
    # already integers: re-running this cell would otherwise fail because the
    # `.str` accessor is not available on an integer column.
    if col in df.columns and not pd.api.types.is_integer_dtype(df[col]):
        df[col] = df[col].str[1:].astype(int)

df

In [None]:
# Parse the date columns to pandas datetimes, reading ambiguous values day-first.
# Only the columns actually present in the merged frame are converted.
date_columns = ['BerichtDatum', 'Evenement_EvenementDatum', 'Gebruiker_RegistratieDatum', 'Gebruiker_LaatsteLogin']

parsed_dates = {
    name: pd.to_datetime(df[name], dayfirst = True)
    for name in date_columns
    if name in df.columns
}
df = df.assign(**parsed_dates)

df

In [None]:
# Build the boolean item matrix that apriori consumes: one True/False column per item.
# The pieces are collected in a list and concatenated once at the end, instead of
# growing the frame inside the loops, and every indicator is forced to bool so the
# matrix has a single dtype regardless of the pandas version's get_dummies default.
pieces = []

# Object (string) columns: one indicator column per distinct value.
for col in df.select_dtypes(include = ['object']).columns:
    pieces.append(pd.get_dummies(df[col], prefix = col, dtype = bool))

# DateTime columns: a weekend flag (weekday 5 = Saturday, 6 = Sunday) and one
# indicator column per month name.
for col in df.select_dtypes(include = ['datetime']).columns:
    pieces.append((df[col].dt.weekday >= 5).rename(f'{col}_is_weekend'))
    pieces.append(pd.get_dummies(df[col].dt.month_name(), prefix = f'{col}_month', dtype = bool))

# Numeric columns: flags for values strictly above / strictly below the column's
# most frequent value. Rows equal to that value get False for both flags.
for col in df.select_dtypes(include = ['int64', 'float64']).columns:
    most_common = df[col].mode()[0]  # first entry of mode() when several values tie
    pieces.append((df[col] > most_common).rename(f'High_{col}'))
    pieces.append((df[col] < most_common).rename(f'Low_{col}'))

# pd.concat raises on an empty list, so fall back to a column-less frame.
dummies_df = pd.concat(pieces, axis = 1) if pieces else pd.DataFrame(index = df.index)

dummies_df

In [None]:
# Mine frequent itemsets from the boolean item matrix (minimum support 0.1), then
# derive association rules from them using lift as the metric with threshold 1.25.
freq_itemsets = apriori(dummies_df, use_colnames = True, min_support = 0.1)
rules = association_rules(freq_itemsets, min_threshold = 1.25, metric = 'lift')
rules