In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from feature_selectors import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

In [None]:
# Load the three raw CSV exports (semicolon-separated) into DataFrames.
# Forward slashes are used instead of backslashes so the relative paths resolve on
# Windows, Linux and macOS alike. Paths are relative to the notebook's working directory.
DATA_DIR = '../../data'

gebruikers_df = pd.read_csv(f'{DATA_DIR}/gebruikersinformatie.csv', delimiter = ';')
evenement_df = pd.read_csv(f'{DATA_DIR}/evenementinformatie.csv', delimiter = ';')
aanwezigheid_df = pd.read_csv(f'{DATA_DIR}/evenementaanwezigheid.csv', delimiter = ';')

In [None]:
# Combine the attendance, event and user tables into the single working frame `df`.
# NOTE(review): merge_on_aanwezigheid is not defined in this notebook; presumably it is
# provided by the `feature_selectors` star import — confirm, along with the join keys and
# the column prefixes it produces (later cells rely on 'Evenement_*' / 'Gebruiker_*' names).
df = merge_on_aanwezigheid(aanwezigheid_df, evenement_df, gebruikers_df)

# Display the merged frame as the cell output.
df

In [None]:
# Remove nonvalid entries: drop every row that has a missing value in any column.
# One dropna() call keeps exactly the rows the per-column filter loop kept, in a single pass.
df = df.dropna()

# Rows whose membership type is 'x' are treated as nonvalid as well.
# .copy() detaches the result from the unfiltered frame, so the column assignments in the
# following cells (df[col] = ...) write to an independent frame instead of a slice and do
# not raise SettingWithCopyWarning.
df = df[df['Gebruiker_Lidmaatschapstype'] != 'x'].copy()
df

In [None]:
# Turn the textual ID columns into integers by cutting off their first character.
# assumes every ID is a one-character prefix followed by digits — TODO confirm against the data
to_convert = ["BerichtID", "EvenementID", "GebruikerID", "Evenement_OrganisatorID"]

# Only the ID columns that actually exist in the merged frame are converted.
present_id_columns = [name for name in to_convert if name in df.columns]

for col in present_id_columns:
    without_prefix = df[col].str[1:]
    df[col] = without_prefix.astype(int)

df

In [None]:
# Integer-encode the categorical columns and print the code -> label mapping for each.
# pd.factorize numbers the labels in order of first appearance, so the mapping printed
# below depends on the row order of `df`.
to_factorize = ["Aanwezigheidsstatus", "Gebruiker_Lidmaatschapstype"]

for col in to_factorize:
    codes, unique_values = pd.factorize(df[col])
    df[col] = codes

    print(f"\n{col} numerics:")
    for code, label in enumerate(unique_values):
        print(f"{code} -> {label}")


# Membership type as boolean: code 0 becomes False, every other code becomes True.
df['Gebruiker_Lidmaatschapstype'] = df['Gebruiker_Lidmaatschapstype'] != 0

# Encode studiejaar as an ordinal 1-4; labels missing from the mapping become NaN.
studiejaar_codes = {
    "eerstejaars": 1,
    "tweedejaars": 2,
    "derdejaars": 3,
    "vierdejaars": 4
}
df["Gebruiker_Studiejaar"] = df["Gebruiker_Studiejaar"].map(studiejaar_codes)

# Aanwezigheidsstatus as boolean: True only for rows with factorize code 0, False for
# every other code (codes 1 and 2 are treated alike).
# NOTE(review): this is meant as "False = afwezig, True = aanwezig", which only holds if the
# label printed as code 0 above is the 'aanwezig' status — confirm against the printed mapping.
df['Aanwezigheidsstatus'] = df['Aanwezigheidsstatus'] == 0

df

In [None]:
# One-hot encode evenementtype and studierichting.
# The dummy columns are appended after the remaining columns (event type first, then
# study direction) and the two source columns are removed.
dummy_sources = ['Evenement_EvenementType', 'Gebruiker_Studierichting']
dummy_frames = [pd.get_dummies(df[source], prefix = source) for source in dummy_sources]

df = pd.concat([df.drop(columns = dummy_sources), *dummy_frames], axis = 1)

df

In [None]:
# Parse the date columns (day-first notation) and replace them with calendar features.
date_columns = ['Evenement_EvenementDatum', 'Gebruiker_RegistratieDatum', 'Gebruiker_LaatsteLogin']

# Step 1: parse all date columns before any feature is derived from them.
parsed_dates = {col: pd.to_datetime(df[col], dayfirst = True) for col in date_columns}
df = df.assign(**parsed_dates)

# Step 2: per date column, add a weekend flag and the month number.
for col in date_columns:
    moments = df[col].dt
    df[f'{col}_is_weekend'] = moments.weekday >= 5  # weekday 5 and 6 are Saturday and Sunday
    df[f'{col}_month'] = moments.month
    # df[f'{col}_year'] = df[col].dt.year

# Step 3: the raw datetime columns are no longer needed once the features exist.
df = df.drop(columns = date_columns)

df

In [None]:
def get_all_predictors(frame = None):
    """Return the column names that may serve as model inputs.

    The identifier columns 'EvenementID', 'GebruikerID' and 'Evenement_OrganisatorID'
    are excluded; all other columns are returned in their original order. Note that the
    outcome column is not excluded here — callers remove it themselves.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame whose columns are inspected. When omitted, the notebook-level `df` is
        used, which keeps existing `get_all_predictors()` calls working unchanged.

    Returns
    -------
    list of str
        Column names of `frame` without the identifier columns. Identifier columns
        that are absent from `frame` are simply skipped instead of raising ValueError,
        matching the ID-conversion cell, which also treats them as optional.
    """
    if frame is None:
        frame = df  # fall back to the notebook's global working frame

    id_columns = {'EvenementID', 'GebruikerID', 'Evenement_OrganisatorID'}
    return [name for name in frame.columns.tolist() if name not in id_columns]

In [None]:
# Fit a depth-1 decision tree (a single split) on the attendance records of one user
# and plot it. random_state is fixed so the fit is reproducible.
model = DecisionTreeClassifier(random_state = 1, max_depth = 1)

outcome = 'Aanwezigheidsstatus'
all_predictors = get_all_predictors()
sub_df = df[df['GebruikerID'] == 1]  # only the rows of the user with ID 1

# Features: every predictor except the user-level 'Gebruiker_*' columns and the outcome
# itself. Selecting by a column list yields an independent frame, so no in-place drop
# on a derived frame is needed.
feature_columns = [
    name for name in all_predictors
    if not name.startswith('Gebruiker_') and name != outcome
]
X = sub_df[feature_columns]

Y = sub_df[outcome]
tree = model.fit(X, Y)

# feature_names is passed as a plain list: scikit-learn releases that validate this
# parameter as a list reject a pandas Index. The trailing ';' suppresses the textual
# repr of the returned annotations so only the figure is shown.
plot_tree(tree, feature_names = list(X.columns), class_names = list(map(str, tree.classes_)), fontsize = 7);