In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression

from feature_selectors import *
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the four semicolon-delimited source tables.
# The data directory is written with forward slashes so the relative path
# resolves on Windows as well as Linux/macOS (a backslash-separated path is
# treated as one literal file name on POSIX systems).
DATA_DIR = '../../data'

gebruikers_df = pd.read_csv(f'{DATA_DIR}/gebruikersinformatie.csv', delimiter = ';')
evenement_df = pd.read_csv(f'{DATA_DIR}/evenementinformatie.csv', delimiter = ';')
aanwezigheid_df = pd.read_csv(f'{DATA_DIR}/evenementaanwezigheid.csv', delimiter = ';')
bericht_df = pd.read_csv(f'{DATA_DIR}/berichtinteracties_met_sentiment.csv', delimiter = ';')

In [None]:
# Build the single working frame `df` from the attendance, event and user tables.
# NOTE(review): `merge_on_aanwezigheid` is not defined in this notebook; it is
# presumably provided by the `feature_selectors` star import — confirm there
# how the three tables are joined and how the resulting columns are prefixed.
df = merge_on_aanwezigheid(aanwezigheid_df, evenement_df, gebruikers_df)

# Display the merged frame as the cell output.
df

In [None]:
# Remove invalid entries:
#   1. drop every row that has a missing value in any column;
#   2. drop rows whose membership type is 'x', which is treated as invalid.
df = df.dropna()

# `.copy()` makes the filtered result an independent frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df = df[df['Gebruiker_Lidmaatschapstype'] != 'x'].copy()
df

In [None]:
# Turn the identifier columns into integers: the first character of each
# value is stripped and the remainder is cast to int.
to_convert = ["EvenementID", "GebruikerID", "Evenement_OrganisatorID"]

for id_col in to_convert:
    without_prefix = df[id_col].str[1:]
    df[id_col] = without_prefix.astype(int)

df

In [None]:
# --- Integer-encode categorical columns ---------------------------------
# Each listed column is replaced by integer codes; the code -> label mapping
# is printed so the codes can be interpreted later on.
to_factorize = ["Aanwezigheidsstatus", "Gebruiker_Lidmaatschapstype"]

for cat_col in to_factorize:
    codes, unique_values = pd.factorize(df[cat_col])
    df[cat_col] = codes

    print(f"\n{cat_col} numerics:")
    for code, label in enumerate(unique_values):
        print(f"{code} -> {label}")

# --- One-hot encode nominal columns --------------------------------------
# The dummy columns are appended after the existing columns (event type
# first, then field of study) and the original text columns are removed.
one_hot_columns = ['Evenement_EvenementType', 'Gebruiker_Studierichting']
dummy_frames = [pd.get_dummies(df[name], prefix = name) for name in one_hot_columns]
df = pd.concat([df.drop(columns = one_hot_columns), *dummy_frames], axis = 1)

# --- Ordinal-encode the study year ---------------------------------------
# Labels that are not present in the mapping become NaN.
study_year_codes = {
    "eerstejaars": 1,
    "tweedejaars": 2,
    "derdejaars": 3,
    "vierdejaars": 4,
}
df["Gebruiker_Studiejaar"] = df["Gebruiker_Studiejaar"].map(study_year_codes)

df

In [None]:
# Parse the date columns (day-first format) and derive two features from each:
# a weekend flag (weekday 5 or 6) and the month number. The raw date columns
# are removed afterwards.
date_columns = ['Evenement_EvenementDatum', 'Gebruiker_RegistratieDatum', 'Gebruiker_LaatsteLogin']

# Parse everything up front so a malformed date fails before any feature is added.
parsed_dates = {name: pd.to_datetime(df[name], dayfirst = True) for name in date_columns}

for name, parsed in parsed_dates.items():
    df[f'{name}_is_weekend'] = parsed.dt.weekday >= 5
    df[f'{name}_month'] = parsed.dt.month
    # df[f'{name}_year'] = parsed.dt.year

df = df.drop(columns = date_columns)

df

In [None]:
# df[f'Young'] = df["Gebruiker_Leeftijd"] <= df["Gebruiker_Leeftijd"].median()
# # df[f'Old'] = df["Gebruiker_Leeftijd"] > df["Gebruiker_Leeftijd"].mode()[0]
# 
# df.drop(columns = ['Gebruiker_Leeftijd'], inplace = True)
# 
# df

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of `df`,
# drawn on an explicit 20x8 inch figure.
correlations = df.corr()

fig, ax = plt.subplots(figsize = (20, 8))
sns.heatmap(correlations, annot = True, ax = ax)
plt.show()

In [None]:
# # Try predicting Aanwezigheidsstatus (0 = present, 1 = absent)
# df['Aanwezigheidsstatus'] = df['Aanwezigheidsstatus'].replace(2, 1)

# Target column for the classifier.
outcome = 'Gebruiker_Lidmaatschapstype'

# Candidate predictors: every column except the target and the identifier
# columns. `list.remove` raises ValueError if one of them is missing.
all_predictors = list(df.columns)
for excluded in (outcome, 'EvenementID', 'GebruikerID', 'Evenement_OrganisatorID'):
    all_predictors.remove(excluded)

# High iteration cap for the logistic-regression solver.
model = LogisticRegression(max_iter = 10000)

In [None]:
# Run the project's own feature-selection routine in forward mode with the
# accuracy metric; it returns the chosen predictor names and their score.
# NOTE(review): `selection` is not defined in this notebook; it is presumably
# provided by the `feature_selectors` star import — confirm its exact contract
# (validation scheme, stopping rule) there.
predictors, score = selection(model, df, df[outcome], all_predictors, mode = 'forward', metric = 'acc')

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import accuracy_score

# Forward (non-floating) sequential feature selection with mlxtend, scored by
# 2-fold cross-validated accuracy; 'best' keeps the best-scoring subset size.
# NOTE(review): the selector is fitted on `predictors` (the output of the
# custom `selection` call), not on `all_predictors` — confirm this is intended.
sfs = SFS(
    model,
    k_features = 'best',
    forward = True,
    floating = False,
    scoring = 'accuracy',
    cv = 2,
).fit(df[predictors], df[outcome])

selected_features = list(sfs.k_feature_names_)
print(f"Selected features: {selected_features}")
print(f'Score: {sfs.k_score_}')

In [None]:
# Naive-rule baseline: the accuracy obtained by always predicting the most
# frequent class of the outcome, i.e. the largest class proportion.
# The outcome holds factorized integer codes, so the mean of those codes is
# not a class share once there are more than two classes, and even for a
# 0/1 target it is the share of class 1 rather than of the majority class.
naive_rule_accuracy = df[outcome].value_counts(normalize = True).max()
print(f'Naive rule: {naive_rule_accuracy}')