In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from feature_selectors import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the four semicolon-delimited source tables.
# Forward slashes are used so the relative paths resolve on Windows, Linux and
# macOS alike; backslash-separated paths only resolve on Windows.
DATA_DIR = '../../data'

gebruikers_df = pd.read_csv(f'{DATA_DIR}/gebruikersinformatie.csv', delimiter = ';')
evenement_df = pd.read_csv(f'{DATA_DIR}/evenementinformatie.csv', delimiter = ';')
aanwezigheid_df = pd.read_csv(f'{DATA_DIR}/evenementaanwezigheid.csv', delimiter = ';')
bericht_df = pd.read_csv(f'{DATA_DIR}/berichtinteracties_met_sentiment.csv', delimiter = ';')

In [None]:
# Combine the message, event, user and attendance tables into one frame.
# NOTE(review): merge_on_all is not defined in this notebook; presumably it comes
# from the star-import of feature_selectors. Its join keys/strategy are not
# visible here - confirm before relying on the row count.
source_frames = (bericht_df, evenement_df, gebruikers_df, aanwezigheid_df)
df = merge_on_all(*source_frames)

df

In [None]:
# Remove invalid entries.
# dropna() discards every row that has a missing value in any column in one
# pass, which is what filtering column-by-column on ~isna() achieves.
df = df.dropna()

# Rows whose membership type is the literal 'x' are filtered out as invalid.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning on a filtered slice.
df = df[df['Gebruiker_Lidmaatschapstype'] != 'x'].copy()
df

In [None]:
# Convert the ID columns to integers by dropping their first character and
# casting the remainder to int.
# NOTE(review): assumes every ID has exactly one leading non-digit character - confirm.
to_convert = ["BerichtID", "EvenementID", "GebruikerID", "Evenement_OrganisatorID"]

# assign() builds a new frame instead of writing column-by-column into a frame
# that may be a filtered slice (avoids SettingWithCopyWarning).
# Columns that are already integers are left untouched so that re-running this
# cell does not fail on the `.str` accessor.
df = df.assign(**{
    col: df[col] if pd.api.types.is_integer_dtype(df[col]) else df[col].str[1:].astype(int)
    for col in to_convert
})

df

In [None]:
# Encode the categorical columns as integer codes and print the code -> label
# mapping of each one so the encoding can be checked by eye.
to_factorize = ["Aanwezigheidsstatus", "Gebruiker_Lidmaatschapstype", "BerichtSentiment"]

for col in to_factorize:
    codes, unique_values = pd.factorize(df[col])
    df[col] = codes

    print(f"\n{col} numerics:")
    for code, label in enumerate(unique_values):
        print(f"{code} -> {label}")

# Remap sentiment code 2 to -1 so the codes span -1..1.
# NOTE(review): pd.factorize numbers labels in order of first appearance, so
# whether code 2 really is the negative label depends on the row order of the
# data - verify against the mapping printed above.
sentiment_codes = df['BerichtSentiment']
df['BerichtSentiment'] = sentiment_codes.replace({2: -1})

# Membership type as a boolean: code 0 becomes False, every other code True.
membership_codes = df['Gebruiker_Lidmaatschapstype']
df['Gebruiker_Lidmaatschapstype'] = membership_codes.astype('bool')

# Study year as an ordinal number; labels missing from the mapping become NaN.
studiejaar_codes = {
    "eerstejaars": 1,
    "tweedejaars": 2,
    "derdejaars": 3,
    "vierdejaars": 4
}
df["Gebruiker_Studiejaar"] = df["Gebruiker_Studiejaar"].map(studiejaar_codes)

df

In [None]:
# One-hot encode event type and field of study.
# get_dummies(columns=...) removes the listed source columns and appends their
# indicator columns (prefixed with the source column name) at the end of the
# frame, in the order given. This replaces two concat calls plus an in-place
# drop that passed both `columns=` and a redundant `axis=1`.
df = pd.get_dummies(df, columns = ['Evenement_EvenementType', 'Gebruiker_Studierichting'])

df

In [None]:
# Turn each date column into two derived features and drop the raw column.
date_columns = ['BerichtDatum', 'Evenement_EvenementDatum', 'Gebruiker_RegistratieDatum', 'Gebruiker_LaatsteLogin']

# Parse all date columns first (day-first format) so a parse failure happens
# before any feature column has been added.
parsed_dates = {col: pd.to_datetime(df[col], dayfirst = True) for col in date_columns}

for col, stamps in parsed_dates.items():
    # weekday is 0 (Monday) .. 6 (Sunday); 5 and 6 are the weekend days
    df[f'{col}_is_weekend'] = stamps.dt.weekday >= 5
    df[f'{col}_month'] = stamps.dt.month
    # the year is deliberately not extracted:
    # df[f'{col}_year'] = stamps.dt.year

# The raw date columns are no longer needed once the features exist.
df = df.drop(columns = date_columns)

df

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df.
fig, ax = plt.subplots(figsize = (20, 8))
sns.heatmap(df.corr(), annot = True, ax = ax)
plt.show()

In [None]:
def get_all_predictors(frame=None,
                       id_columns=('BerichtID', 'EvenementID', 'GebruikerID', 'Evenement_OrganisatorID')):
    """Return the column names usable as predictors, i.e. all non-ID columns.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame whose columns are inspected. When omitted, the notebook-level
        ``df`` is used, so existing ``get_all_predictors()`` calls keep working.
    id_columns : iterable of str, optional
        Column names to exclude. Names that are not present in ``frame`` are
        ignored instead of raising ``ValueError``.

    Returns
    -------
    list
        A new list of the remaining column names in their original order;
        callers may mutate it freely (e.g. ``.remove(outcome)``).
    """
    if frame is None:
        # Fall back to the frame prepared by the preceding notebook cells.
        frame = df

    excluded = set(id_columns)
    return [column for column in frame.columns if column not in excluded]

In [None]:
# scaler = StandardScaler()
# scaled_data = scaler.fit_transform(df[all_predictors])
# scaled_df = pd.DataFrame(scaled_data, columns = all_predictors)
# scaled_df

In [None]:
# Collapse the factorized attendance codes to a boolean: code 0 -> False,
# codes 1 and 2 -> True.
# NOTE(review): the intended meaning is "0 = aanwezig, 1 = afwezig", but which
# label received which code depends on pd.factorize's first-appearance order -
# confirm against the mapping printed by the factorize cell.
df['Aanwezigheidsstatus'] = df['Aanwezigheidsstatus'].replace(2, 1).astype('bool')

model = LogisticRegression(max_iter=10000)
# model = KNeighborsClassifier(n_neighbors=1)

# outcome name -> (score, lift); kept separate from the final `results` table
# so that one name is not reused for two different kinds of object.
scores = {}

# Every boolean column of the frame is treated as a prediction target in turn.
outcomes = df.select_dtypes(include='bool').columns.tolist()
for outcome in outcomes:
    # All non-ID columns except the target itself are candidate predictors.
    all_predictors = get_all_predictors()
    all_predictors.remove(outcome)

    # NOTE(review): `selection` is not defined in this notebook (presumably from
    # feature_selectors); it is assumed to return the chosen predictors and
    # their accuracy score - confirm.
    predictors, score = selection(model, df, df[outcome], all_predictors, mode='forward', metric='acc')

    # The naive rule always predicts the majority class, so its accuracy is the
    # share of the larger class. Using the share of True values alone would
    # understate the benchmark (and overstate the lift) whenever True is the
    # minority class.
    positive_rate = df[outcome].mean()
    naive_rule = max(positive_rate, 1 - positive_rate)
    lift = score / naive_rule

    scores[outcome] = (score, lift)

    print(
        f'Predicting: {outcome}\nBest predictors: {predictors}\nScore: {score}\nNaive rule: {naive_rule}\nLift: {lift}\n')

# Summary table with one row per outcome.
results = pd.DataFrame(
    [(outcome, score, lift) for outcome, (score, lift) in scores.items()],
    columns=['Outcome', 'Score', 'Lift'],
)

results

In [None]:
# from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# from sklearn.metrics import accuracy_score
# 
# sfs = SFS(model,
#           k_features = 'best',
#           forward = True,
#           floating = False,
#           scoring = 'accuracy',
#           cv = 10)
# 
# sfs = sfs.fit(df[predictors], df[outcomes])
# 
# print(f"Selected features: {list(sfs.k_feature_names_)}")
# print(f'Score: {sfs.k_score_}')
# print(f'Naive rule: {df[outcomes].sum() / df.shape[0]}')