In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression

# The star import stays at this position so that the explicit imports below keep
# taking precedence over any same-named objects it exports.
from feature_selectors import *
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

In [None]:
# Load the four semicolon-delimited source tables.
# Forward slashes keep these relative paths valid on Windows, Linux and macOS;
# the previous backslash-separated paths only resolved on Windows.
DATA_DIR = '../../data'

gebruikers_df = pd.read_csv(f'{DATA_DIR}/gebruikersinformatie.csv', delimiter = ';')
evenement_df = pd.read_csv(f'{DATA_DIR}/evenementinformatie.csv', delimiter = ';')
aanwezigheid_df = pd.read_csv(f'{DATA_DIR}/evenementaanwezigheid.csv', delimiter = ';')
bericht_df = pd.read_csv(f'{DATA_DIR}/berichtinteracties_met_sentiment.csv', delimiter = ';')

In [None]:
# Build the working frame from the attendance, event and user tables.
# NOTE(review): merge_on_aanwezigheid is provided by the feature_selectors star import;
# presumably it joins event and user details onto each attendance row — verify there.
df = merge_on_aanwezigheid(aanwezigheid_df, evenement_df, gebruikers_df)

# Display the merged frame as the cell output.
df

In [None]:
# Remove invalid entries: rows with a missing value in any column, and rows
# whose membership type equals 'x'.
# dropna() does in one pass what filtering column by column did before.
df = df.dropna()

# .copy() hands the following cells an independent frame to assign columns into,
# which avoids pandas' SettingWithCopyWarning on the filtered result.
df = df[df['Gebruiker_Lidmaatschapstype'] != 'x'].copy()
df

In [None]:
# Convert the ID columns to integers: drop the first character of every value
# and cast the remainder to int.
to_convert = ["EvenementID", "GebruikerID", "Evenement_OrganisatorID"]

df = df.assign(**{
    id_column: df[id_column].str[1:].astype(int)
    for id_column in to_convert
})

df

In [None]:
# Factorize: replace the labels in each listed column with integer codes and
# print the code -> label mapping so the codes can be read back later.
to_factorize = ["Aanwezigheidsstatus", "Gebruiker_Lidmaatschapstype"]

for col in to_factorize:
    codes, unique_values = pd.factorize(df[col])
    df[col] = codes

    print(f"\n{col} numerics:")
    for code, label in enumerate(unique_values):
        print(f"{code} -> {label}")

# Code 0 becomes False, every other code becomes True.
# NOTE(review): this is only lossless if exactly two membership types remain — confirm.
df['Gebruiker_Lidmaatschapstype'] = df['Gebruiker_Lidmaatschapstype'].astype(bool)

# Encode the study year with an explicit ordinal mapping.
# Labels that are not keys of this mapping become NaN (Series.map behaviour).
studiejaar_codes = {
    "eerstejaars": 1,
    "tweedejaars": 2,
    "derdejaars": 3,
    "vierdejaars": 4
}
df["Gebruiker_Studiejaar"] = df["Gebruiker_Studiejaar"].map(studiejaar_codes)

df

In [None]:
# One-hot encode event type and field of study: append the dummy columns
# (event type first, then field of study) and drop the two source columns.
dummy_sources = ['Evenement_EvenementType', 'Gebruiker_Studierichting']
dummy_frames = [pd.get_dummies(df[source], prefix = source) for source in dummy_sources]

df = pd.concat([df, *dummy_frames], axis = 1).drop(columns = dummy_sources)

df

In [None]:
# Parse the date columns (day-first format) and derive calendar features from them.
date_columns = ['Evenement_EvenementDatum', 'Gebruiker_RegistratieDatum', 'Gebruiker_LaatsteLogin']

# Parse every column first, then derive the features from the parsed values.
parsed_dates = {col: pd.to_datetime(df[col], dayfirst = True) for col in date_columns}

for col, stamps in parsed_dates.items():
    # dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
    df[f'{col}_is_weekend'] = stamps.dt.dayofweek >= 5
    df[f'{col}_month'] = stamps.dt.month
    # df[f'{col}_year'] = stamps.dt.year

# The raw date columns are dropped; only the derived features are kept.
df = df.drop(columns = date_columns)

df

In [None]:
# df[f'Young'] = df["Gebruiker_Leeftijd"] <= df["Gebruiker_Leeftijd"].median()
# # df[f'Old'] = df["Gebruiker_Leeftijd"] > df["Gebruiker_Leeftijd"].mode()[0]
# 
# df.drop(columns = ['Gebruiker_Leeftijd'], inplace = True)
# 
# df

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
fig, ax = plt.subplots(figsize = (20, 8))
sns.heatmap(df.corr(), annot = True, ax = ax)
plt.show()

In [None]:
def get_all_predictors(frame=None,
                       id_columns=('EvenementID', 'GebruikerID', 'Evenement_OrganisatorID')):
    """Return the candidate predictor columns of a frame, in column order.

    Parameters
    ----------
    frame : DataFrame-like, optional
        Object whose ``columns`` are inspected. Defaults to the notebook-level
        ``df`` so existing no-argument calls keep working.
    id_columns : iterable of str, optional
        Column names that must never be used as predictors. Defaults to the
        three identifier columns.

    Returns
    -------
    list of str
        A new list on every call (callers may mutate it freely) holding every
        column name that is not in ``id_columns``. Identifier columns that are
        absent from the frame are ignored rather than raising ``ValueError``.
    """
    if frame is None:
        # Backward-compatible default: fall back to the notebook's working frame.
        frame = df

    excluded = set(id_columns)
    return [column for column in frame.columns if column not in excluded]

In [None]:
# scaler = StandardScaler()
# scaled_data = scaler.fit_transform(df[all_predictors])
# scaled_df = pd.DataFrame(scaled_data, columns = all_predictors)
# scaled_df

In [None]:
# Target over all members: factorized code 0 stays False, codes 1 and 2 are merged into True.
# Original author's note: 0 = present, 1 = absent.
# NOTE(review): the codes come from pd.factorize, so which status is code 0 depends on the
# order in which statuses first appear in the data — confirm against the printed mapping.
df['Aanwezigheidsstatus'] = df['Aanwezigheidsstatus'].replace(2, 1).astype('bool')

## Alternative target, among registered members only: present = 0 and absent = 1
# df['Aanwezigheidsstatus'] = df['Aanwezigheidsstatus'].astype('bool')
# df = df[df['Aanwezigheidsstatus'] != 2]

model = LogisticRegression(max_iter=10000)
records = []

# Every boolean column is screened as a possible prediction target.
outcomes = df.select_dtypes(include='bool').columns.tolist()
for outcome in outcomes:
    all_predictors = get_all_predictors()

    # A dummy column must not be predicted from its sibling dummies (they are
    # mutually exclusive), so the whole dummy group is removed from the candidates.
    if 'EvenementType' in outcome:
        all_predictors = [predictor for predictor in all_predictors
                          if not predictor.startswith('Evenement_EvenementType')]

    elif 'Studierichting' in outcome:
        all_predictors = [predictor for predictor in all_predictors
                          if not predictor.startswith('Gebruiker_Studierichting')]

    else:
        all_predictors.remove(outcome)

    # NOTE(review): `selection` comes from feature_selectors; `score` is assumed to be
    # an accuracy in [0, 1] because metric='acc' is requested — confirm.
    predictors, score = selection(model, df, df[outcome], all_predictors, mode='forward', metric='acc')

    # The naive rule always predicts the majority class, so its accuracy is the larger
    # of the two class shares (previously the positive share was used, which understated
    # the baseline whenever positives were the minority and could be zero).
    positive_rate = df[outcome].sum() / df.shape[0]
    naive_rule = max(positive_rate, 1 - positive_rate)
    lift = score / naive_rule

    records.append({'Outcome': outcome, 'Score': score, 'Lift': lift})

    # Flag models that do no better than predicting a single class; compare with a
    # tolerance because both values are floating-point ratios.
    matches_constant_prediction = (
        abs(score - positive_rate) < 1e-9
        or abs(score - (1 - positive_rate)) < 1e-9
    )
    if matches_constant_prediction:
        print("!Naive rule is equal to score")

    print(
        f'Predicting: {outcome}\nBest predictors: {predictors}\nScore: {score}\nNaive rule: {naive_rule}\nLift: {lift}\n')

# One row per screened outcome.
results = pd.DataFrame(records, columns=['Outcome', 'Score', 'Lift'])

results

In [None]:
# Only membership type and event type 'bbq' do not end up with the naive rule as their best predictor, so those are used as the target in the next cell.

In [None]:
outcome = 'Gebruiker_Lidmaatschapstype'
# outcome = 'Evenement_EvenementType_bbq'

# Start from a fresh candidate list. Reusing whatever list an earlier cell left in
# `all_predictors` would silently carry over removals made for a different outcome.
all_predictors = get_all_predictors()

# If the outcome is an event-type dummy, the other event-type dummies must not be predictors.
if 'EvenementType' in outcome:
    all_predictors = [predictor for predictor in all_predictors
                      if not predictor.startswith('Evenement_EvenementType')]

# If the outcome is a field-of-study dummy, the other field-of-study dummies must not be predictors.
elif 'Studierichting' in outcome:
    all_predictors = [predictor for predictor in all_predictors
                      if not predictor.startswith('Gebruiker_Studierichting')]

# Otherwise only the outcome itself is removed from the predictors.
else:
    all_predictors.remove(outcome)

# Determine the best predictors for the outcome.
# NOTE(review): selection runs on the full frame, before the train/test split below;
# if it scores candidates on these rows, the test metrics may be optimistic — verify.
predictors, score = selection(model, df, df[outcome], all_predictors, mode='forward', metric='acc')

# Split the dataset into train (60%) and test (40%) with a fixed seed.
train_x, test_x, train_y, test_y = train_test_split(df[predictors], df[outcome], train_size=0.6, random_state=0)

In [None]:
# Fit the model on the training split, then report its parameters.
model.fit(train_x, train_y)

# coef_[0] holds one weight per predictor, in the order of `predictors`.
beta_lines = [
    f'\t{predictor}: {coef}'
    for predictor, coef in zip(predictors, model.coef_[0])
]

print('Beta coefficients:')
for line in beta_lines:
    print(line)
print(f'\nIntercept: {model.intercept_}')

In [None]:
# Predict on the training split and place the predictions next to the true labels.
train_pred = model.predict(train_x)
train_results = train_y.to_frame(name = outcome).assign(Predicted = train_pred)

train_results.head()

In [None]:
# Predict on the held-out split and place the predictions next to the true labels.
test_pred = model.predict(test_x)
test_results = test_y.to_frame(name = outcome).assign(Predicted = test_pred)

test_results.head()

In [None]:
# Confusion matrix for the training split (legend printed above the matrix).
print('Train data\nColumns = predicted\nRows = true')
confusion_matrix(y_true = train_y, y_pred = train_pred)

In [None]:
# Confusion matrix for the held-out split (legend printed above the matrix).
print('Test data\nColumns = predicted\nRows = true')
confusion_matrix(y_true = test_y, y_pred = test_pred)

In [None]:
# Results: headline metrics for the single train/test split.
# The metric functions are imported explicitly from sklearn.metrics at the top of the notebook.
# Precision uses zero_division = 0, matching the averaged-metrics cell, so a split
# without predicted positives reports 0 without emitting a warning.
report = [
    ('Accuracy', accuracy_score, {}),
    ('Precision', precision_score, {'zero_division': 0}),
    ('Recall score', recall_score, {}),
    ('F1 score', f1_score, {}),
]

for position, (label, metric_fn, metric_kwargs) in enumerate(report):
    if position:
        # Blank line between metric groups.
        print()
    print(f'{label} for train data: {metric_fn(train_y, train_pred, **metric_kwargs)}')
    print(f'{label} for test data: {metric_fn(test_y, test_pred, **metric_kwargs)}')

In [None]:
# Average each metric over many random train/test splits to reduce the dependence
# on one particular split.
# Note: this refits the shared `model` and overwrites train_x/test_x/train_y/test_y
# and train_pred/test_pred, so afterwards they reflect the last split only.
states = 200

# Metric name -> (scoring function, extra keyword arguments).
scorers = {
    "accuracy": (accuracy_score, {}),
    "precision": (precision_score, {"zero_division": 0}),
    "recall": (recall_score, {}),
    "f1": (f1_score, {}),
}

# Running totals, keyed "<metric>_train" / "<metric>_test".
metrics = {
    f"{name}_{split}": 0
    for name in scorers
    for split in ("train", "test")
}

for state in range(states):
    train_x, test_x, train_y, test_y = train_test_split(df[predictors], df[outcome], train_size = 0.6, random_state = state)
    model.fit(train_x, train_y)

    train_pred = model.predict(train_x)
    test_pred = model.predict(test_x)

    # Accumulate this split's scores.
    for name, (score_fn, score_kwargs) in scorers.items():
        metrics[f"{name}_train"] += score_fn(train_y, train_pred, **score_kwargs)
        metrics[f"{name}_test"] += score_fn(test_y, test_pred, **score_kwargs)

# Print averaged results
for metric, total in metrics.items():
    label = ' '.join(metric.split('_'))
    print(f"Average {label}: {total / states}")