In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

from feature_selectors import selection
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the four semicolon-delimited source tables.
# The relative paths use forward slashes so they resolve on Windows, Linux and
# macOS alike; backslash separators are only understood on Windows.
DATA_DIR = '../../data'

gebruikers_df = pd.read_csv(f'{DATA_DIR}/gebruikersinformatie.csv', delimiter = ';')                # user information
evenement_df = pd.read_csv(f'{DATA_DIR}/evenementinformatie.csv', delimiter = ';')                  # event information
aanwezigheid_df = pd.read_csv(f'{DATA_DIR}/evenementaanwezigheid.csv', delimiter = ';')             # event attendance
bericht_df = pd.read_csv(f'{DATA_DIR}/berichtinteracties_met_sentiment.csv', delimiter = ';')       # message interactions with sentiment

In [None]:
# Prefix every non-key column of the two lookup tables BEFORE merging.
# Renaming after the merge is fragile: if two tables share a column name,
# pandas appends "_x"/"_y" suffixes during the merge and the later rename
# silently misses those columns. Prefixing first keeps the names unambiguous.
evenement_prefixed = evenement_df.rename(
    columns = {col: f"Evenement_{col}" for col in evenement_df.columns if col != "EvenementID"}
)
gebruikers_prefixed = gebruikers_df.rename(
    columns = {col: f"Gebruiker_{col}" for col in gebruikers_df.columns if col != "GebruikerID"}
)

# Left joins keep every row of bericht_df; rows without a matching event or
# user get NaN in the corresponding prefixed columns.
bericht_df = (
    bericht_df
    .merge(evenement_prefixed, on = "EvenementID", how = "left")
    .merge(gebruikers_prefixed, on = "GebruikerID", how = "left")
)

bericht_df

In [None]:
# Remove invalid entries: rows without a user age and rows whose membership
# type is the literal 'x' (treated as an invalid value here).
valid_rows = (
    bericht_df['Gebruiker_Leeftijd'].notna()
    & (bericht_df['Gebruiker_Lidmaatschapstype'] != 'x')
)

# Take an explicit copy: later cells assign new columns to bericht_df, and
# doing that on a filtered slice raises pandas' SettingWithCopyWarning.
bericht_df = bericht_df.loc[valid_rows].copy()
bericht_df

In [None]:
# Turn the string identifiers into plain integers.
# The first character of every value is dropped before the cast
# (presumably a one-letter prefix in front of the number -- TODO confirm against the data).
to_convert = [
    "BerichtID",
    "GebruikerID",
    "EvenementID",
    "Evenement_OrganisatorID",
]

for id_col in to_convert:
    without_prefix = bericht_df[id_col].str[1:]
    bericht_df[id_col] = without_prefix.astype(int)

bericht_df

In [None]:
# Encode the categorical text columns as integer codes.
to_factorize = [
    "BerichtSentiment",
    "Evenement_EvenementType",
    "Gebruiker_Studierichting",
    "Gebruiker_Lidmaatschapstype",
]

for col in to_factorize:
    # pd.factorize returns (integer codes, unique labels in order of first appearance)
    codes, unique_values = pd.factorize(bericht_df[col])
    bericht_df[col] = codes

    # Print the legend so the integer codes can be traced back to their labels
    print(f"\n{col} numerics:")
    for code, label in enumerate(unique_values):
        print(f"{code} -> {label}")

# The study year is mapped by hand to 1-4 rather than factorized.
# Values missing from the mapping become NaN (Series.map behaviour).
studiejaar_to_number = {
    "Eerstejaars": 1,
    "Tweedejaars": 2,
    "Derdejaars": 3,
    "Vierdejaars": 4,
}
bericht_df["Gebruiker_Studiejaar"] = bericht_df["Gebruiker_Studiejaar"].map(studiejaar_to_number)

bericht_df

In [None]:
# Parse the date columns and replace each one with numeric day / month / year parts.
date_columns = [
    'BerichtDatum',
    'Evenement_EvenementDatum',
    'Gebruiker_RegistratieDatum',
    'Gebruiker_LaatsteLogin',
]

for date_col in date_columns:
    # dayfirst=True: ambiguous dates are read as day/month, not month/day
    parsed = pd.to_datetime(bericht_df[date_col], dayfirst=True)
    bericht_df[date_col] = parsed

    bericht_df[f'{date_col}_day'] = parsed.dt.day
    bericht_df[f'{date_col}_month'] = parsed.dt.month
    bericht_df[f'{date_col}_year'] = parsed.dt.year

# The original datetime columns are dropped once their parts are extracted
bericht_df = bericht_df.drop(columns = date_columns)

bericht_df

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of bericht_df
correlations = bericht_df.corr()

fig, ax = plt.subplots(figsize = (20, 8))
sns.heatmap(correlations, annot = True, ax = ax)
plt.show()

In [None]:
# Try every column as the regression outcome and record the score that
# `selection` reports when all remaining columns are offered as candidates.
# NOTE(review): `selection` comes from the project-local feature_selectors module;
# it presumably runs forward stepwise selection and returns
# (chosen predictors, adjusted R^2) -- confirm against its implementation.
model = LinearRegression()
results = {}

for outcome in bericht_df.columns:
    # Every column except the current outcome is a candidate predictor
    test_predictors = [col for col in bericht_df.columns if col != outcome]

    predictors, score = selection(
        model, bericht_df, bericht_df[outcome], test_predictors,
        mode = "forward", metric = "adj_r2",
    )

    results[outcome] = score

# Show the outcomes ranked from highest to lowest score, so the best one is
# at the top; the `results` dict itself is left unchanged.
ranking = (
    pd.Series(results, name = "Score")
    .rename_axis("Outcome")
    .sort_values(ascending = False)
)
ranking