In [62]:
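# Dependency: scikit-surprise
# If the install fails with "Could not build wheels for scipy which use PEP 517", see:
# https://stackoverflow.com/questions/61365790/error-could-not-build-wheels-for-scipy-which-use-pep-517-and-cannot-be-installe
# Install steps: pip install --upgrade pip setuptools wheel, then: pip3 install scikit-surprise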

In [63]:
import os
import sqlalchemy
import pandas as pd
from dotenv import load_dotenv 
from surprise import Reader, Dataset
from sqlalchemy import create_engine, text
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [64]:
# Read the database URL from the environment (optionally populated from a .env file).
load_dotenv()
DB_URL = os.getenv("DB_URL")
if not DB_URL:
    # os.getenv returns None when the variable is missing; fail here with a
    # clear message instead of passing None on to create_engine.
    raise RuntimeError("DB_URL is not set; define it in the environment or in a .env file")
engine = create_engine(DB_URL)

# Open the connection that the following cells use to query the database.
try:
    connection = engine.connect()
    print("Successfully connected to the database")
except Exception as e:
    print(f"Failed to connect to the database: {e}")
    # Re-raise: swallowing the error would leave `connection` undefined, so the
    # next cell would fail with an unrelated NameError instead of the real cause.
    raise

print(f"SQLAlchemy version: {sqlalchemy.__version__}")

Successfully connected to the database
SQLAlchemy version: 2.0.21


In [65]:
# Load every row of the epic_5 table into a DataFrame over the open connection.
query = text('SELECT * FROM epic_5')
df = pd.read_sql_query(sql=query, con=connection)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,PersoonId,CampagneId,aantal_sessies,aantal_bezoeken,SoortCampagne,TypeCampagne,ThemaDuurzaamheid,ThemaFinancieelFiscaal,ThemaInnovatie,ThemaInternationaalOndernemen,ThemaMobiliteit,ThemaOmgeving,ThemaSalesMarketingCommunicatie,ThemaStrategieEnAlgemeenManagement,ThemaTalent,ThemaWelzijn
0,A0107812-C867-E811-80F1-001DD8B72B62,92C9E001-1DB4-EC11-983F-00224883CCEA,1,0,Offline,Netwerkevenement,1,1,1,1,1,1,1,1,1,1
1,F3C6510E-27F1-E711-80EF-001DD8B72B61,75588C75-126D-E711-80E8-001DD8B72B61,1,0,Offline,Opleiding,0,0,0,0,0,0,0,0,0,0
2,01A4B561-2E30-E811-80EF-001DD8B72B62,8EC772D0-FEB8-E911-8104-001DD8B72B62,1,0,Offline,Netwerkevenement,0,0,0,0,0,0,0,0,0,0
3,5B3DF12A-5D69-E111-B43A-00505680000A,FCF1C2CF-3F55-E911-80FD-001DD8B72B61,1,0,Offline,Netwerkevenement,0,0,0,0,0,0,0,0,0,0
4,5C4289FC-B1FE-E811-80F9-001DD8B72B61,9990B71F-EF95-EA11-8111-001DD8B72B62,1,0,Offline,Netwerkevenement,0,0,0,0,0,0,0,0,0,0


In [66]:
# Summarise the loaded frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48724 entries, 0 to 48723
Data columns (total 16 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   PersoonId                           48724 non-null  object
 1   CampagneId                          48724 non-null  object
 2   aantal_sessies                      48724 non-null  int64 
 3   aantal_bezoeken                     48724 non-null  int64 
 4   SoortCampagne                       48724 non-null  object
 5   TypeCampagne                        48724 non-null  object
 6   ThemaDuurzaamheid                   48724 non-null  int64 
 7   ThemaFinancieelFiscaal              48724 non-null  int64 
 8   ThemaInnovatie                      48724 non-null  int64 
 9   ThemaInternationaalOndernemen       48724 non-null  int64 
 10  ThemaMobiliteit                     48724 non-null  int64 
 11  ThemaOmgeving                       48724 non-null  in

In [67]:
# One-hot encode the two categorical campaign columns and replace them in `df`
# with the resulting indicator columns (appended after the remaining columns).
# NOTE: this cell rebinds `df`; re-running it without reloading `df` raises a
# KeyError because the categorical columns have already been dropped.
categorical_columns = ['SoortCampagne', 'TypeCampagne']

encoder = OneHotEncoder(sparse_output=False)

encoded_columns = encoder.fit_transform(df[categorical_columns])
feature_names = encoder.get_feature_names_out(categorical_columns)

# Reuse df's index: the column-wise concat below aligns on index labels, so a
# fresh RangeIndex would only pair rows correctly while df has a default index.
encoded_df = pd.DataFrame(encoded_columns, columns=feature_names, index=df.index)
encoded_df = encoded_df.astype("int64")

# Drop the original categorical columns first, then append the indicators.
df = pd.concat([df.drop(columns=categorical_columns), encoded_df], axis=1)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48724 entries, 0 to 48723
Data columns (total 23 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   PersoonId                           48724 non-null  object
 1   CampagneId                          48724 non-null  object
 2   aantal_sessies                      48724 non-null  int64 
 3   aantal_bezoeken                     48724 non-null  int64 
 4   ThemaDuurzaamheid                   48724 non-null  int64 
 5   ThemaFinancieelFiscaal              48724 non-null  int64 
 6   ThemaInnovatie                      48724 non-null  int64 
 7   ThemaInternationaalOndernemen       48724 non-null  int64 
 8   ThemaMobiliteit                     48724 non-null  int64 
 9   ThemaOmgeving                       48724 non-null  int64 
 10  ThemaSalesMarketingCommunicatie     48724 non-null  int64 
 11  ThemaStrategieEnAlgemeenManagement  48724 non-null  in