In [1]:
# scikit-surprise
# https://learn.microsoft.com/en-US/cpp/windows/latest-supported-vc-redist?view=msvc-170#visual-studio-2015-2017-2019-and-2022
# https://stackoverflow.com/questions/61365790/error-could-not-build-wheels-for-scipy-which-use-pep-517-and-cannot-be-installe
# Install steps: first `pip install --upgrade pip setuptools wheel`, then `pip3 install scikit-surprise`

In [2]:
import os
import sqlalchemy
import pandas as pd
from dotenv import load_dotenv 
from surprise import Reader, Dataset
from sqlalchemy import create_engine, text
from sklearn.preprocessing import OneHotEncoder
from surprise.model_selection import train_test_split

In [3]:
# Read configuration from a local .env file (if any) into the process environment.
load_dotenv()
DB_URL = os.getenv("DB_URL")
if not DB_URL:
    # Fail early with an actionable message instead of handing None/"" to create_engine.
    raise RuntimeError("DB_URL is not set; define it in the environment or in a .env file")

engine = create_engine(DB_URL)

try:
    connection = engine.connect()
except Exception as e:
    print(f"Failed to connect to the database: {e}")
    # Re-raise: later cells use `connection`, so continuing would only
    # surface as an unrelated NameError further down the notebook.
    raise
else:
    print("Successfully connected to the database")

print(f"SQLAlchemy version: {sqlalchemy.__version__}")

Successfully connected to the database
SQLAlchemy version: 2.0.21


In [4]:
# Load the complete epic_5 table into a DataFrame through the open connection.
query = text('SELECT * FROM epic_5')
df = pd.read_sql_query(sql=query, con=connection)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,PersoonId,CampagneId,aantal_sessies,aantal_bezoeken,SoortCampagne,TypeCampagne,ThemaDuurzaamheid,ThemaFinancieelFiscaal,ThemaInnovatie,ThemaInternationaalOndernemen,ThemaMobiliteit,ThemaOmgeving,ThemaSalesMarketingCommunicatie,ThemaStrategieEnAlgemeenManagement,ThemaTalent,ThemaWelzijn
0,B954637B-12D5-E311-B4EE-005056B06EB4,28CF4C61-0F0A-E911-80FA-001DD8B72B62,11,0,Offline,Project,0,0,0,0,0,0,0,0,0,0
1,93CB9C20-B237-E611-8A0E-005056B06EC4,6AB43832-327D-E911-80FE-001DD8B72B62,1,0,Offline,Netwerkevenement,0,0,0,0,0,0,0,0,0,0
2,D54F80FA-B667-E111-A00F-00505680000A,4F63DCFB-A13D-ED11-9DB0-6045BD8952CE,1,1,Offline,Netwerkevenement,1,1,1,1,1,1,1,1,1,1
3,72DD0357-8FC8-E411-AC4D-005056B06EC4,C6D72260-E451-EC11-8C62-000D3ABFCF4A,2,0,Online,Netwerkevenement,1,1,1,0,0,0,1,0,1,1
4,CFEE0AFA-5969-E111-B43A-00505680000A,29668F0F-6C86-EB11-811A-001DD8B72B61,12,0,Offline,Netwerkevenement,0,0,0,0,1,0,1,0,1,0


In [5]:
# Print the column names, non-null counts and dtypes of the loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48724 entries, 0 to 48723
Data columns (total 16 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   PersoonId                           48724 non-null  object
 1   CampagneId                          48724 non-null  object
 2   aantal_sessies                      48724 non-null  int64 
 3   aantal_bezoeken                     48724 non-null  int64 
 4   SoortCampagne                       48724 non-null  object
 5   TypeCampagne                        48724 non-null  object
 6   ThemaDuurzaamheid                   48724 non-null  int64 
 7   ThemaFinancieelFiscaal              48724 non-null  int64 
 8   ThemaInnovatie                      48724 non-null  int64 
 9   ThemaInternationaalOndernemen       48724 non-null  int64 
 10  ThemaMobiliteit                     48724 non-null  int64 
 11  ThemaOmgeving                       48724 non-null  in

In [6]:
# One-hot encode the categorical campaign columns and replace them with
# int64 indicator columns.
# NOTE: this cell rebinds `df` without the original categorical columns, so
# re-running it without first re-running the load cell raises a KeyError.
categorical_columns = ['SoortCampagne', 'TypeCampagne']

encoder = OneHotEncoder(sparse_output=False)

encoded_columns = encoder.fit_transform(df[categorical_columns])
feature_names = encoder.get_feature_names_out(categorical_columns)

# Reuse df's index: pd.concat(axis=1) aligns on index labels, so a fresh
# RangeIndex would misalign rows (and introduce NaNs) whenever df does not
# carry the default 0..n-1 index.
encoded_df = pd.DataFrame(encoded_columns, columns=feature_names, index=df.index)
encoded_df = encoded_df.astype("int64")

df = pd.concat([df, encoded_df], axis=1)
df = df.drop(categorical_columns, axis=1)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48724 entries, 0 to 48723
Data columns (total 23 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   PersoonId                           48724 non-null  object
 1   CampagneId                          48724 non-null  object
 2   aantal_sessies                      48724 non-null  int64 
 3   aantal_bezoeken                     48724 non-null  int64 
 4   ThemaDuurzaamheid                   48724 non-null  int64 
 5   ThemaFinancieelFiscaal              48724 non-null  int64 
 6   ThemaInnovatie                      48724 non-null  int64 
 7   ThemaInternationaalOndernemen       48724 non-null  int64 
 8   ThemaMobiliteit                     48724 non-null  int64 
 9   ThemaOmgeving                       48724 non-null  int64 
 10  ThemaSalesMarketingCommunicatie     48724 non-null  int64 
 11  ThemaStrategieEnAlgemeenManagement  48724 non-null  in

In [7]:
# Use the session count as the "rating": the scale runs from 0 up to the
# largest aantal_sessies value observed in the data.
reader = Reader(rating_scale=(0, int(df.aantal_sessies.max())))

# Columns are passed in (user, item, rating) order.
data = Dataset.load_from_df(df[['PersoonId', 'CampagneId', 'aantal_sessies']], reader)

# Fixed seed so the 75/25 split (and every metric derived from it) is
# reproducible across kernel restarts.
RANDOM_STATE = 42
trainset, testset = train_test_split(data, test_size=.25, random_state=RANDOM_STATE)

In [8]:
# https://blog.stackademic.com/recommender-systems-with-python-code-examples-8d2ed0995f9a
# Hybrid model combining content-based filtering (cold-start solution) with collaborative filtering