In [1]:
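# Installation notes for scikit-surprise (imported below as `surprise`).
# If building the package fails on Windows, the Microsoft Visual C++ redistributable may be required:
# https://learn.microsoft.com/en-US/cpp/windows/latest-supported-vc-redist?view=msvc-170#visual-studio-2015-2017-2019-and-2022
# Background on the "could not build wheels ... which use PEP 517" error:
# https://stackoverflow.com/questions/61365790/error-could-not-build-wheels-for-scipy-which-use-pep-517-and-cannot-be-installe
# Install commands: `pip install --upgrade pip setuptools wheel`, then `pip3 install scikit-surprise`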

In [2]:
import os
import sqlalchemy
import pandas as pd
from dotenv import load_dotenv 
from surprise import Reader, Dataset
from sqlalchemy import create_engine, text
from sklearn.preprocessing import OneHotEncoder
from surprise.model_selection import train_test_split
from surprise import KNNBasic

In [3]:
# Read the database URL from the environment (optionally populated from a
# local .env file) and open the connection used by the following cells.
load_dotenv()
DB_URL = os.getenv("DB_URL")
if not DB_URL:
    # Fail here with a clear message instead of an opaque error from create_engine.
    raise RuntimeError("DB_URL is not set; define it in the environment or in a .env file")
engine = create_engine(DB_URL)

try:
    connection = engine.connect()
except Exception as e:
    print(f"Failed to connect to the database: {e}")
    # Re-raise: the next cells need `connection`, so continuing would only
    # fail later with a NameError (or silently reuse a stale connection).
    raise
else:
    print("Successfully connected to the database")

print(f"SQLAlchemy version: {sqlalchemy.__version__}")

Successfully connected to the database
SQLAlchemy version: 2.0.21


In [4]:
# Load the complete epic_5 table into a DataFrame and preview the first rows.
query = text("SELECT * FROM epic_5")
df = pd.read_sql_query(sql=query, con=connection)
df.head()

Unnamed: 0,PersoonId,CampagneId,aantal_sessies,aantal_bezoeken,SessieThema,SoortCampagne,TypeCampagne,ThemaDuurzaamheid,ThemaFinancieelFiscaal,ThemaInnovatie,ThemaInternationaalOndernemen,ThemaMobiliteit,ThemaOmgeving,ThemaSalesMarketingCommunicatie,ThemaStrategieEnAlgemeenManagement,ThemaTalent,ThemaWelzijn
0,12E2CFFD-4069-E111-B43A-00505680000A,2D2494E9-0822-E811-80F0-001DD8B72B61,1,0,Netwerking,Offline,Netwerkevenement,1,1,1,1,1,1,1,1,1,1
1,4AF5B635-843D-E411-9EE6-005056B06EC4,A534E7FC-EEF3-E711-80EE-001DD8B72B62,1,0,Algemeen Management,Offline,Infosessie,0,0,0,0,0,0,0,0,0,0
2,F6EE77B3-4216-E811-80EF-001DD8B72B61,9335B453-16E5-E911-8106-001DD8B72B62,1,0,Netwerking,Offline,Netwerkevenement,0,0,0,0,0,0,0,0,0,0
3,969F7BEB-BB2B-E611-BEEF-005056B06EB4,0BB4BF28-C3C4-EC11-A7B6-000D3A497E09,1,1,Netwerking,Offline,Netwerkevenement,0,0,0,0,0,0,0,0,0,0
4,3CD88458-FDD0-E711-80EC-001DD8B72B62,1EA0E16F-D185-E811-80F3-001DD8B72B61,1,0,Internationaal Ondernemen,Offline,Opleiding,0,0,0,0,0,0,0,0,0,0


In [5]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52567 entries, 0 to 52566
Data columns (total 17 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   PersoonId                           52567 non-null  object
 1   CampagneId                          52567 non-null  object
 2   aantal_sessies                      52567 non-null  int64 
 3   aantal_bezoeken                     52567 non-null  int64 
 4   SessieThema                         52567 non-null  object
 5   SoortCampagne                       52567 non-null  object
 6   TypeCampagne                        52567 non-null  object
 7   ThemaDuurzaamheid                   52567 non-null  int64 
 8   ThemaFinancieelFiscaal              52567 non-null  int64 
 9   ThemaInnovatie                      52567 non-null  int64 
 10  ThemaInternationaalOndernemen       52567 non-null  int64 
 11  ThemaMobiliteit                     52567 non-null  in

In [6]:
# Replace the three categorical campaign columns with 0/1 indicator columns.
# NOTE: this cell rebinds `df` and drops its own input columns, so it cannot be
# re-run on its own; re-run the load cell (In [4]) first.
categorical_columns = ['SoortCampagne', 'TypeCampagne', 'SessieThema']

encoder = OneHotEncoder(sparse_output=False)

encoded_columns = encoder.fit_transform(df[categorical_columns])
feature_names = encoder.get_feature_names_out(categorical_columns)

# Reuse df's index: pd.concat(axis=1) aligns rows by index label, so a fresh
# 0..n-1 index would misalign (and introduce NaNs) if df were ever filtered
# or re-indexed before this cell.
encoded_df = pd.DataFrame(encoded_columns, columns=feature_names, index=df.index)
encoded_df = encoded_df.astype("int64")

df = pd.concat([df, encoded_df], axis=1)
df = df.drop(categorical_columns, axis=1)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52567 entries, 0 to 52566
Data columns (total 64 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   PersoonId                                           52567 non-null  object
 1   CampagneId                                          52567 non-null  object
 2   aantal_sessies                                      52567 non-null  int64 
 3   aantal_bezoeken                                     52567 non-null  int64 
 4   ThemaDuurzaamheid                                   52567 non-null  int64 
 5   ThemaFinancieelFiscaal                              52567 non-null  int64 
 6   ThemaInnovatie                                      52567 non-null  int64 
 7   ThemaInternationaalOndernemen                       52567 non-null  int64 
 8   ThemaMobiliteit                                     52567 non-null  int64 
 9   ThemaO

In [7]:
# Build a Surprise dataset from (user, item, rating) = (PersoonId, CampagneId, aantal_sessies).
# The rating scale runs from 0 up to the largest session count present in the data.
reader = Reader(rating_scale=(0, int(df.aantal_sessies.max())))
data = Dataset.load_from_df(df[['PersoonId', 'CampagneId', 'aantal_sessies']], reader)

# Hold out 25% of the interactions; a fixed random_state makes the split
# (and therefore the fitted model and its recommendations) reproducible.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)

In [8]:
# https://blog.stackademic.com/recommender-systems-with-python-code-examples-8d2ed0995f9a
# Idea: a hybrid model that combines content-based filtering (as a cold-start solution) with collaborative filtering

In [9]:
# NOTE(review): this cell repeats the dataset construction and split from In [7];
# one of the two cells can be removed.
# (user, item, rating) = (PersoonId, CampagneId, aantal_sessies), rated on 0..max session count.
reader = Reader(rating_scale=(0, int(df.aantal_sessies.max())))
data = Dataset.load_from_df(df[['PersoonId', 'CampagneId', 'aantal_sessies']], reader)

# Fixed random_state so the 75/25 split is reproducible between runs.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)

In [10]:
# Item-based KNN: with 'user_based': False the Pearson similarities are
# computed between items (here: campaigns), not between users.
sim_options = {'name': 'pearson', 'user_based': False}
model = KNNBasic(sim_options=sim_options)

# Train the model
model.fit(trainset)

# Make predictions on the held-out test set (stored in `predictions`; not evaluated in this cell)
predictions = model.test(testset)

def get_top_n_recommendations(model, campaign_id, n=10):
    """Return the users most likely to be interested in one campaign.

    Only users that have no interaction with ``campaign_id`` in the training
    data are considered. They are ranked by the model's estimated rating,
    highest first.

    Args:
        model: A fitted Surprise algorithm. Its ``trainset`` attribute (set by
            ``fit``) supplies the known users and interactions, so the result
            always matches the data the model was trained on.
        campaign_id: Raw campaign id to recommend users for.
        n: Maximum number of user ids to return.

    Returns:
        A list of at most ``n`` raw user ids, best candidate first. The list
        is empty when the campaign does not occur in the training data.
    """
    trainset = model.trainset

    try:
        inner_iid = trainset.to_inner_iid(campaign_id)
    except ValueError:
        # Campaign unknown to the trainset: there is nothing to rank.
        return []

    # Build only the (user, campaign) pairs for the requested campaign instead
    # of the full anti-testset, which holds every unseen user x item pair.
    # Users are visited in inner-id order, the same order build_anti_testset uses.
    fill = trainset.global_mean
    candidates = [
        (trainset.to_raw_uid(inner_uid), campaign_id, fill)
        for inner_uid in trainset.all_users()
        if not any(iid == inner_iid for iid, _ in trainset.ur[inner_uid])
    ]

    # Predict ratings for the candidate users on the given campaign.
    predictions = model.test(candidates)

    # Highest estimated rating first; keep the top n user ids.
    ranked = sorted(predictions, key=lambda pred: pred.est, reverse=True)
    return [pred.uid for pred in ranked[:n]]

# Campaign to find candidate users for, followed by the ranked user ids.
campagne = "08563B4A-6871-ED11-9561-6045BD8952CE"
top_recommendations = get_top_n_recommendations(model=model, campaign_id=campagne)
top_recommendations

Computing the pearson similarity matrix...
Done computing similarity matrix.


['E92F31E5-EB67-E111-A00F-00505680000A',
 'F05D022D-0E57-EA11-810C-001DD8B72B62',
 'E8C7A7AC-5C69-E111-B43A-00505680000A',
 '6CE11D6E-4969-E111-B43A-00505680000A',
 'EA7076D5-F0CC-E211-A980-005056B06EB4',
 '6B73A3B8-5C69-E111-B43A-00505680000A',
 'DE094372-9927-E411-ACD3-005056B06EC4',
 '54A1A236-F767-E111-A00F-00505680000A',
 'FF365120-5069-E111-B43A-00505680000A',
 '05986568-E467-E111-A00F-00505680000A']