# Probeersel, zie implicitALS.ipynb

SOURCE: https://medium.com/@speedfirefox1/games-recommender-system-using-lightfm-on-steam-dataset-76b05de4c187

In [27]:
import os
import sqlalchemy
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

import itertools
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from tqdm import tqdm
import sys
sys.path.append('..')

# pip install lightfm
import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation

print("LightFM version: {}".format(lightfm.__version__))

LightFM version: 1.17


In [28]:
load_dotenv()
DB_URL = os.getenv("DB_URL")
engine = create_engine(DB_URL)

try:
    connection = engine.connect()
    print("Successfully connected to the database")
except Exception as e:
    print(f"Failed to connect to the database: {e}")

print(f"SQLAlchemy version: {sqlalchemy.__version__}")

Successfully connected to the database
SQLAlchemy version: 2.0.21


In [29]:
# materialized view
query = text('SELECT * FROM epic_5')

try:
    df = pd.read_sql_query(query, connection)
except Exception as e:
    print(f"Failed to execute query: {e}")

df.head()

Unnamed: 0,PersoonId,CampagneId,aantal_sessies,aantal_bezoeken,SessieThema,SoortCampagne,TypeCampagne,ThemaDuurzaamheid,ThemaFinancieelFiscaal,ThemaInnovatie,ThemaInternationaalOndernemen,ThemaMobiliteit,ThemaOmgeving,ThemaSalesMarketingCommunicatie,ThemaStrategieEnAlgemeenManagement,ThemaTalent,ThemaWelzijn
0,482596DF-7269-E111-B43A-00505680000A,6AB43832-327D-E911-80FE-001DD8B72B62,1,0,Ruimtelijke ordening en Infrastructuur,Offline,Netwerkevenement,0,0,0,0,0,0,0,0,0,0
1,969F7BEB-BB2B-E611-BEEF-005056B06EB4,0BB4BF28-C3C4-EC11-A7B6-000D3A497E09,1,1,Netwerking,Offline,Netwerkevenement,0,0,0,0,0,0,0,0,0,0
2,456581CA-BAF6-E311-956D-005056B06EB4,92C9E001-1DB4-EC11-983F-00224883CCEA,1,2,Netwerking,Offline,Netwerkevenement,0,0,0,0,0,0,0,0,0,0
3,2F3D63A8-E167-E111-A00F-00505680000A,932690EF-20B1-ED11-83FF-6045BD895B5A,1,1,Groeien,Offline,Netwerkevenement,0,0,0,0,0,0,0,0,0,0
4,012C8C55-6769-E111-B43A-00505680000A,376FE12F-99B5-EC11-983F-00224883CCEA,14,0,Algemeen Management,Offline,Project,0,0,0,0,0,0,0,0,0,0


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52567 entries, 0 to 52566
Data columns (total 17 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   PersoonId                           52567 non-null  object
 1   CampagneId                          52567 non-null  object
 2   aantal_sessies                      52567 non-null  int64 
 3   aantal_bezoeken                     52567 non-null  int64 
 4   SessieThema                         52567 non-null  object
 5   SoortCampagne                       52567 non-null  object
 6   TypeCampagne                        52567 non-null  object
 7   ThemaDuurzaamheid                   52567 non-null  int64 
 8   ThemaFinancieelFiscaal              52567 non-null  int64 
 9   ThemaInnovatie                      52567 non-null  int64 
 10  ThemaInternationaalOndernemen       52567 non-null  int64 
 11  ThemaMobiliteit                     52567 non-null  in

In [31]:
df['ThemaDuurzaamheid'] = df['ThemaDuurzaamheid'].replace({0: 'False', 1: 'True'})
df['ThemaFinancieelFiscaal'] = df['ThemaFinancieelFiscaal'].replace({0: 'False', 1: 'True'})
df['ThemaInnovatie'] = df['ThemaInnovatie'].replace({0: 'False', 1: 'True'})
df['ThemaInternationaalOndernemen'] = df['ThemaInternationaalOndernemen'].replace({0: 'False', 1: 'True'})
df['ThemaMobiliteit'] = df['ThemaMobiliteit'].replace({0: 'False', 1: 'True'})
df['ThemaOmgeving'] = df['ThemaOmgeving'].replace({0: 'False', 1: 'True'})
df['ThemaSalesMarketingCommunicatie'] = df['ThemaSalesMarketingCommunicatie'].replace({0: 'False', 1: 'True'})
df['ThemaStrategieEnAlgemeenManagement'] = df['ThemaStrategieEnAlgemeenManagement'].replace({0: 'False', 1: 'True'})
df['ThemaTalent'] = df['ThemaTalent'].replace({0: 'False', 1: 'True'})
df['ThemaWelzijn'] = df['ThemaWelzijn'].replace({0: 'False', 1: 'True'})

In [32]:
df['aantal_sessies'] = df['aantal_sessies'].astype(str)

In [33]:
df.drop('aantal_bezoeken', axis=1, inplace=True)

In [34]:
df.head()

Unnamed: 0,PersoonId,CampagneId,aantal_sessies,SessieThema,SoortCampagne,TypeCampagne,ThemaDuurzaamheid,ThemaFinancieelFiscaal,ThemaInnovatie,ThemaInternationaalOndernemen,ThemaMobiliteit,ThemaOmgeving,ThemaSalesMarketingCommunicatie,ThemaStrategieEnAlgemeenManagement,ThemaTalent,ThemaWelzijn
0,482596DF-7269-E111-B43A-00505680000A,6AB43832-327D-E911-80FE-001DD8B72B62,1,Ruimtelijke ordening en Infrastructuur,Offline,Netwerkevenement,False,False,False,False,False,False,False,False,False,False
1,969F7BEB-BB2B-E611-BEEF-005056B06EB4,0BB4BF28-C3C4-EC11-A7B6-000D3A497E09,1,Netwerking,Offline,Netwerkevenement,False,False,False,False,False,False,False,False,False,False
2,456581CA-BAF6-E311-956D-005056B06EB4,92C9E001-1DB4-EC11-983F-00224883CCEA,1,Netwerking,Offline,Netwerkevenement,False,False,False,False,False,False,False,False,False,False
3,2F3D63A8-E167-E111-A00F-00505680000A,932690EF-20B1-ED11-83FF-6045BD895B5A,1,Groeien,Offline,Netwerkevenement,False,False,False,False,False,False,False,False,False,False
4,012C8C55-6769-E111-B43A-00505680000A,376FE12F-99B5-EC11-983F-00224883CCEA,14,Algemeen Management,Offline,Project,False,False,False,False,False,False,False,False,False,False


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52567 entries, 0 to 52566
Data columns (total 16 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   PersoonId                           52567 non-null  object
 1   CampagneId                          52567 non-null  object
 2   aantal_sessies                      52567 non-null  object
 3   SessieThema                         52567 non-null  object
 4   SoortCampagne                       52567 non-null  object
 5   TypeCampagne                        52567 non-null  object
 6   ThemaDuurzaamheid                   52567 non-null  object
 7   ThemaFinancieelFiscaal              52567 non-null  object
 8   ThemaInnovatie                      52567 non-null  object
 9   ThemaInternationaalOndernemen       52567 non-null  object
 10  ThemaMobiliteit                     52567 non-null  object
 11  ThemaOmgeving                       52567 non-null  ob

In [36]:
item_cols = ['SessieThema', 'SoortCampagne', 'TypeCampagne', 'aantal_sessies']
user_cols = ['ThemaDuurzaamheid', 'ThemaFinancieelFiscaal', 'ThemaInnovatie', 'ThemaInternationaalOndernemen', 'ThemaMobiliteit', 'ThemaOmgeving', 'ThemaSalesMarketingCommunicatie', 'ThemaStrategieEnAlgemeenManagement', 'ThemaTalent', 'ThemaWelzijn']

In [37]:
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import auc_score

def calculate_metrics(model, interactions, train, item_features, user_features, k):
    precision = precision_at_k(model, interactions, train, item_features=item_features, user_features=user_features, k=k).mean()
    recall = recall_at_k(model, interactions, train, item_features=item_features, user_features=user_features, k=k).mean()
    auc = auc_score(model, interactions, train, item_features=item_features, user_features=user_features).mean()
    return precision, recall, auc

In [38]:
# default number of recommendations
K = 10
# percentage of data used for testing
TEST_PERCENTAGE = 0.1
# model learning rate
LEARNING_RATE = 0.01
# no of latent factors
NO_COMPONENTS = 200
# no of epochs to fit model
NO_EPOCHS = 10
# no of threads to fit model
NO_THREADS = 8

# regularisation for both user and item features
ITEM_ALPHA = 0
USER_ALPHA = 0

checkpoint = 'lightFM_hybrid'
# seed for pseudonumber generations
SEED = 42

In [39]:
all_item_features = np.concatenate([df[col].unique() for col in item_cols]).tolist()
all_user_features = np.concatenate([df[col].unique() for col in user_cols]).tolist()

In [40]:
print(all_item_features)
print(all_user_features)

['Ruimtelijke ordening en Infrastructuur', 'Netwerking', 'Groeien', 'Algemeen Management', 'Unknown', 'Haven', 'Duurzaam Ondernemen', 'Welt 2.0-2023', 'Human Resources', 'Internationaal Ondernemen', 'Communicatie', 'Financieel', 'Starten', 'Digitalisering, IT & Technologie', 'Opvolging en Overname', 'Welt 2.0', 'Arbeidsmarkt', 'Jong Voka', 'Opleidingen', 'Marketing & Sales', 'Innovatie', 'Juridisch', 'Bryo', 'Milieu', 'Welt', 'Familiebedrijven', 'Plato', 'Strategie', 'Economie', 'Lidmaatschap', 'Logistiek en Transport', 'Mobiliteit', 'Onderwijs', 'Supply Chain', 'Veiligheid & Preventie', 'Energie', 'Persoonlijke vaardigheden', 'Coronavirus', 'Aankoop', 'Retail', 'Aantrekkelijke regio', 'Offline', 'Online', 'On en Offline', 'Netwerkevenement', 'Project', 'Opleiding', 'Infosessie', 'Campagne', 'Projectgebonden', '1', '14', '0', '2', '4', '11', '6', '3', '8', '13', '5', '10', '17', '18', '7', '12', '9', '15', '16', '26', '20', '27', '24', '22']
['False', 'True', 'False', 'True', 'False', 

In [41]:
dataset = Dataset()

In [42]:
dataset.fit(
    users=df['PersoonId'].unique(),
    items=df['CampagneId'].unique(),
    user_features=all_user_features,
    item_features=all_item_features
)

# quick check to determine the number of unique users and items in the data
num_users, num_items = dataset.interactions_shape()
print(f'Num users: {num_users}, num_items: {num_items}')

Num users: 16688, num_items: 1979


In [43]:
(interactions, weights) = dataset.build_interactions(zip(df['PersoonId'], df['CampagneId']))

In [44]:
unique_campaign_ids = df['CampagneId'].unique()
items = pd.DataFrame(columns=['CampagneId'])
items['CampagneId'] = unique_campaign_ids

unique_person_ids = df['PersoonId'].unique()
users = pd.DataFrame(columns=['PersoonId'])
users['PersoonId'] = unique_person_ids

In [45]:
def item_feature_generator():
    for i, row in items.iterrows():
        features = row.values[2:-1]
        yield (row['CampagneId'], features)

def user_feature_generator():
    for i, row in users.iterrows():
        features = row.values[1:]
        yield (row['PersoonId'], features)

In [46]:
item_features = dataset.build_item_features((item_id, item_feature) for item_id, item_feature in item_feature_generator())
user_features = dataset.build_user_features((user_id, user_feature) for user_id, user_feature in user_feature_generator())

In [47]:
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions, test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(seed=SEED)
)

In [48]:
uids, iids, data_interaction = cross_validation._shuffle(interactions.row, interactions.col, interactions.data, np.random.RandomState(SEED))

cutoff = int((1.0 - TEST_PERCENTAGE) * len(uids))
test_idx = slice(cutoff, None)
train_idx = slice(None, cutoff)

test_uids, test_iids = uids[test_idx], iids[test_idx]
train_uids, train_iids = uids[train_idx], iids[train_idx]

In [49]:
print(f"Shape of train interactions: {train_interactions.shape}")
print(f"Shape of test interactions: {test_interactions.shape}")

Shape of train interactions: (16688, 1979)
Shape of test interactions: (16688, 1979)


In [50]:
model = LightFM(
    no_components=NO_COMPONENTS,
    learning_rate=LEARNING_RATE,
    random_state=np.random.RandomState(SEED)
)

In [51]:
def save_model(model):
    with open(f'{checkpoint}.pickle', 'wb') as fle:
        pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)

In [52]:
train_history = {
    'AUC': [],
    'Precision': [],
    'Recall': [],
}

test_history = {
    'AUC': [],
    'Precision': [],
    'Recall': [],
}

best_score = 0

for epoch in tqdm(range(NO_EPOCHS)):

    model.fit_partial(
        interactions=train_interactions,
        user_features=user_features,
        item_features=item_features,
        epochs=1,
        num_threads=NO_THREADS
    )

    train_precision, train_recall, train_auc = calculate_metrics(model, train_interactions, None, item_features, user_features, K)
    test_precision, test_recall, test_auc = calculate_metrics(model, test_interactions, train_interactions,  item_features, user_features, K)

    train_history['AUC'].append(train_auc)
    train_history['Precision'].append(train_precision)
    train_history['Recall'].append(train_recall)

    test_history['AUC'].append(test_auc)
    test_history['Precision'].append(test_precision)
    test_history['Recall'].append(test_recall)

    if test_auc > best_score:
        best_score = test_auc
        save_model(model)

    print(f'Epoch {epoch + 1}/{NO_EPOCHS}, Train auc: {train_auc}, Test auc: {test_auc}')

  0%|          | 0/10 [00:31<?, ?it/s]


ValueError: Test interactions matrix and train interactions matrix share 534 interactions. This will cause incorrect evaluation, check your data split.