# LightFM model epic 5

Uitgewerkt maar ruw concept van een hybride recommender system met LightFM.

## 0. Documentatie

[Artikel LightFM](https://medium.com/@speedfirefox1/games-recommender-system-using-lightfm-on-steam-dataset-76b05de4c187)

[Documentatie LightFM](https://making.lyst.com/lightfm/docs/index.html)

## 1. Imports

In [1]:
import os
import pickle
import sqlalchemy
import numpy as np
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from lightfm.data import Dataset
from lightfm.evaluation import auc_score
from sqlalchemy import create_engine, text
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import precision_at_k
from lightfm import LightFM, cross_validation



## 2. Variabelen

In [2]:
# LightFM model parameters
SEED = 42
NO_THREADS = 8
NO_EPOCHS = 10
NO_COMPONENTS = 10
TEST_PERCENTAGE = 0.2
LEARNING_RATE = 0.8
ITEM_ALPHA = 1e-7
USER_ALPHA = 1e-7
LOSS = 'logistic'
# Pickle
CHECKPOINT = 'LightFM'

## 3. Data inladen

In [3]:
load_dotenv()
DB_URL = os.getenv("DB_URL")
engine = create_engine(DB_URL)

try:
    connection = engine.connect()
    print("Successfully connected to the database")
except Exception as e:
    print(f"Failed to connect to the database: {e}")

print(f"SQLAlchemy version: {sqlalchemy.__version__}")

Successfully connected to the database
SQLAlchemy version: 2.0.21


In [4]:
# materialized view
query = text('SELECT * FROM epic_5')

try:
    df = pd.read_sql_query(query, connection)
except Exception as e:
    print(f"Failed to execute query: {e}")

df.head()

Unnamed: 0,PersoonId,CampagneId,aantal_sessies,aantal_bezoeken,SessieThema,SoortCampagne,TypeCampagne,ThemaDuurzaamheid,ThemaFinancieelFiscaal,ThemaInnovatie,ThemaInternationaalOndernemen,ThemaMobiliteit,ThemaOmgeving,ThemaSalesMarketingCommunicatie,ThemaStrategieEnAlgemeenManagement,ThemaTalent,ThemaWelzijn
0,67B0AAF0-A4D9-E211-A85C-005056B06EC4,79FF0273-BA6D-EA11-810F-001DD8B72B62,1,0,Human Resources,Offline,Opleiding,0,0,0,0,0,0,0,0,0,0
1,6A55E982-CF3D-E611-80D6-005056B06EC4,CBEE5437-9928-EB11-8117-001DD8B72B62,1,0,Financieel,Online,Netwerkevenement,0,0,0,0,0,0,0,0,0,0
2,7D950125-2574-E611-80DE-001DD8B72B61,8FCA1D31-1EB7-E811-80F4-001DD8B72B62,6,0,Netwerking,Offline,Netwerkevenement,1,1,1,1,1,1,1,1,1,1
3,541ADB1A-E068-EE11-9AE7-6045BD974EB2,7C7F820F-6842-EE11-BDF3-6045BD895BFB,4,0,Netwerking,Offline,Netwerkevenement,0,0,0,0,0,0,0,0,0,0
4,C844A936-0B4D-E311-B788-005056B06EB4,68BCCA0B-838E-E811-80F2-001DD8B72B62,1,0,Netwerking,Offline,Netwerkevenement,0,0,0,0,0,0,0,0,0,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52567 entries, 0 to 52566
Data columns (total 17 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   PersoonId                           52567 non-null  object
 1   CampagneId                          52567 non-null  object
 2   aantal_sessies                      52567 non-null  int64 
 3   aantal_bezoeken                     52567 non-null  int64 
 4   SessieThema                         52567 non-null  object
 5   SoortCampagne                       52567 non-null  object
 6   TypeCampagne                        52567 non-null  object
 7   ThemaDuurzaamheid                   52567 non-null  int64 
 8   ThemaFinancieelFiscaal              52567 non-null  int64 
 9   ThemaInnovatie                      52567 non-null  int64 
 10  ThemaInternationaalOndernemen       52567 non-null  int64 
 11  ThemaMobiliteit                     52567 non-null  in

## 3. Data voorbereiden

In [6]:
# binaire integers van persoon thema's omzetten naar binaire strings

columns = ['ThemaDuurzaamheid', 'ThemaFinancieelFiscaal', 'ThemaInnovatie', 'ThemaInternationaalOndernemen', 
           'ThemaMobiliteit', 'ThemaOmgeving', 'ThemaSalesMarketingCommunicatie', 
           'ThemaStrategieEnAlgemeenManagement', 'ThemaTalent', 'ThemaWelzijn']

for col in columns:
    df[col] = df[col].replace({0: col + '_False', 1: col + '_True'})

In [7]:
# aantal bezoeken laten vallen, focus ligt op aantal_sessies
df.drop('aantal_bezoeken', axis=1, inplace=True)

In [8]:
df.head()

Unnamed: 0,PersoonId,CampagneId,aantal_sessies,SessieThema,SoortCampagne,TypeCampagne,ThemaDuurzaamheid,ThemaFinancieelFiscaal,ThemaInnovatie,ThemaInternationaalOndernemen,ThemaMobiliteit,ThemaOmgeving,ThemaSalesMarketingCommunicatie,ThemaStrategieEnAlgemeenManagement,ThemaTalent,ThemaWelzijn
0,67B0AAF0-A4D9-E211-A85C-005056B06EC4,79FF0273-BA6D-EA11-810F-001DD8B72B62,1,Human Resources,Offline,Opleiding,ThemaDuurzaamheid_False,ThemaFinancieelFiscaal_False,ThemaInnovatie_False,ThemaInternationaalOndernemen_False,ThemaMobiliteit_False,ThemaOmgeving_False,ThemaSalesMarketingCommunicatie_False,ThemaStrategieEnAlgemeenManagement_False,ThemaTalent_False,ThemaWelzijn_False
1,6A55E982-CF3D-E611-80D6-005056B06EC4,CBEE5437-9928-EB11-8117-001DD8B72B62,1,Financieel,Online,Netwerkevenement,ThemaDuurzaamheid_False,ThemaFinancieelFiscaal_False,ThemaInnovatie_False,ThemaInternationaalOndernemen_False,ThemaMobiliteit_False,ThemaOmgeving_False,ThemaSalesMarketingCommunicatie_False,ThemaStrategieEnAlgemeenManagement_False,ThemaTalent_False,ThemaWelzijn_False
2,7D950125-2574-E611-80DE-001DD8B72B61,8FCA1D31-1EB7-E811-80F4-001DD8B72B62,6,Netwerking,Offline,Netwerkevenement,ThemaDuurzaamheid_True,ThemaFinancieelFiscaal_True,ThemaInnovatie_True,ThemaInternationaalOndernemen_True,ThemaMobiliteit_True,ThemaOmgeving_True,ThemaSalesMarketingCommunicatie_True,ThemaStrategieEnAlgemeenManagement_True,ThemaTalent_True,ThemaWelzijn_True
3,541ADB1A-E068-EE11-9AE7-6045BD974EB2,7C7F820F-6842-EE11-BDF3-6045BD895BFB,4,Netwerking,Offline,Netwerkevenement,ThemaDuurzaamheid_False,ThemaFinancieelFiscaal_False,ThemaInnovatie_False,ThemaInternationaalOndernemen_False,ThemaMobiliteit_False,ThemaOmgeving_False,ThemaSalesMarketingCommunicatie_False,ThemaStrategieEnAlgemeenManagement_False,ThemaTalent_False,ThemaWelzijn_False
4,C844A936-0B4D-E311-B788-005056B06EB4,68BCCA0B-838E-E811-80F2-001DD8B72B62,1,Netwerking,Offline,Netwerkevenement,ThemaDuurzaamheid_False,ThemaFinancieelFiscaal_False,ThemaInnovatie_False,ThemaInternationaalOndernemen_False,ThemaMobiliteit_False,ThemaOmgeving_False,ThemaSalesMarketingCommunicatie_False,ThemaStrategieEnAlgemeenManagement_False,ThemaTalent_False,ThemaWelzijn_False


In [9]:
# item_features en user_features invullen voor lightFM

item_cols = ['SessieThema', 'SoortCampagne', 'TypeCampagne']
user_cols = ['ThemaDuurzaamheid', 'ThemaFinancieelFiscaal', 'ThemaInnovatie', 'ThemaInternationaalOndernemen', 'ThemaMobiliteit', 'ThemaOmgeving', 'ThemaSalesMarketingCommunicatie', 'ThemaStrategieEnAlgemeenManagement', 'ThemaTalent', 'ThemaWelzijn']

all_item_features = np.concatenate([df[col].unique() for col in item_cols]).tolist()
all_user_features = np.concatenate([df[col].unique() for col in user_cols]).tolist()

print(all_item_features)
print(all_user_features)

['Human Resources', 'Financieel', 'Netwerking', 'Economie', 'Innovatie', 'Duurzaam Ondernemen', 'Algemeen Management', 'Starten', 'Lidmaatschap', 'Digitalisering, IT & Technologie', 'Groeien', 'Onderwijs', 'Opvolging en Overname', 'Internationaal Ondernemen', 'Familiebedrijven', 'Unknown', 'Logistiek en Transport', 'Marketing & Sales', 'Communicatie', 'Strategie', 'Haven', 'Milieu', 'Bryo', 'Welt', 'Welt 2.0', 'Plato', 'Arbeidsmarkt', 'Welt 2.0-2023', 'Juridisch', 'Jong Voka', 'Supply Chain', 'Ruimtelijke ordening en Infrastructuur', 'Mobiliteit', 'Opleidingen', 'Energie', 'Veiligheid & Preventie', 'Coronavirus', 'Retail', 'Aankoop', 'Persoonlijke vaardigheden', 'Aantrekkelijke regio', 'Offline', 'Online', 'On en Offline', 'Opleiding', 'Netwerkevenement', 'Project', 'Infosessie', 'Projectgebonden', 'Campagne']
['ThemaDuurzaamheid_False', 'ThemaDuurzaamheid_True', 'ThemaFinancieelFiscaal_False', 'ThemaFinancieelFiscaal_True', 'ThemaInnovatie_False', 'ThemaInnovatie_True', 'ThemaInternat

In [10]:
# toch groeperen op aantal sessies per persoon per campagne, geen scheiding tussen sessiethema
df = df.groupby(['PersoonId', 'CampagneId'])['aantal_sessies'].sum().reset_index()

## 4. Model trainen

In [11]:
dataset = Dataset()

dataset.fit(
    users=df['PersoonId'],
    items=df['CampagneId'],
    user_features=all_user_features,
    item_features=all_item_features
)

# quick check to determine the number of unique users and items in the data
num_users, num_items = dataset.interactions_shape()
print(f'Num users: {num_users}, num_items: {num_items}')

Num users: 16688, num_items: 1979


In [12]:
(interactions, weights) = dataset.build_interactions(zip(df['PersoonId'], df['CampagneId'], df['aantal_sessies']))

In [13]:
unique_campaign_ids = df['CampagneId'].unique()
items = pd.DataFrame(columns=['CampagneId'])
items['CampagneId'] = unique_campaign_ids

unique_person_ids = df['PersoonId'].unique()
users = pd.DataFrame(columns=['PersoonId'])
users['PersoonId'] = unique_person_ids

In [14]:
def item_feature_generator():
    for _, row in items.iterrows():
        features = row.values[1:]
        yield (row['CampagneId'], features)

def user_feature_generator():
    for _, row in users.iterrows():
        features = row.values[1:]
        yield (row['PersoonId'], features)

In [15]:
item_features = dataset.build_item_features((item_id, item_feature) for item_id, item_feature in item_feature_generator())
user_features = dataset.build_user_features((user_id, user_feature) for user_id, user_feature in user_feature_generator())

In [16]:
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions, test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(seed=SEED)
)

In [17]:
uids, iids, data_interaction = cross_validation._shuffle(interactions.row, interactions.col, interactions.data, np.random.RandomState(SEED))

cutoff = int((1.0 - TEST_PERCENTAGE) * len(uids))
test_idx = slice(cutoff, None)
train_idx = slice(None, cutoff)

test_uids, test_iids = uids[test_idx], iids[test_idx]
train_uids, train_iids = uids[train_idx], iids[train_idx]

In [18]:
print(f"Shape of train interactions: {train_interactions.shape}")
print(f"Shape of test interactions: {test_interactions.shape}")

Shape of train interactions: (16688, 1979)
Shape of test interactions: (16688, 1979)


In [19]:
model = LightFM(
    no_components=NO_COMPONENTS,
    learning_rate=LEARNING_RATE,
    random_state=np.random.RandomState(SEED),
    loss=LOSS,
    item_alpha=ITEM_ALPHA,
    user_alpha=USER_ALPHA
)

In [20]:
def save_model(model):
    with open(f'{CHECKPOINT}.pickle', 'wb') as fle:
        pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
train_auc_history = []
test_auc_history = []

best_score = 0

for epoch in tqdm(range(NO_EPOCHS)):
    model.fit_partial(
        interactions=train_interactions,
        user_features=user_features,
        item_features=item_features,
        epochs=NO_EPOCHS,
        num_threads=NO_THREADS
    )

    train_auc = auc_score(model, train_interactions, user_features=user_features, item_features=item_features).mean()
    test_auc = auc_score(model, test_interactions, train_interactions, user_features=user_features, item_features=item_features).mean()

    train_auc_history.append(train_auc)
    test_auc_history.append(test_auc)

    if test_auc > best_score:
        best_score = test_auc
        save_model(model)

    print(f'Epoch {epoch + 1}/{10}, Train AUC: {train_auc}, Test AUC: {test_auc}')

 10%|█         | 1/10 [00:02<00:23,  2.60s/it]

Epoch 1/10, Train AUC: 0.8035188317298889, Test AUC: 0.7822922468185425


 20%|██        | 2/10 [00:05<00:20,  2.57s/it]

Epoch 2/10, Train AUC: 0.8061642646789551, Test AUC: 0.7825275659561157


 30%|███       | 3/10 [00:07<00:18,  2.58s/it]

Epoch 3/10, Train AUC: 0.8081456422805786, Test AUC: 0.782661497592926


 40%|████      | 4/10 [00:10<00:15,  2.58s/it]

Epoch 4/10, Train AUC: 0.8098510503768921, Test AUC: 0.7828184366226196


 50%|█████     | 5/10 [00:12<00:12,  2.53s/it]

Epoch 5/10, Train AUC: 0.8114545941352844, Test AUC: 0.7829868197441101


 60%|██████    | 6/10 [00:15<00:10,  2.52s/it]

Epoch 6/10, Train AUC: 0.8129845261573792, Test AUC: 0.7831478118896484


 70%|███████   | 7/10 [00:17<00:07,  2.49s/it]

Epoch 7/10, Train AUC: 0.8145073056221008, Test AUC: 0.7833260297775269


 80%|████████  | 8/10 [00:20<00:04,  2.49s/it]

Epoch 8/10, Train AUC: 0.8160263895988464, Test AUC: 0.7835315465927124


 90%|█████████ | 9/10 [00:22<00:02,  2.47s/it]

Epoch 9/10, Train AUC: 0.8175684213638306, Test AUC: 0.7837430238723755


100%|██████████| 10/10 [00:25<00:00,  2.51s/it]

Epoch 10/10, Train AUC: 0.8191269636154175, Test AUC: 0.7839761972427368





## 5. Model evalueren

[Interpretatie metrics](https://stackoverflow.com/questions/45451161/evaluating-the-lightfm-recommendation-model/45466481#45466481)

In [22]:
def calculate_metrics(model, test, train, item_features, user_features, k):
    precision = precision_at_k(model=model, test_interactions=test, train_interactions=train, item_features=item_features, user_features=user_features, k=k).mean()
    recall = recall_at_k(model=model, test_interactions=test, train_interactions=train, item_features=item_features, user_features=user_features, k=k).mean()
    auc = auc_score(model=model, test_interactions=test, train_interactions=train, item_features=item_features, user_features=user_features).mean()
    return print('Precision: ', precision, '\nRecall: ', recall, '\nAUC: ', auc)

In [23]:
calculate_metrics(model, test_interactions, train_interactions, item_features, user_features, 1)

Precision:  0.032859974 
Recall:  0.018166053749869508 
AUC:  0.7839762


## 6. Model gebruiken

NOG EENS NAKIJKEN MET DOCUMENTATIE VOOR DE RECOMMEND FUNCTIE

In [24]:
def get_top_items_for_user(user_id):
    # map de persoon van string naar integer
    user_id_internal = dataset.mapping()[0][user_id]

    # Get the internal IDs of all items
    item_ids_internal = np.array(list(dataset.mapping()[2].values()))

    # Predict the scores for all items for the user
    scores = model.predict(user_id_internal, item_ids_internal)

    # Get the indices of the top 5 items
    top_items_indices = np.argsort(-scores)[:5]

    # map de campagnes van integer naar string en geef de top terug
    top_items_ids = [list(dataset.mapping()[2].keys())[i] for i in top_items_indices]

    return f'Top 5 recommended items for user: {top_items_ids}'

# Example usage:
print(get_top_items_for_user('6E42A199-9F70-E911-80FE-001DD8B72B62'))

Top 5 recommended items for user: ['EA3C6FE0-758E-E811-80F3-001DD8B72B61', '317CD023-2B1E-ED11-B83D-000D3AAD783A', 'A2F63005-8EA8-ED11-AAD1-6045BD895D85', 'A82B1F58-6CE3-E911-8106-001DD8B72B62', '2D2494E9-0822-E811-80F0-001DD8B72B61']


In [25]:
def get_top_users_for_item(item_id):
    # map de campagne van string naar integer
    item_id_internal = dataset.mapping()[2][item_id]

    # Get the internal IDs of all users
    user_ids_internal = np.array(list(dataset.mapping()[0].values()))

    # Predict the scores for all users for the item
    scores = model.predict(user_ids_internal, np.repeat(item_id_internal, len(user_ids_internal)))

    # Get the indices of the top 5 users
    top_users_indices = np.argsort(-scores)[:20]

    # Get the external IDs of the top 5 users
    top_users_ids = [list(dataset.mapping()[0].keys())[i] for i in top_users_indices]

    return f'Top 20 recommended users for item: {top_users_ids}'

print(get_top_users_for_item('8FCA1D31-1EB7-E811-80F4-001DD8B72B62'))

Top 20 recommended users for item: ['6DEC4D09-6469-E111-B43A-00505680000A', '563E6E7E-5A69-E111-B43A-00505680000A', 'C1690BC1-EB67-E111-A00F-00505680000A', '4E75FE53-C567-E711-80E8-001DD8B72B61', 'C9B9C7A3-AE67-E111-A00F-00505680000A', '7A707C90-E567-E111-A00F-00505680000A', '1040620F-5769-E111-B43A-00505680000A', '51D5CADB-EC67-E111-A00F-00505680000A', 'CC5144D7-4A16-E411-956D-005056B06EB4', '9A8CF4B0-E667-E111-A00F-00505680000A', '94BCF437-58A5-E411-BDB1-005056B06EC4', '01E816C6-6B69-E111-B43A-00505680000A', 'F38A868E-3A00-E811-80EF-001DD8B72B61', 'C12E2146-1468-E111-A00F-00505680000A', '396A4C1C-1318-E611-8A0E-005056B06EC4', 'F017D190-4D69-E111-B43A-00505680000A', '85C4FCB5-6769-E111-B43A-00505680000A', '2D096ED6-87C3-E211-A980-005056B06EB4', 'C4036697-1368-E111-A00F-00505680000A', '5A0D94FB-6669-E111-B43A-00505680000A']
