# LightFM hybrid recommender system

Uitgewerkt maar ruw concept van een hybride recommender system met LightFM. Het model heeft last van slechte performantie. 

Iets om te proberen: [GitHub Issue](https://github.com/lyst/lightfm/issues/297)

[Artikel LightFM](https://medium.com/@speedfirefox1/games-recommender-system-using-lightfm-on-steam-dataset-76b05de4c187)

[Documentatie LightFM](https://making.lyst.com/lightfm/docs/index.html)

In [1]:
import os
import sqlalchemy
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
import sys
sys.path.append('..')

# pip install lightfm
import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation, evaluation

print("LightFM version: {}".format(lightfm.__version__))



LightFM version: 1.17


In [2]:
load_dotenv()
DB_URL = os.getenv("DB_URL")
engine = create_engine(DB_URL)

try:
    connection = engine.connect()
    print("Successfully connected to the database")
except Exception as e:
    print(f"Failed to connect to the database: {e}")

print(f"SQLAlchemy version: {sqlalchemy.__version__}")

Successfully connected to the database
SQLAlchemy version: 2.0.21


In [3]:
# materialized view
query = text('SELECT * FROM epic_5')

try:
    df = pd.read_sql_query(query, connection)
except Exception as e:
    print(f"Failed to execute query: {e}")

df.head()

Unnamed: 0,PersoonId,CampagneId,aantal_sessies,aantal_bezoeken,SessieThema,SoortCampagne,TypeCampagne,ThemaDuurzaamheid,ThemaFinancieelFiscaal,ThemaInnovatie,ThemaInternationaalOndernemen,ThemaMobiliteit,ThemaOmgeving,ThemaSalesMarketingCommunicatie,ThemaStrategieEnAlgemeenManagement,ThemaTalent,ThemaWelzijn
0,BEDEC9C4-5369-E111-B43A-00505680000A,80DA579E-E5CC-EB11-8120-001DD8B72B61,1,0,Algemeen Management,Offline,Netwerkevenement,0,0,0,0,0,0,0,0,0,0
1,4F9EF02B-72A9-EA11-8110-001DD8B72B61,CF6BAEF3-EDC4-EA11-8111-001DD8B72B61,1,0,Internationaal Ondernemen,Online,Opleiding,0,0,0,0,0,0,0,0,0,0
2,A2805188-7369-E111-B43A-00505680000A,8140AAA9-213A-EB11-8118-001DD8B72B62,11,0,Algemeen Management,Offline,Project,0,0,0,0,0,0,0,0,0,0
3,6B7AE2BA-4669-E111-B43A-00505680000A,317CD023-2B1E-ED11-B83D-000D3AAD783A,1,1,Jong Voka,Offline,Netwerkevenement,1,1,1,1,1,1,1,1,1,1
4,0922DBF5-AD67-E111-A00F-00505680000A,FCB9F43A-CDC7-E811-80F7-001DD8B72B61,1,0,Groeien,Offline,Project,0,0,0,0,0,0,0,0,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52567 entries, 0 to 52566
Data columns (total 17 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   PersoonId                           52567 non-null  object
 1   CampagneId                          52567 non-null  object
 2   aantal_sessies                      52567 non-null  int64 
 3   aantal_bezoeken                     52567 non-null  int64 
 4   SessieThema                         52567 non-null  object
 5   SoortCampagne                       52567 non-null  object
 6   TypeCampagne                        52567 non-null  object
 7   ThemaDuurzaamheid                   52567 non-null  int64 
 8   ThemaFinancieelFiscaal              52567 non-null  int64 
 9   ThemaInnovatie                      52567 non-null  int64 
 10  ThemaInternationaalOndernemen       52567 non-null  int64 
 11  ThemaMobiliteit                     52567 non-null  in

In [5]:
df['ThemaDuurzaamheid'] = df['ThemaDuurzaamheid'].replace({0: 'False', 1: 'True'})
df['ThemaFinancieelFiscaal'] = df['ThemaFinancieelFiscaal'].replace({0: 'False', 1: 'True'})
df['ThemaInnovatie'] = df['ThemaInnovatie'].replace({0: 'False', 1: 'True'})
df['ThemaInternationaalOndernemen'] = df['ThemaInternationaalOndernemen'].replace({0: 'False', 1: 'True'})
df['ThemaMobiliteit'] = df['ThemaMobiliteit'].replace({0: 'False', 1: 'True'})
df['ThemaOmgeving'] = df['ThemaOmgeving'].replace({0: 'False', 1: 'True'})
df['ThemaSalesMarketingCommunicatie'] = df['ThemaSalesMarketingCommunicatie'].replace({0: 'False', 1: 'True'})
df['ThemaStrategieEnAlgemeenManagement'] = df['ThemaStrategieEnAlgemeenManagement'].replace({0: 'False', 1: 'True'})
df['ThemaTalent'] = df['ThemaTalent'].replace({0: 'False', 1: 'True'})
df['ThemaWelzijn'] = df['ThemaWelzijn'].replace({0: 'False', 1: 'True'})

In [6]:
df.drop('aantal_bezoeken', axis=1, inplace=True)

In [7]:
df.head()

Unnamed: 0,PersoonId,CampagneId,aantal_sessies,SessieThema,SoortCampagne,TypeCampagne,ThemaDuurzaamheid,ThemaFinancieelFiscaal,ThemaInnovatie,ThemaInternationaalOndernemen,ThemaMobiliteit,ThemaOmgeving,ThemaSalesMarketingCommunicatie,ThemaStrategieEnAlgemeenManagement,ThemaTalent,ThemaWelzijn
0,BEDEC9C4-5369-E111-B43A-00505680000A,80DA579E-E5CC-EB11-8120-001DD8B72B61,1,Algemeen Management,Offline,Netwerkevenement,False,False,False,False,False,False,False,False,False,False
1,4F9EF02B-72A9-EA11-8110-001DD8B72B61,CF6BAEF3-EDC4-EA11-8111-001DD8B72B61,1,Internationaal Ondernemen,Online,Opleiding,False,False,False,False,False,False,False,False,False,False
2,A2805188-7369-E111-B43A-00505680000A,8140AAA9-213A-EB11-8118-001DD8B72B62,11,Algemeen Management,Offline,Project,False,False,False,False,False,False,False,False,False,False
3,6B7AE2BA-4669-E111-B43A-00505680000A,317CD023-2B1E-ED11-B83D-000D3AAD783A,1,Jong Voka,Offline,Netwerkevenement,True,True,True,True,True,True,True,True,True,True
4,0922DBF5-AD67-E111-A00F-00505680000A,FCB9F43A-CDC7-E811-80F7-001DD8B72B61,1,Groeien,Offline,Project,False,False,False,False,False,False,False,False,False,False


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52567 entries, 0 to 52566
Data columns (total 16 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   PersoonId                           52567 non-null  object
 1   CampagneId                          52567 non-null  object
 2   aantal_sessies                      52567 non-null  int64 
 3   SessieThema                         52567 non-null  object
 4   SoortCampagne                       52567 non-null  object
 5   TypeCampagne                        52567 non-null  object
 6   ThemaDuurzaamheid                   52567 non-null  object
 7   ThemaFinancieelFiscaal              52567 non-null  object
 8   ThemaInnovatie                      52567 non-null  object
 9   ThemaInternationaalOndernemen       52567 non-null  object
 10  ThemaMobiliteit                     52567 non-null  object
 11  ThemaOmgeving                       52567 non-null  ob

In [9]:
item_cols = ['SessieThema', 'SoortCampagne', 'TypeCampagne']
user_cols = ['ThemaDuurzaamheid', 'ThemaFinancieelFiscaal', 'ThemaInnovatie', 'ThemaInternationaalOndernemen', 'ThemaMobiliteit', 'ThemaOmgeving', 'ThemaSalesMarketingCommunicatie', 'ThemaStrategieEnAlgemeenManagement', 'ThemaTalent', 'ThemaWelzijn']

In [10]:
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import auc_score

def calculate_metrics(model, interactions, train, item_features, user_features, k):
    precision = precision_at_k(model, interactions, train, item_features=item_features, user_features=user_features, k=k).mean()
    recall = recall_at_k(model, interactions, train, item_features=item_features, user_features=user_features, k=k).mean()
    auc = auc_score(model, interactions, train, item_features=item_features, user_features=user_features).mean()
    return precision, recall, auc

In [11]:
# default number of recommendations
K = 10
# percentage of data used for testing
TEST_PERCENTAGE = 0.1
# model learning rate
LEARNING_RATE = 0.01
# no of latent factors
NO_COMPONENTS = 20
# no of epochs to fit model
NO_EPOCHS = 10
# no of threads to fit model
NO_THREADS = 8

# regularisation for both user and item features
ITEM_ALPHA = 0
USER_ALPHA = 0

checkpoint = 'lightFM_hybrid'
# seed for pseudonumber generations
SEED = 42

In [12]:
all_item_features = np.concatenate([df[col].unique() for col in item_cols]).tolist()
all_user_features = np.concatenate([df[col].unique() for col in user_cols]).tolist()

In [13]:
print(all_item_features)
print(all_user_features)

['Algemeen Management', 'Internationaal Ondernemen', 'Jong Voka', 'Groeien', 'Duurzaam Ondernemen', 'Arbeidsmarkt', 'Unknown', 'Netwerking', 'Human Resources', 'Starten', 'Innovatie', 'Marketing & Sales', 'Economie', 'Opvolging en Overname', 'Financieel', 'Opleidingen', 'Bryo', 'Digitalisering, IT & Technologie', 'Haven', 'Lidmaatschap', 'Ruimtelijke ordening en Infrastructuur', 'Strategie', 'Supply Chain', 'Logistiek en Transport', 'Welt 2.0', 'Energie', 'Welt', 'Communicatie', 'Mobiliteit', 'Familiebedrijven', 'Welt 2.0-2023', 'Juridisch', 'Milieu', 'Plato', 'Onderwijs', 'Veiligheid & Preventie', 'Coronavirus', 'Aankoop', 'Persoonlijke vaardigheden', 'Retail', 'Aantrekkelijke regio', 'Offline', 'Online', 'On en Offline', 'Netwerkevenement', 'Opleiding', 'Project', 'Projectgebonden', 'Infosessie', 'Campagne']
['False', 'True', 'False', 'True', 'False', 'True', 'False', 'True', 'False', 'True', 'False', 'True', 'False', 'True', 'False', 'True', 'False', 'True', 'False', 'True']


In [14]:
dataset = Dataset()

In [15]:
dataset.fit(
    users=df['PersoonId'].unique(),
    items=df['CampagneId'].unique(),
    user_features=all_user_features,
    item_features=all_item_features
)

# quick check to determine the number of unique users and items in the data
num_users, num_items = dataset.interactions_shape()
print(f'Num users: {num_users}, num_items: {num_items}')

Num users: 16688, num_items: 1979


In [16]:
df = df.groupby(['PersoonId', 'CampagneId'])['aantal_sessies'].sum().reset_index()

In [17]:
(interactions, weights) = dataset.build_interactions(zip(df['PersoonId'], df['CampagneId'], df['aantal_sessies']))

In [18]:
unique_campaign_ids = df['CampagneId'].unique()
items = pd.DataFrame(columns=['CampagneId'])
items['CampagneId'] = unique_campaign_ids

unique_person_ids = df['PersoonId'].unique()
users = pd.DataFrame(columns=['PersoonId'])
users['PersoonId'] = unique_person_ids

In [19]:
def item_feature_generator():
    for i, row in items.iterrows():
        features = row.values[2:-1]
        yield (row['CampagneId'], features)

def user_feature_generator():
    for i, row in users.iterrows():
        features = row.values[1:]
        yield (row['PersoonId'], features)

In [20]:
item_features = dataset.build_item_features((item_id, item_feature) for item_id, item_feature in item_feature_generator())
user_features = dataset.build_user_features((user_id, user_feature) for user_id, user_feature in user_feature_generator())

In [21]:
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions, test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(seed=SEED)
)

In [22]:
uids, iids, data_interaction = cross_validation._shuffle(interactions.row, interactions.col, interactions.data, np.random.RandomState(SEED))

cutoff = int((1.0 - TEST_PERCENTAGE) * len(uids))
test_idx = slice(cutoff, None)
train_idx = slice(None, cutoff)

test_uids, test_iids = uids[test_idx], iids[test_idx]
train_uids, train_iids = uids[train_idx], iids[train_idx]

In [23]:
print(f"Shape of train interactions: {train_interactions.shape}")
print(f"Shape of test interactions: {test_interactions.shape}")

Shape of train interactions: (16688, 1979)
Shape of test interactions: (16688, 1979)


In [24]:
model = LightFM(
    no_components=10,
    learning_rate=0.8,
    random_state=np.random.RandomState(SEED),
    loss='logistic',
    item_alpha=ITEM_ALPHA,
    user_alpha=USER_ALPHA
)

In [25]:
def save_model(model):
    with open(f'{checkpoint}.pickle', 'wb') as fle:
        pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
train_history = {
    'AUC': [],
    'Precision': [],
    'Recall': [],
}

test_history = {
    'AUC': [],
    'Precision': [],
    'Recall': [],
}

best_score = 0

for epoch in tqdm(range(NO_EPOCHS)):

    model.fit_partial(
        interactions=train_interactions,
        user_features=user_features,
        item_features=item_features,
        epochs=1,
        num_threads=NO_THREADS
    )

    train_precision, train_recall, train_auc = calculate_metrics(model, train_interactions, None, item_features, user_features, K)
    test_precision, test_recall, test_auc = calculate_metrics(model, test_interactions, train_interactions,  item_features, user_features, K)

    train_history['AUC'].append(train_auc)
    train_history['Precision'].append(train_precision)
    train_history['Recall'].append(train_recall)

    test_history['AUC'].append(test_auc)
    test_history['Precision'].append(test_precision)
    test_history['Recall'].append(test_recall)

    if test_auc > best_score:
        best_score = test_auc
        save_model(model)

    print(f'Epoch {epoch + 1}/{NO_EPOCHS}, Train auc: {train_auc}, Test auc: {test_auc}')

 10%|█         | 1/10 [00:05<00:48,  5.36s/it]

Epoch 1/10, Train auc: 0.7965908646583557, Test auc: 0.77928227186203


 20%|██        | 2/10 [00:10<00:43,  5.41s/it]

Epoch 2/10, Train auc: 0.7988808751106262, Test auc: 0.7794249057769775


 30%|███       | 3/10 [00:16<00:37,  5.36s/it]

Epoch 3/10, Train auc: 0.8001651167869568, Test auc: 0.7794905304908752


 40%|████      | 4/10 [00:21<00:32,  5.37s/it]

Epoch 4/10, Train auc: 0.8010554909706116, Test auc: 0.7794992923736572


 50%|█████     | 5/10 [00:26<00:26,  5.34s/it]

Epoch 5/10, Train auc: 0.801746129989624, Test auc: 0.7794802188873291


 60%|██████    | 6/10 [00:32<00:21,  5.35s/it]

Epoch 6/10, Train auc: 0.8023055791854858, Test auc: 0.7794775366783142


 70%|███████   | 7/10 [00:37<00:16,  5.35s/it]

Epoch 7/10, Train auc: 0.8027992248535156, Test auc: 0.7794659733772278


 80%|████████  | 8/10 [00:42<00:10,  5.34s/it]

Epoch 8/10, Train auc: 0.8032350540161133, Test auc: 0.7794561386108398


 90%|█████████ | 9/10 [00:48<00:05,  5.33s/it]

Epoch 9/10, Train auc: 0.8036407232284546, Test auc: 0.7794396281242371


100%|██████████| 10/10 [00:53<00:00,  5.35s/it]

Epoch 10/10, Train auc: 0.8040143251419067, Test auc: 0.779437780380249





[Interpretatie metrics](https://stackoverflow.com/questions/45451161/evaluating-the-lightfm-recommendation-model/45466481#45466481)

In [27]:
auc = lightfm.evaluation.auc_score(model, test_interactions, user_features=user_features, item_features=item_features).mean()
auc

0.77875996

In [28]:
precision = lightfm.evaluation.precision_at_k(model, test_interactions, user_features=user_features, item_features=item_features, k=1).mean()
precision

0.020615723

In [29]:
recall = lightfm.evaluation.recall_at_k(model, test_interactions, user_features=user_features, item_features=item_features, k=1).mean()
recall

0.014898295766904894

In [30]:
print(f"Shape of interactions matrix: {interactions.shape}")
print(f"Number of non-zero elements: {interactions.nnz}")
print(f'Number of unique users: {len(unique_person_ids)}')
print(f'Number of unique items: {len(unique_campaign_ids)}')
print(f"Shape of train interactions: {train_interactions.shape}")
print(f"Shape of test interactions: {test_interactions.shape}")
print(f"Number of users in test set: {len(test_uids)}")
print(f"Number of items in test set: {len(test_iids)}")
print(f"Number of non-zero elements in train interactions: {train_interactions.nnz}")
print(f"Number of non-zero elements in test interactions: {test_interactions.nnz}")


Shape of interactions matrix: (16688, 1979)
Number of non-zero elements: 48724
Number of unique users: 16688
Number of unique items: 1979
Shape of train interactions: (16688, 1979)
Shape of test interactions: (16688, 1979)
Number of users in test set: 4873
Number of items in test set: 4873
Number of non-zero elements in train interactions: 43851
Number of non-zero elements in test interactions: 4873


In [31]:
# Get the internal ID of the user
user_id_internal = dataset.mapping()[0]['6E42A199-9F70-E911-80FE-001DD8B72B62']

# Get the internal IDs of all items
item_ids_internal = np.array(list(dataset.mapping()[2].values()))

# Predict the scores for all items for the user
scores = model.predict(user_id_internal, item_ids_internal)

# Get the indices of the top 5 items
top_items_indices = np.argsort(-scores)[:5]

# Get the external IDs of the top 5 items
top_items_ids = [list(dataset.mapping()[2].keys())[i] for i in top_items_indices]

print(f'Top 5 recommended items for user: {top_items_ids}')

Top 5 recommended items for user: ['EA3C6FE0-758E-E811-80F3-001DD8B72B61', '317CD023-2B1E-ED11-B83D-000D3AAD783A', 'A82B1F58-6CE3-E911-8106-001DD8B72B62', 'A2F63005-8EA8-ED11-AAD1-6045BD895D85', '2D2494E9-0822-E811-80F0-001DD8B72B61']


In [32]:
# Assuming `model` is your trained model, and `dataset` is your LightFM Dataset object

# Get the internal ID of the item
item_id_internal = dataset.mapping()[2]['EA3C6FE0-758E-E811-80F3-001DD8B72B61']

# Get the internal IDs of all users
user_ids_internal = np.array(list(dataset.mapping()[0].values()))

# Predict the scores for all users for the item
scores = model.predict(user_ids_internal, np.repeat(item_id_internal, len(user_ids_internal)))

# Get the indices of the top 5 users
top_users_indices = np.argsort(-scores)[:5]

# Get the external IDs of the top 5 users
top_users_ids = [list(dataset.mapping()[0].keys())[i] for i in top_users_indices]

print(f'Top 5 recommended users for item: {top_users_ids}')

Top 5 recommended users for item: ['C1690BC1-EB67-E111-A00F-00505680000A', '4E75FE53-C567-E711-80E8-001DD8B72B61', '6DEC4D09-6469-E111-B43A-00505680000A', '1040620F-5769-E111-B43A-00505680000A', 'C9B9C7A3-AE67-E111-A00F-00505680000A']
