In [26]:
import os
import sqlalchemy
import pandas as pd
from dotenv import load_dotenv 
from scipy.sparse import coo_matrix
from sqlalchemy import create_engine, text
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
from implicit.evaluation import train_test_split, precision_at_k, AUC_at_k

In [13]:
load_dotenv()
DB_URL = os.getenv("DB_URL")
engine = create_engine(DB_URL)

try:
    connection = engine.connect()
    print("Successfully connected to the database")
except Exception as e:
    print(f"Failed to connect to the database: {e}")

print(f"SQLAlchemy version: {sqlalchemy.__version__}")

Successfully connected to the database
SQLAlchemy version: 2.0.21


In [14]:
query = text('SELECT * FROM epic_5')
df = pd.read_sql_query(query, connection)
df.head()

Unnamed: 0,PersoonId,CampagneId,aantal_sessies,aantal_bezoeken,SessieThema,SoortCampagne,TypeCampagne,ThemaDuurzaamheid,ThemaFinancieelFiscaal,ThemaInnovatie,ThemaInternationaalOndernemen,ThemaMobiliteit,ThemaOmgeving,ThemaSalesMarketingCommunicatie,ThemaStrategieEnAlgemeenManagement,ThemaTalent,ThemaWelzijn
0,DE88423F-8B6E-EA11-8110-001DD8B72B62,BDA1703D-FD3D-EB11-8116-001DD8B72B61,6,0,"Digitalisering, IT & Technologie",Offline,Opleiding,0,0,0,0,0,0,0,0,0,0
1,23DAC82E-384B-E511-A644-005056B06EB4,5300F4E9-CFD0-EA11-8113-001DD8B72B62,5,0,Opvolging en Overname,On en Offline,Opleiding,0,0,0,0,0,0,0,0,0,0
2,86817581-4769-E111-B43A-00505680000A,ADF72EC8-EAFD-E811-80F9-001DD8B72B61,1,0,Netwerking,Offline,Netwerkevenement,0,0,0,0,0,0,0,0,0,0
3,977BE2AA-CBB4-EC11-983F-00224883C751,32E623A5-6C53-EC11-8C62-000D3ABFCF4A,4,0,Algemeen Management,Offline,Opleiding,0,0,0,0,0,0,0,0,0,0
4,8E77DD5A-7276-EB11-811D-001DD8B72B62,C362C6EC-239B-ED11-AAD1-6045BD8956C9,1,0,Algemeen Management,Offline,Netwerkevenement,0,0,0,0,0,0,0,0,0,0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52567 entries, 0 to 52566
Data columns (total 17 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   PersoonId                           52567 non-null  object
 1   CampagneId                          52567 non-null  object
 2   aantal_sessies                      52567 non-null  int64 
 3   aantal_bezoeken                     52567 non-null  int64 
 4   SessieThema                         52567 non-null  object
 5   SoortCampagne                       52567 non-null  object
 6   TypeCampagne                        52567 non-null  object
 7   ThemaDuurzaamheid                   52567 non-null  int64 
 8   ThemaFinancieelFiscaal              52567 non-null  int64 
 9   ThemaInnovatie                      52567 non-null  int64 
 10  ThemaInternationaalOndernemen       52567 non-null  int64 
 11  ThemaMobiliteit                     52567 non-null  in

SOURCE: https://benfred.github.io/implicit/index.html

In [16]:
# toch groeperen op aantal sessies per persoon per campagne, geen scheiding tussen sessiethema
df = df.groupby(['PersoonId', 'CampagneId'])['aantal_sessies'].sum().reset_index()

# persoonid en campagneid omzetten naar integers voor de sparse matrix invoer van implicit
persoon_mapping = {id: i for i, id in enumerate(df['PersoonId'].unique())}
campagne_mapping = {id: i for i, id in enumerate(df['CampagneId'].unique())}

# reverse mapping voor later
persoon_reverse_mapping = {i: id for id, i in persoon_mapping.items()}
campagne_reverse_mapping = {i: id for id, i in campagne_mapping.items()}

# mapping toepassen op df
df['PersoonId'] = df['PersoonId'].map(persoon_mapping)
df['CampagneId'] = df['CampagneId'].map(campagne_mapping)

In [25]:
df.head()

Unnamed: 0,PersoonId,CampagneId,aantal_sessies
0,0,0,1
1,0,1,1
2,1,2,1
3,2,3,1
4,3,4,1


In [18]:
# sparse matrix maken met aantal sessies als waarden
campagne_persoon_sessies = coo_matrix((df['aantal_sessies'], (df['CampagneId'], df['PersoonId'])))

In [19]:
# matrix wegen, om hoge aantal sessies minder zwaar te laten wegen
campagne_persoon_sessies = bm25_weight(campagne_persoon_sessies, K1=100, B=0.8)

# matrix transponeren omdat implicit werkt met (user, item) in plaats van (item, user)
persoon_sessies = campagne_persoon_sessies.T.tocsr()

In [20]:
trainset, testset = train_test_split(persoon_sessies, train_percentage=0.8, random_state=42)

In [21]:
os.environ["OPENBLAS_NUM_THREADS"] = "1"

In [23]:
model = AlternatingLeastSquares(factors=128, regularization=0.15, alpha=5.0, iterations=100, random_state=42)
model.fit(trainset)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:50<00:00,  2.00it/s]


In [27]:
precision = precision_at_k(model, trainset, testset, K=10, num_threads=4)
auc = a = AUC_at_k(model, trainset, testset, K=10, num_threads=4)

print(f"Precision: {precision}", f"AUC: {auc}", sep="\n")

100%|██████████| 6023/6023 [00:00<00:00, 11495.94it/s]
100%|██████████| 6023/6023 [00:00<00:00, 11672.46it/s]

Precision: 0.29088232201072667
AUC: 0.6318680336264757





In [28]:
def get_similar_items(item_id):
    similar = model.similar_items(item_id, 10+1) 
    return similar[0][1:]

# get_similar_items(5)

In [29]:
def get_similar_users(user_id):
    similar = model.similar_users(user_id, 10+1) 
    return similar[0][1:]
    
# get_similar_users(5)

In [30]:
# WORK IN PROGRESS UNDER THIS LINE

In [31]:
# model recommend
# https://benfred.github.io/implicit/api/models/cpu/als.html#implicit.cpu.als.AlternatingLeastSquares.user_factors

In [32]:
# combineren met recommend van het model
def get_top_users_for_item(string_item_id, N=20):

    item_id = campagne_mapping[string_item_id]

    similar_items = get_similar_items(item_id)

    user_ids = []
    for similar_item in similar_items:
        users_for_item = df[df['CampagneId'] == similar_item]['PersoonId'].unique()
        user_ids.extend(users_for_item)

    user_ids = list(set(user_ids))

    users_for_input_item = df[df['CampagneId'] == item_id]['PersoonId'].unique()
    user_ids = [user_id for user_id in user_ids if user_id not in users_for_input_item]

    string_user_ids = [persoon_reverse_mapping[user_id] for user_id in user_ids]

    return string_user_ids[:N]

In [33]:
get_top_users_for_item('A534E7FC-EEF3-E711-80EE-001DD8B72B62')

['1E7BDC7A-A018-E811-80EF-001DD8B72B61',
 '7E69AB3B-CC1D-E611-8A0E-005056B06EC4',
 '9E3D74E5-7257-E811-80F0-001DD8B72B62',
 '00CBD1B2-A70D-E811-80EE-001DD8B72B62',
 '1F3E2F81-BD01-E811-80EF-001DD8B72B61',
 '5D1998A3-E264-E811-80F1-001DD8B72B62',
 '1F7C2C5D-B40D-E511-ABE8-005056B06EB4',
 'FCD7F0F5-21B7-E111-A45C-00505680000A',
 '9EE1126D-1368-E111-A00F-00505680000A',
 '0146B8DB-824A-E711-80E7-001DD8B72B61',
 '1F944CB3-0E68-E111-A00F-00505680000A',
 '9EFFEA29-B1DF-E511-A303-005056B06EC4',
 '7F25E74D-58C9-E711-80EC-001DD8B72B62',
 '5DC3D3CA-5AB7-E411-9B05-005056B06EB4',
 '3F728C00-9BDA-E711-80EE-001DD8B72B61',
 '01A6A7EE-6469-E111-B43A-00505680000A',
 '3F9DC9E5-1368-E111-A00F-00505680000A',
 '20229EB1-6D9B-E611-80E3-001DD8B72B62',
 '5E20E7B3-E767-E111-A00F-00505680000A',
 '01E08C1A-517E-E611-80DE-001DD8B72B61']

In [34]:
# recommend (omvormen functie), model opslaan
# wat te doen met de campagnesessieinformatie? (populaire campagnes recommenden aan nieuwe gebruikers?)

Handling Cold Start Issues:

    Address the "cold start" problem for new users or items by incorporating demographic information, leveraging item features, or using popularity-based recommendations until sufficient user interactions are available.

Post-Processing Techniques:

    Apply post-processing techniques, such as re-ranking or filtering, to refine the recommendations based on additional criteria or business rules.

In [35]:
# WORK IN PROGRESS ABOVE THIS LINE

In [36]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

print ("Model exported successfully")

Model exported successfully
