In [None]:
!pip install lightfm   #installing lightfm, model used

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m174.1/316.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp311-cp311-linux_x86_64.whl size=831163 sha256=db34b4a136ed4851f96b9552ab51aed3a964152d6c1e5312d465035f54878fb2
  Stored in directory: /root/.cache/pip/wheels/b9/0d/8a/0729d2e6e3ca2a898ba55201f905da7db3f838a33df5b3fcdd
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [None]:
!pip install matplotlib seaborn



In [None]:
# Load the feature-engineered file; it is used below for data splitting and model training.

import pandas as pd

# NOTE(review): hard-coded absolute path — point this at wherever the
# feature-engineered CSV lives in your environment.
file_path = 'C:/users/feature_engineered.csv'

# The CSV was written with its row index as the first, unnamed column (a plain
# read surfaces it as "Unnamed: 0"), so load that column as the index instead
# of carrying it around as a spurious data column.
df = pd.read_csv(file_path, index_col=0)
df.head()

Mounted at /content/drive


Unnamed: 0,idcol,interaction,int_date,item,page,tod,item_type,item_descrip,segment,beh_segment,active_ind,interaction_score,user_id,item_id
0,755,DISPLAY,17-Jan-23,NONE,Screen1,Afternoon,ALL,DISPLAYED ALL ITEMS,segment3,B01,Semi Active,0,0,100
1,4521,DISPLAY,27-Feb-23,NONE,Screen1,Afternoon,ALL,DISPLAYED ALL ITEMS,segment1,B07,Semi Active,0,1,100
2,4521,DISPLAY,18-Feb-23,NONE,Screen1,Afternoon,ALL,DISPLAYED ALL ITEMS,segment1,B07,Semi Active,0,1,100
3,4521,DISPLAY,30-Jan-23,NONE,Screen1,Morning,ALL,DISPLAYED ALL ITEMS,segment1,B07,Semi Active,0,1,100
4,4521,CLICK,5-Feb-23,IBAB,Screen1,Afternoon,INSURE,GENERIC MESSAGE,segment1,B07,Semi Active,1,1,76


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix, hstack
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

# The model was underperforming with the interaction scores stored in the
# feature-engineered file, so each interaction type is re-scored here in an
# attempt to improve precision and recall.
interaction_map = dict(DISPLAY=0.1, CLICK=1.0, CHECKOUT=3.0)
df['interaction_score'] = df['interaction'].map(interaction_map)

# Re-encode users and items as integer ids: fit one encoder per column on the
# raw identifiers, then overwrite the id columns with the encoded values.
user_encoder = LabelEncoder().fit(df['idcol'])
item_encoder = LabelEncoder().fit(df['item'])
df['user_id'] = user_encoder.transform(df['idcol'])
df['item_id'] = item_encoder.transform(df['item'])

# Prepare user metadata: one row per encoded user.
# drop_duplicates keeps rows in first-appearance order, so sort by user_id
# afterwards: row i of the feature matrix must describe encoded user i, because
# the interaction matrix indexes users by that same id.
# NOTE(review): 'tod' can differ between interactions of the same user; only
# the value from the first retained row per user is used here.
user_features_df = (
    df[['user_id', 'segment', 'beh_segment', 'active_ind', 'tod']]
    .drop_duplicates(subset='user_id')
    .sort_values('user_id')
)
assert user_features_df['user_id'].tolist() == list(range(len(user_features_df))), \
    "user feature rows must line up with encoded user ids"
for col in ['segment', 'beh_segment', 'active_ind', 'tod']:
    user_features_df[col] = user_features_df[col].astype(str)

# Prepare item metadata: one row per encoded item, sorted by item_id for the
# same row/id alignment reason as the user features.
item_features_df = (
    df[['item_id', 'item_type', 'item_descrip']]
    .drop_duplicates(subset='item_id')
    .sort_values('item_id')
)
assert item_features_df['item_id'].tolist() == list(range(len(item_features_df))), \
    "item feature rows must line up with encoded item ids"
item_features_df['item_type'] = item_features_df['item_type'].astype(str)
item_features_df['item_descrip'] = item_features_df['item_descrip'].astype(str)

# Vectorize user features (string values are one-hot encoded by DictVectorizer)
user_dicts = user_features_df.drop('user_id', axis=1).to_dict(orient='records')
user_vec = DictVectorizer()
user_features = user_vec.fit_transform(user_dicts)

# Vectorize the categorical item_type feature
item_type_vec = DictVectorizer()
item_type_features = item_type_vec.fit_transform(
    item_features_df[['item_type']].to_dict(orient='records')
)

# Vectorize the free-text item_descrip feature with TF-IDF
tfidf_vec = TfidfVectorizer(max_features=300, stop_words='english')
item_desc_features = tfidf_vec.fit_transform(item_features_df['item_descrip'])

# Combine item features column-wise into a single sparse matrix
item_features = hstack([item_type_features, item_desc_features])

# Build the sparse user x item interaction matrix; its dimensions are taken
# from the row counts of the two feature matrices.
matrix_shape = (user_features.shape[0], item_features.shape[0])
interactions = coo_matrix(
    (df['interaction_score'], (df['user_id'], df['item_id'])),
    shape=matrix_shape,
)

# Sanity check: report the shape of every matrix that was built
print("Interaction matrix shape:", interactions.shape)
print("User features shape:", user_features.shape)
print("Item features shape:", item_features.shape)

Interaction matrix shape: (84375, 104)
User features shape: (84375, 61)
Item features shape: (104, 170)


In [None]:
# ---------------------------------------------
# Redoing and enhancing feature engineering
# ---------------------------------------------

# Reason: Initial feature engineering was either incomplete, redundant,
# or did not contribute positively to model performance (especially for items).
# This version aims to refine and restructure features for better model input.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix, hstack
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

# Map each interaction type to its score.
interaction_map = {'DISPLAY': 0.1, 'CLICK': 1.0, 'CHECKOUT': 3.0}
df['interaction_score'] = df['interaction'].map(interaction_map)

# .map leaves NaN for any interaction type missing from the dict; fail here
# with a clear message rather than carrying NaN scores into the matrices.
unmapped_types = df.loc[df['interaction_score'].isna(), 'interaction'].unique()
assert len(unmapped_types) == 0, f"Unmapped interaction types: {list(unmapped_types)}"

# Train-test split on the full dataframe (row-level, seeded).
# NOTE(review): the data contains repeated (user, item) rows, so a row-level
# split can place the same pair in both train and test — confirm this is the
# intended evaluation protocol.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Work on an explicit copy so the column assignments below modify train_df
# itself and do not raise SettingWithCopyWarning (test_df is copied below).
train_df = train_df.copy()

# Fit the label encoders on the training rows only
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
train_df['user_id'] = user_encoder.fit_transform(train_df['idcol'])
train_df['item_id'] = item_encoder.fit_transform(train_df['item'])

# Keep only test rows whose user AND item were seen in training (the encoders
# cannot transform unseen labels), then apply the same encoding to them.
test_df = test_df[
    test_df['idcol'].isin(train_df['idcol']) &
    test_df['item'].isin(train_df['item'])
].copy()
test_df['user_id'] = user_encoder.transform(test_df['idcol'])
test_df['item_id'] = item_encoder.transform(test_df['item'])

# Prepare user features from train_df: one row per encoded user.
# drop_duplicates keeps rows in first-appearance order (and train_df is
# shuffled), so sort by user_id afterwards: row i of the feature matrix must
# describe encoded user i, because the interaction matrices index users by
# that same id.
# NOTE(review): 'tod' can differ between interactions of the same user; only
# the value from the first retained row per user is used here.
user_features_df = (
    train_df[['user_id', 'segment', 'beh_segment', 'active_ind', 'tod']]
    .drop_duplicates(subset='user_id')
    .sort_values('user_id')
)
assert user_features_df['user_id'].tolist() == list(range(len(user_features_df))), \
    "user feature rows must line up with encoded user ids"
for col in ['segment', 'beh_segment', 'active_ind', 'tod']:
    user_features_df[col] = user_features_df[col].astype(str)
user_dicts = user_features_df.drop('user_id', axis=1).to_dict(orient='records')
user_vec = DictVectorizer()
user_features = user_vec.fit_transform(user_dicts)

# Prepare item features from train_df: one row per encoded item, sorted by
# item_id for the same row/id alignment reason as the user features.
item_features_df = (
    train_df[['item_id', 'item_type', 'item_descrip']]
    .drop_duplicates(subset='item_id')
    .sort_values('item_id')
)
assert item_features_df['item_id'].tolist() == list(range(len(item_features_df))), \
    "item feature rows must line up with encoded item ids"
item_features_df['item_type'] = item_features_df['item_type'].astype(str)
item_features_df['item_descrip'] = item_features_df['item_descrip'].astype(str)

# Vectorize categorical item_type
item_type_vec = DictVectorizer()
item_type_features = item_type_vec.fit_transform(item_features_df[['item_type']].to_dict(orient='records'))

# Vectorize textual item_descrip with TF-IDF
tfidf_vec = TfidfVectorizer(max_features=300, stop_words='english')
item_desc_features = tfidf_vec.fit_transform(item_features_df['item_descrip'])

# Combine item features column-wise into a single sparse matrix
item_features = hstack([item_type_features, item_desc_features])

# Build the train and test interaction matrices on one common
# (n_users, n_items) shape taken from the feature matrices' row counts.
matrix_shape = (user_features.shape[0], item_features.shape[0])

train_interactions = coo_matrix(
    (train_df['interaction_score'], (train_df['user_id'], train_df['item_id'])),
    shape=matrix_shape,
)
test_interactions = coo_matrix(
    (test_df['interaction_score'], (test_df['user_id'], test_df['item_id'])),
    shape=matrix_shape,
)

# Train the LightFM model so it can be evaluated against other models
# before offering personalised recommendations.
# random_state seeds the model initialisation, matching the seeded
# train/test split above.
# NOTE(review): with num_threads > 1 results may still vary slightly between
# runs — confirm if exact reproducibility is required.
# NOTE(review): no sample_weight is passed to fit; confirm whether the
# 0.1 / 1.0 / 3.0 interaction scores actually influence WARP training, since
# LightFM takes per-interaction weights through `sample_weight`.
model = LightFM(loss='warp', no_components=100, learning_rate=0.05,
                user_alpha=1e-5, item_alpha=1e-5, random_state=42)
model.fit(train_interactions,
          user_features=user_features,
          item_features=item_features,
          epochs=50,
          num_threads=4)

# Signal that training has finished
print("Model training complete")

# Evaluate the trained model on the held-out test interactions.
# Both metrics are called with the same feature matrices and cut-off (k=10),
# and each is averaged over the per-user values the metric returns.
eval_kwargs = dict(user_features=user_features, item_features=item_features, k=10)

precision = precision_at_k(model, test_interactions, **eval_kwargs).mean()
recall = recall_at_k(model, test_interactions, **eval_kwargs).mean()

print(f" Precision@10: {precision:.4f}")
print(f" Recall@10:    {recall:.4f}")



Model training complete
 Precision@10: 0.0913
 Recall@10:    0.5890


In [None]:
# Beyond-accuracy evaluation: compute nDCG@k on a random sample of users.

import random
import numpy as np

def fast_ndcg_at_k(model, interactions, user_features=None, item_features=None, k=10, sample_size=1000, seed=None):
    """Estimate mean binary-relevance nDCG@k over a random sample of users.

    Args:
        model: Object exposing ``predict(user_id, item_ids, user_features=...,
            item_features=...)`` that returns one score per item.
        interactions: Sparse (n_users, n_items) matrix; every stored entry in
            a user's row counts as a relevant item for that user.
        user_features: Passed through to ``model.predict``.
        item_features: Passed through to ``model.predict``.
        k: Number of top-ranked items considered per user.
        sample_size: Maximum number of users to sample (without replacement).
        seed: Optional seed for the user sample. ``None`` (default) uses the
            global ``random`` state, as before.

    Returns:
        Mean nDCG@k over the sampled users that have at least one stored
        interaction, or 0.0 when no sampled user has any.
    """
    num_users, num_items = interactions.shape

    # Loop-invariant work: convert to CSR once and build the item id array and
    # per-rank discounts once, instead of on every iteration.
    csr = interactions.tocsr()
    all_item_ids = np.arange(num_items)
    discounts = [1.0 / np.log2(rank + 2) for rank in range(k)]

    # A dedicated generator keeps a seeded call from disturbing global state.
    rng = random.Random(seed) if seed is not None else random
    sampled_users = rng.sample(range(num_users), min(sample_size, num_users))

    ndcg_scores = []
    for user_id in sampled_users:
        row = csr[user_id]
        if row.nnz == 0:
            # No relevant items for this user: nDCG is undefined, skip.
            continue

        scores = model.predict(user_id, all_item_ids,
                               user_features=user_features,
                               item_features=item_features)

        top_k_items = np.argsort(-scores)[:k]
        actual = set(row.indices)

        # Binary relevance: a hit at (0-based) rank i contributes 1/log2(i+2).
        dcg = sum(discounts[i] for i, item in enumerate(top_k_items) if item in actual)
        # Ideal ranking places all relevant items (up to k) at the top.
        ideal_dcg = sum(discounts[:min(len(actual), k)])
        ndcg_scores.append(dcg / ideal_dcg if ideal_dcg > 0 else 0.0)

    # np.mean([]) would return NaN with a RuntimeWarning; report 0.0 instead.
    if not ndcg_scores:
        return 0.0
    return np.mean(ndcg_scores)



In [None]:
# Evaluate nDCG on the held-out test interactions. `test_interactions` was
# built with the same encoders and shape as the matrices the model was trained
# with. The earlier `interactions` matrix was built from the full dataset with
# separately fitted encoders, so scoring against it would mix training rows
# into the evaluation and could mismatch user/item ids.
ndcg_sampled = fast_ndcg_at_k(model, test_interactions,
                              user_features=user_features,
                              item_features=item_features,
                              k=10, sample_size=1000)

print(f"NDCG@10 (sampled): {ndcg_sampled:.4f}")

NDCG@10 (sampled): 0.7738


In [None]:
# After comparing against the other models, generate personalised
# recommendations and save them to a file.

import numpy as np
import pandas as pd

# Restrict the full dataframe to rows whose user and item both occur in the
# training set, then encode them with the encoders fitted on train_df.
known_user_mask = df['idcol'].isin(train_df['idcol'])
known_item_mask = df['item'].isin(train_df['item'])
df_filtered = df.loc[known_user_mask & known_item_mask].copy()
df_filtered['user_id'] = user_encoder.transform(df_filtered['idcol'])
df_filtered['item_id'] = item_encoder.transform(df_filtered['item'])

# Users to score, and the candidate items (every row index of item_features,
# i.e. the full range of encoded items is assumed).
all_user_ids = df_filtered['user_id'].unique()
all_item_ids = np.arange(item_features.shape[0])

# Recommendation generation
def generate_recommendations(model, user_ids, item_ids, user_features, item_features, top_n=10):
    """Return the top-N highest-scoring items for each user.

    Args:
        model: Object exposing ``predict(user_ids, item_ids, user_features=...,
            item_features=...)`` that returns one score per (user, item) pair.
        user_ids: Iterable of encoded user ids to generate recommendations for.
        item_ids: Array-like of encoded candidate item ids.
        user_features: Passed through to ``model.predict``.
        item_features: Passed through to ``model.predict``.
        top_n: Number of items to keep per user.

    Returns:
        Dict mapping each user id to an array of up to ``top_n`` item ids,
        ordered from highest to lowest predicted score.
    """
    item_ids = np.asarray(item_ids)
    recommendations = {}
    for user_id in user_ids:
        # predict expects one user id per candidate item
        user_array = np.repeat(user_id, len(item_ids))
        scores = model.predict(user_array, item_ids, user_features=user_features, item_features=item_features)
        # argsort yields positions within item_ids, not item ids; map them back
        # so the result is correct even when item_ids is not 0..n-1.
        best_positions = np.argsort(-scores)[:top_n]
        recommendations[user_id] = item_ids[best_positions]
    return recommendations

recommendations = generate_recommendations(model, all_user_ids, all_item_ids, user_features, item_features)

# Lookup tables from encoded ids back to the original identifiers
user_pairs = df_filtered[['user_id', 'idcol']].drop_duplicates()
item_pairs = df_filtered[['item_id', 'item']].drop_duplicates()
reverse_user_map = dict(user_pairs.values)
reverse_item_map = dict(item_pairs.values)

# Translate every recommendation list into original user / item identifiers
personalized_output = {}
for encoded_user, encoded_items in recommendations.items():
    original_items = [reverse_item_map[encoded_item] for encoded_item in encoded_items]
    personalized_output[reverse_user_map[encoded_user]] = original_items

# Flatten to one record per (user, ranked item) and write the result to CSV
rows = [
    {'user_id': user, 'item_id': item, 'rank': rank}
    for user, items in personalized_output.items()
    for rank, item in enumerate(items, start=1)
]

rec_df = pd.DataFrame(rows)
file_path = '/content/drive/My Drive/PheliswaNontsanga_FNB_DataQuest_Challenge/personalized_recommendations.csv'
rec_df.to_csv(file_path, index=False)