In [4]:
!pip install lightfm   #installing lightfm, model used



In [5]:
!pip install matplotlib seaborn



In [6]:
#use 'feature engineered' file for model training, data splitting
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd

file_path = '/content/drive/My Drive/PheliswaNontsanga_FNB_DataQuest_Challenge/data/clean_data/fnb_feature_engineered.csv'

df = pd.read_csv(file_path)
df.head()

Mounted at /content/drive


Unnamed: 0,idcol,interaction,int_date,item,page,tod,item_type,item_descrip,segment,beh_segment,active_ind,interaction_score,user_id,item_id
0,755,DISPLAY,17-Jan-23,NONE,Screen1,Afternoon,ALL,DISPLAYED ALL ITEMS,segment3,B01,Semi Active,0,0,100
1,4521,DISPLAY,27-Feb-23,NONE,Screen1,Afternoon,ALL,DISPLAYED ALL ITEMS,segment1,B07,Semi Active,0,1,100
2,4521,DISPLAY,18-Feb-23,NONE,Screen1,Afternoon,ALL,DISPLAYED ALL ITEMS,segment1,B07,Semi Active,0,1,100
3,4521,DISPLAY,30-Jan-23,NONE,Screen1,Morning,ALL,DISPLAYED ALL ITEMS,segment1,B07,Semi Active,0,1,100
4,4521,CLICK,5-Feb-23,IBAB,Screen1,Afternoon,INSURE,GENERIC MESSAGE,segment1,B07,Semi Active,1,1,76


In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix, hstack
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

# === model is underperforming , so changed interactions numbers from 1,2 and 3 from the featured engineered file to these for higher precision and recall ===
interaction_map = {'DISPLAY': 0.1, 'CLICK': 1.0, 'CHECKOUT': 3.0}
df['interaction_score'] = df['interaction'].map(interaction_map)

# Encoding user_id and item_id again
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
df['user_id'] = user_encoder.fit_transform(df['idcol'])
df['item_id'] = item_encoder.fit_transform(df['item'])

#Prepare user metadata (drop duplicates from previous file or data)
user_features_df = df[['user_id', 'segment', 'beh_segment', 'active_ind', 'tod']].drop_duplicates(subset='user_id')
for col in ['segment', 'beh_segment', 'active_ind', 'tod']:
    user_features_df[col] = user_features_df[col].astype(str)

# === 4. Prepare item metadata (drop duplicates) ===
item_features_df = df[['item_id', 'item_type', 'item_descrip']].drop_duplicates(subset='item_id')
item_features_df['item_type'] = item_features_df['item_type'].astype(str)
item_features_df['item_descrip'] = item_features_df['item_descrip'].astype(str)

# === 5. Vectorize user features ===
user_dicts = user_features_df.drop('user_id', axis=1).to_dict(orient='records')
user_vec = DictVectorizer()
user_features = user_vec.fit_transform(user_dicts)

# === 6a. Vectorize item_type feature (categorical) ===
item_type_vec = DictVectorizer()
item_type_features = item_type_vec.fit_transform(
    item_features_df[['item_type']].to_dict(orient='records')
)

# === 6b. Vectorize item_descrip feature (textual) ===
tfidf_vec = TfidfVectorizer(max_features=300, stop_words='english')
item_desc_features = tfidf_vec.fit_transform(item_features_df['item_descrip'])

# === 6c. Combine item features ===
item_features = hstack([item_type_features, item_desc_features])

# === 7. Create interaction matrix ===
interactions = coo_matrix(
    (df['interaction_score'], (df['user_id'], df['item_id'])),
    shape=(user_features.shape[0], item_features.shape[0])
)

# === 8. Print shapes to verify ===
print("Interaction matrix shape:", interactions.shape)
print("User features shape:", user_features.shape)
print("Item features shape:", item_features.shape)

Interaction matrix shape: (84375, 104)
User features shape: (84375, 61)
Item features shape: (104, 170)


In [8]:
# ---------------------------------------------
# Redoing and enhancing feature engineering
# ---------------------------------------------

# Reason: Initial feature engineering was either incomplete, redundant,
# or did not contribute positively to model performance (especially for items).
# This version aims to refine and restructure features for better model input.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix, hstack
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

# === Step 1: Map interaction scores ===
interaction_map = {'DISPLAY': 0.1, 'CLICK': 1.0, 'CHECKOUT': 3.0}
df['interaction_score'] = df['interaction'].map(interaction_map)

# === Step 2: Train-test split on full dataframe ===
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# === Step 3: Label encoding on train_df only ===
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
train_df['user_id'] = user_encoder.fit_transform(train_df['idcol'])
train_df['item_id'] = item_encoder.fit_transform(train_df['item'])

# Apply same transformation to test_df (must filter unseen users/items)
test_df = test_df[
    test_df['idcol'].isin(train_df['idcol']) &
    test_df['item'].isin(train_df['item'])
].copy()
test_df['user_id'] = user_encoder.transform(test_df['idcol'])
test_df['item_id'] = item_encoder.transform(test_df['item'])

# === Step 4: Prepare user features from train_df ===
user_features_df = train_df[['user_id', 'segment', 'beh_segment', 'active_ind', 'tod']].drop_duplicates(subset='user_id')
for col in ['segment', 'beh_segment', 'active_ind', 'tod']:
    user_features_df[col] = user_features_df[col].astype(str)
user_dicts = user_features_df.drop('user_id', axis=1).to_dict(orient='records')
user_vec = DictVectorizer()
user_features = user_vec.fit_transform(user_dicts)

# === Step 5: Prepare item features from train_df ===
item_features_df = train_df[['item_id', 'item_type', 'item_descrip']].drop_duplicates(subset='item_id')
item_features_df['item_type'] = item_features_df['item_type'].astype(str)
item_features_df['item_descrip'] = item_features_df['item_descrip'].astype(str)

# Vectorize categorical item_type
item_type_vec = DictVectorizer()
item_type_features = item_type_vec.fit_transform(item_features_df[['item_type']].to_dict(orient='records'))

# Vectorize textual item_descrip
tfidf_vec = TfidfVectorizer(max_features=300, stop_words='english')
item_desc_features = tfidf_vec.fit_transform(item_features_df['item_descrip'])

# Combine item features
item_features = hstack([item_type_features, item_desc_features])

# === Step 6: Create interaction matrices ===
train_interactions = coo_matrix(
    (train_df['interaction_score'], (train_df['user_id'], train_df['item_id'])),
    shape=(user_features.shape[0], item_features.shape[0])
)

test_interactions = coo_matrix(
    (test_df['interaction_score'], (test_df['user_id'], test_df['item_id'])),
    shape=(user_features.shape[0], item_features.shape[0])
)

# === Step 7: Train the LightFM model ===
model = LightFM(loss='warp', no_components=100, learning_rate=0.05, user_alpha=1e-5, item_alpha=1e-5)
model.fit(train_interactions,
          user_features=user_features,
          item_features=item_features,
          epochs=50,
          num_threads=4)

#prints message after model is done training
print("Model training complete")

# === Step 8: Evaluate the model on test data ===
precision = precision_at_k(model, test_interactions,
                           user_features=user_features,
                           item_features=item_features,
                           k=10).mean()

recall = recall_at_k(model, test_interactions,
                     user_features=user_features,
                     item_features=item_features,
                     k=10).mean()

print(f" Precision@10: {precision:.4f}")
print(f" Recall@10:    {recall:.4f}")



Model training complete
 Precision@10: 0.0914
 Recall@10:    0.5896


In [9]:
#ndcg

import random
import numpy as np

def fast_ndcg_at_k(model, interactions, user_features=None, item_features=None, k=10, sample_size=1000):
    num_users, num_items = interactions.shape
    ndcg_scores = []

    sampled_users = random.sample(range(num_users), min(sample_size, num_users))

    for user_id in sampled_users:
        row = interactions.tocsr()[user_id]
        if row.nnz == 0:
            continue

        scores = model.predict(user_id, np.arange(num_items),
                               user_features=user_features,
                               item_features=item_features)

        top_k_items = np.argsort(-scores)[:k]
        actual = set(row.indices)

        dcg = sum([1.0 / np.log2(i + 2) for i, item in enumerate(top_k_items) if item in actual])
        ideal_dcg = sum([1.0 / np.log2(i + 2) for i in range(min(len(actual), k))])
        ndcg = dcg / ideal_dcg if ideal_dcg > 0 else 0.0

        ndcg_scores.append(ndcg)

    return np.mean(ndcg_scores)



In [10]:
ndcg_sampled = fast_ndcg_at_k(model, interactions,
                              user_features=user_features,
                              item_features=item_features,
                              k=10, sample_size=1000)

print(f"NDCG@10 (sampled): {ndcg_sampled:.4f}")

Exception: Number of user feature rows does not equal the number of users

In [12]:
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.sparse import csr_matrix

def hit_rate_at_k(model, interactions, k=10, user_features=None, item_features=None):
    num_users, num_items = interactions.shape
    hits = 0
    valid_users = 0
    for user_id in range(num_users):
        known_positives = interactions.tocsr()[user_id].indices
        if len(known_positives) == 0:
            continue
        scores = model.predict(user_id, np.arange(num_items), user_features=user_features, item_features=item_features)
        top_k = np.argsort(-scores)[:k]
        if np.intersect1d(top_k, known_positives).size > 0:
            hits += 1
        valid_users += 1
    return hits / valid_users

def item_diversity(item_features):
    similarity = 1 - pairwise_distances(item_features, metric="cosine")
    upper_triangle_indices = np.triu_indices_from(similarity, k=1)
    diversity = 1 - similarity[upper_triangle_indices].mean()
    return diversity

def novelty_score(model, interactions, k=10, user_features=None, item_features=None):
    num_users, num_items = interactions.shape
    item_popularity = np.array(interactions.sum(axis=0)).flatten() + 1e-9
    item_popularity /= item_popularity.sum()
    novelty = 0.0
    valid_users = 0
    for user_id in range(num_users):
        scores = model.predict(user_id, np.arange(num_items), user_features=user_features, item_features=item_features)
        top_k_items = np.argsort(-scores)[:k]
        novelty += -np.sum(np.log2(item_popularity[top_k_items]))
        valid_users += 1
    return novelty / valid_users

# === Run LightFM Evaluation ===
print("Evaluating LightFM metrics for comparison...")
print("Hit Rate@10:", hit_rate_at_k(model, test_interactions, k=10, user_features=user_features, item_features=item_features))
print("Diversity:", item_diversity(item_features.toarray()))
print("Novelty:", novelty_score(model, test_interactions, k=10, user_features=user_features, item_features=item_features))


Evaluating LightFM metrics for comparison...
Hit Rate@10: 0.7820301217247783
Diversity: 0.9092696448372373
Novelty: 49.67958238187579


In [13]:
def evaluate_baseline(recommend_func, test_interactions, k=10):
    hits = 0
    precisions = []
    recalls = []
    num_users, num_items = test_interactions.shape

    for user_id in range(num_users):
        true_items = test_interactions.tocsr()[user_id].indices
        if len(true_items) == 0:
            continue

        recommended = recommend_func(user_id, k)
        hit = len(np.intersect1d(recommended, true_items)) > 0
        hits += int(hit)
        precision = len(np.intersect1d(recommended, true_items)) / k
        recall = len(np.intersect1d(recommended, true_items)) / len(true_items)

        precisions.append(precision)
        recalls.append(recall)

    return {
        'Precision@K': np.mean(precisions),
        'Recall@K': np.mean(recalls),
        'HitRate@K': hits / len(precisions)
    }

In [18]:
def random_recommend(user_id, k):
    all_item_ids = np.arange(test_interactions.shape[1])
    return np.random.choice(all_item_ids, size=k, replace=False)

metrics_random = evaluate_baseline(random_recommend, test_interactions, k=10)
print("\nRandom Recommender Metrics:")
print(metrics_random)

from sklearn.metrics.pairwise import cosine_similarity

item_similarity = cosine_similarity(train_interactions.T)

from sklearn.metrics.pairwise import cosine_similarity

# Convert training interactions to CSR
train_csr = train_interactions.tocsr()

# Compute item-item similarity from training data
item_similarity = cosine_similarity(train_csr.T)

# Collaborative filtering recommender
def cf_recommend(user_id, k):
    user_ratings = train_csr[user_id].toarray().flatten()
    scores = item_similarity.dot(user_ratings)
    return np.argsort(-scores)[:k]

# Evaluate
metrics_cf = evaluate_baseline(cf_recommend, test_interactions, k=10)
print("\nCollaborative Filtering Metrics:")
print(metrics_cf)


Random Recommender Metrics:
{'Precision@K': np.float64(0.014751392613988034), 'Recall@K': np.float64(0.09770360717262379), 'HitRate@K': 0.14057664534763772}

Collaborative Filtering Metrics:
{'Precision@K': np.float64(0.13738394883433055), 'Recall@K': np.float64(0.9267832314700298), 'HitRate@K': 0.959949453270064}
