In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load dataset
url = "https://s3.ap-south-1.amazonaws.com/production-mentormind/resources/files/000/005/818/original/Dataset_e-shop_clothing.csv?1722328461"
df = pd.read_csv(url, sep=';')   # correct delimiter

print("shape:", df.shape)
display(df.head())

shape: (165474, 14)


Unnamed: 0,year,month,day,order,country,session ID,page 1 (main category),page 2 (clothing model),colour,location,model photography,price,price 2,page
0,2008,4,1,1,29,1,1,A13,1,5,1,28,2,1
1,2008,4,1,2,29,1,1,A16,1,6,1,33,2,1
2,2008,4,1,3,29,1,2,B4,10,2,1,52,1,1
3,2008,4,1,4,29,1,2,B17,6,6,2,38,2,1
4,2008,4,1,5,29,1,2,B8,4,3,2,52,1,1


In [2]:
user_col = "session ID"
item_col = "page 2 (clothing model)"
price_col = "price"   

df[item_col] = df[item_col].astype(str)

# Interaction weight: 1 for view, plus extra for purchase bias (optional)
df['interaction'] = 1
df.loc[df[price_col] > 0, 'interaction'] += 1   # +1 if price>0 (tunable)

# Aggregate to user-item matrix (counts)
ui = df.groupby([user_col, item_col])['interaction'].sum().reset_index()
ui.rename(columns={'interaction':'weight'}, inplace=True)
display(ui.head())

Unnamed: 0,session ID,page 2 (clothing model),weight
0,1,A13,2
1,1,A16,2
2,1,B17,2
3,1,B4,2
4,1,B8,2


In [3]:
from sklearn.model_selection import train_test_split

def leave_one_out(df_ui, user_col='session ID', item_col='page 2 (clothing model)'):
    # For each user,one random interaction for test
    rng = np.random.RandomState(42)
    test_rows = []
    train_rows = []
    for uid, g in df_ui.groupby(user_col):
        if len(g) == 1:
            train_rows.append(g.index.values[0])
        else:
            # pick random index for test
            test_idx = rng.choice(g.index.values, 1)[0]
            test_rows.append(test_idx)
            train_rows.extend([i for i in g.index.values if i != test_idx])
    train = df_ui.loc[train_rows].reset_index(drop=True)
    test = df_ui.loc[test_rows].reset_index(drop=True)
    return train, test

train_ui, test_ui = leave_one_out(ui, user_col=user_col, item_col=item_col)
print("train:", train_ui.shape, "test:", test_ui.shape)

train: (129775, 3) test: (18570, 3)


In [4]:
top_items = train_ui.groupby(item_col)['weight'].sum().sort_values(ascending=False).index.tolist()

def recommend_popular(user_id, k=10):
    return top_items[:k]

# Example
print("Top 10 popular items:", top_items[:10])

Top 10 popular items: ['B4', 'A2', 'A11', 'P1', 'B10', 'A4', 'A15', 'A5', 'A10', 'A1']


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Create item-user matrix (items x users) for similarity calculation
users = ui[user_col].unique().tolist()
items = ui[item_col].unique().tolist()
user_index = {u:i for i,u in enumerate(users)}
item_index = {it:i for i,it in enumerate(items)}

rows = [item_index[it] for it in ui[item_col]]
cols = [user_index[u] for u in ui[user_col]]
data = ui['weight'].astype(float).values
mat = csr_matrix((data, (rows, cols)), shape=(len(items), len(users)))  # item x user

# cosine similarity between items
item_sim = cosine_similarity(mat, dense_output=False)  # might be large but OK for moderate item counts

# recommend function: for a given user, score items by similarity to items they interacted with
def recommend_item_item(user_id, k=10):
    # items user interacted in train
    user_items = train_ui[train_ui[user_col]==user_id][item_col].tolist()
    if not user_items:
        return recommend_popular(user_id, k)
    scores = np.zeros(len(items))
    for it in user_items:
        idx = item_index.get(it)
        if idx is None: continue
        # add similarity scores (sum)
        scores += np.array(item_sim[idx].todense()).reshape(-1)
    # zero-out items already seen
    seen_idx = [item_index[it] for it in user_items if it in item_index]
    scores[seen_idx] = -np.inf
    rec_idx = np.argsort(-scores)[:k]
    return [items[i] for i in rec_idx]

# Example recommend
some_user = train_ui[user_col].iloc[0]
print("Recommendations (item-item) for user", some_user, ":", recommend_item_item(some_user, k=10))

Recommendations (item-item) for user 1 : ['A21', 'A15', 'A11', 'B1', 'B16', 'C57', 'B3', 'A18', 'B2', 'A33']


In [6]:
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split as s_train_test_split

# Prepare surprise dataset (we need a rating column; use 'weight')
reader = Reader(rating_scale=(ui['weight'].min(), ui['weight'].max()))
data = Dataset.load_from_df(train_ui[[user_col, item_col, 'weight']], reader)
trainset = data.build_full_trainset()
# build validation/testset from our test_ui
testset = list(test_ui[[user_col, item_col, 'weight']].itertuples(index=False, name=None))

# train SVD
algo = SVD(n_factors=50, n_epochs=20, random_state=42)
algo.fit(trainset)
# Predict on testset
predictions = algo.test(testset)
print("Sample prediction:", predictions[:3])
# Compute RMSE (not ideal for implicit but gives a number)
print("RMSE:", accuracy.rmse(predictions))

Sample prediction: [Prediction(uid=1, iid='C57', r_ui=2, est=2, details={'was_impossible': False}), Prediction(uid=2, iid='B27', r_ui=4, est=2.109546296152035, details={'was_impossible': False}), Prediction(uid=3, iid='C7', r_ui=2, est=2, details={'was_impossible': False})]
RMSE: 0.7497
RMSE: 0.7497193732676399


In [21]:
# Convert IDs to string to avoid mapping issues
df[user_col] = df[user_col].astype(str)
df[item_col] = df[item_col].astype(str)

# Interaction weight: 1 for view, +1 if purchased (price>0)
df['interaction'] = 1
df.loc[df[price_col] > 0, 'interaction'] += 1

# Aggregate by user-item
ui = df.groupby([user_col, item_col])['interaction'].sum().reset_index()
ui.rename(columns={'interaction':'weight'}, inplace=True)
display(ui.head())

users = ui[user_col].unique()
items = ui[item_col].unique()

user_index = {u:i for i,u in enumerate(users)}
item_index = {it:i for i,it in enumerate(items)}

rows = [user_index[u] for u in ui[user_col]]
cols = [item_index[it] for it in ui[item_col]]
data = ui['weight'].values

user_item_mat = csr_matrix((data, (rows, cols)), shape=(len(users), len(items)))
print("User-Item matrix shape:", user_item_mat.shape)

# Train ALS Model

model = implicit.als.AlternatingLeastSquares(factors=20, 
                                             regularization=0.1, 
                                             iterations=20)

# Fit model on users x items
model.fit(user_item_mat)


# Recommendation Function

def recommend_als(user_id, k=10):
    uid = str(user_id)
    if uid not in user_index:
        return []  # fallback if user not in training set
    uidx = user_index[uid]
    recommended_items, scores = model.recommend(
        userid=uidx,
        user_items=user_item_mat,
        N=k,
        filter_already_liked_items=False
    )
    # Map indices back to item IDs
    return [items[i] for i in recommended_items]

# Example Recommendation

some_user = df[user_col].iloc[0]  # pick first user
print(f"ALS Recommendations for User {some_user}:", recommend_als(some_user, 10))

Unnamed: 0,session ID,page 2 (clothing model),weight
0,1,A13,2
1,1,A16,2
2,1,B17,2
3,1,B4,2
4,1,B8,2


User-Item matrix shape: (24026, 217)


  0%|          | 0/20 [00:00<?, ?it/s]

ALS Recommendations for User 1: ['B4', 'A13', 'A16', 'B17', 'C56', 'B1', 'C57', 'A21', 'B16', 'C55']


In [23]:
import math

def apk(actual, predicted, k):
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
    if not actual:
        return 0.0
    return score / min(len(actual), k)

def mapk(actual_list, predicted_list, k):
    return np.mean([apk(a,p,k) for a,p in zip(actual_list, predicted_list)])

def precision_at_k(actual, predicted, k):
    return np.mean([len(set(p[:k]) & set(a)) / k for a,p in zip(actual,predicted)])

def recall_at_k(actual, predicted, k):
    return np.mean([len(set(p[:k]) & set(a)) / len(a) if len(a)>0 else 0 for a,p in zip(actual,predicted)])

test_truth = test_ui.groupby(user_col)[item_col].apply(list).to_dict()

def evaluate_recommender(rec_func, users, k=10):
    actual_list = []
    predicted_list = []
    for u in users:
        actual = test_truth.get(u, [])
        predicted = rec_func(u, k)
        actual_list.append(actual)
        predicted_list.append(predicted)
    return {
        'MAP@K': mapk(actual_list, predicted_list, k),
        'Prec@K': precision_at_k(actual_list, predicted_list, k),
        'Recall@K': recall_at_k(actual_list, predicted_list, k)
    }

# Evaluate item-item
eval_users = list(test_truth.keys())
print("Evaluating item-item ...")
print(evaluate_recommender(recommend_item_item, eval_users, k=10))

# Evaluate popularity baseline
print("Evaluating popularity ...")
print(evaluate_recommender(lambda u,k: recommend_popular(u,k), eval_users, k=10))

Evaluating item-item ...
{'MAP@K': 0.06120688770931098, 'Prec@K': 0.017964458804523426, 'Recall@K': 0.17964458804523425}
Evaluating popularity ...
{'MAP@K': 0.06120688770931098, 'Prec@K': 0.017964458804523426, 'Recall@K': 0.17964458804523425}


In [24]:
# Cell I: CV for SVD (Surprise)
from surprise.model_selection import cross_validate, GridSearchCV

param_grid = {'n_epochs':[10,20], 'n_factors':[20,50]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, n_jobs=-1)
data_all = Dataset.load_from_df(ui[[user_col, item_col, 'weight']], reader)
gs.fit(data_all)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.8605721249535874
{'n_epochs': 20, 'n_factors': 20}


In [28]:
out = []
for u in eval_users:
    out.append({
        'session_id': u,
        'popular_recs': ",".join(recommend_popular(u, 10)),
        'item_item_recs': ",".join(recommend_item_item(u, 10))
    })
pd.DataFrame(out).to_csv("recommendations_examples.csv", index=False)
print("Saved recommendations_examples.csv")

Saved recommendations_examples.csv
