In [1]:
# Importing Libraries
import random
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# change current directory
os.chdir("/content/drive/MyDrive/H&M_Personalized_Recommendations")

In [4]:
# Load the articles data
articles = pd.read_csv('data/articles.csv')

In [5]:
# Load the transactions data
transactions = pd.read_csv('data/transactions_train.csv')

In [6]:
# Load the customers data
customers = pd.read_csv('data/customers.csv')

## Item Based Collaborative Filtering

### Helper Functions

In [7]:
# Creating a user-item interaction matrix
def create_sparse_user_item_matrix(transactions):
    user_ids = transactions['customer_id'].astype('category').cat.codes
    item_ids = transactions['article_id'].astype('category').cat.codes

    user_id_map = dict(enumerate(transactions['customer_id'].astype('category').cat.categories))
    item_id_map = dict(enumerate(transactions['article_id'].astype('category').cat.categories))

    data = np.ones(len(transactions))

    sparse_matrix = csr_matrix((data, (user_ids, item_ids)))
    return sparse_matrix, user_id_map, item_id_map

In [8]:
# Computing item-item similarity using cosine similarity
def calculate_item_similarity(sparse_matrix):
    item_similarity = cosine_similarity(sparse_matrix.T)
    return item_similarity

In [9]:
# Generating recommendations based on similar items
def get_item_recommendations(customer,user_id_map, sparse_matrix, item_similarity,articles=articles,n=12):

    if customer in user_id_map.values():
        user_index = list(user_id_map.values()).index(customer)
        user_vector = sparse_matrix[user_index].toarray().flatten()
        purchased_items = user_vector.nonzero()[0]

        # For existing customers, use collaborative filtering
        scores = item_similarity[purchased_items].sum(axis=0)
        scores[purchased_items] = -1  # Exclude already purchased items

        # Get top N recommendations
        top_items = scores.argsort()[-n:][::-1]

        recommendations_df = articles[articles['article_id'].isin([item_id_map[i] for i in top_items])].copy()
        recommendations_df = recommendations_df.reset_index()

        return recommendations_df[['article_id','prod_name', 'product_type_name']]

    else:
        #For new customers
        # Calculate item popularity from the sparse matrix
        popular_items = sparse_matrix.sum(axis=0).A1  # Sum across all users

        # Get top N most popular items
        top_items = popular_items.argsort()[-n:][::-1]

        recommendations_df = articles[articles['article_id'].isin([item_id_map[i] for i in top_items])].copy()
        recommendations_df = recommendations_df.reset_index()

        return recommendations_df[['article_id','prod_name', 'product_type_name']]




In [10]:
# Using MAP@12 for evaluation
def evaluate_recommendations(true_purchases, recommendations):
    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(recommendations, 1):
        if p in true_purchases and p not in recommendations[:i-1]:
            num_hits += 1.0
            score += num_hits/i

    return score / min(len(true_purchases),12)

### Implement

In [11]:
# Split the data into train & test
train_start_date = '2020-08-11'
train_end_date = '2020-09-15'
train_data = transactions[(transactions['t_dat'] >= train_start_date) & (transactions['t_dat'] <= train_end_date)]
test_data = transactions[transactions['t_dat'] > train_end_date]

In [13]:
# Create sparse user-item matrix from training data
sparse_matrix, user_id_map, item_id_map = create_sparse_user_item_matrix(train_data)

In [14]:
# Calculate item similarity
item_similarity = calculate_item_similarity(sparse_matrix)

#### Get recommendations for existing customer

In [18]:
# Get unique customers from both sets
train_customers = set(train_data['customer_id'].unique())
test_customers = set(test_data['customer_id'].unique())

# Find common customers
common_customers = list(train_customers.intersection(test_customers))

In [19]:
print(common_customers)

Output hidden; open in https://colab.research.google.com to view.

In [20]:
# Recommended items
recommendations = get_item_recommendations('981aa7853b12c9a103d4bd269217ac4bdebd2f48b9b7b5baac6f9d8643d712b3', user_id_map,sparse_matrix, item_similarity)
recommended_items = recommendations['article_id']

for id in recommended_items:
    print(id, articles[articles['article_id'] == id]['product_type_name'].values[0])

624486090 T-shirt
749615006 Trousers
755759002 Cardigan
826164004 Dress
837776002 Trousers
839194001 T-shirt
852536002 Top
879384001 Jacket
880312003 T-shirt
880312005 T-shirt
881940003 Vest top
920389001 Dress


In [21]:
#purchases made during training period
pur_items = train_data[train_data['customer_id'] == '981aa7853b12c9a103d4bd269217ac4bdebd2f48b9b7b5baac6f9d8643d712b3']['article_id'].tolist()

for id in pur_items:
    print(id, articles[articles['article_id'] == id]['product_type_name'].values[0])

880312004 T-shirt


In [22]:
#actual purchases in test period
pur_items = test_data[test_data['customer_id'] == '981aa7853b12c9a103d4bd269217ac4bdebd2f48b9b7b5baac6f9d8643d712b3']['article_id'].tolist()

for id in pur_items:
    print(id, articles[articles['article_id'] == id]['product_type_name'].values[0])

547780040 Trousers
853589004 T-shirt
818754007 Top
806388018 T-shirt
855239003 Sweater
806388018 T-shirt
759814020 Trousers


#### Get recommendations for new customer

In [23]:
# Get unique customers from both sets
train_customers = set(train_data['customer_id'].unique())
test_customers = set(test_data['customer_id'].unique())

# Find customers that are in test but not in train
new_customers = list(test_customers - train_customers)

In [24]:
print(new_customers)

Output hidden; open in https://colab.research.google.com to view.

In [25]:
# Recommended items
recommendations = get_item_recommendations('4e86b6c5680e507e614cddc7ded07e8a5fc41aa7c725a2852c9590d3df1834bf', user_id_map,sparse_matrix, item_similarity)
recommended_items = recommendations['article_id']

for id in recommended_items:
    print(id, articles[articles['article_id'] == id]['product_type_name'].values[0])

448509014 Trousers
706016001 Trousers
751471001 Trousers
751471043 Trousers
850917001 Shirt
863595006 Cardigan
896152002 T-shirt
915526001 Sweater
915526002 Sweater
915529003 Sweater
916468003 Cardigan
918292001 Leggings/Tights


In [26]:
#actual purchases in test period
pur_items = test_data[test_data['customer_id'] == '4e86b6c5680e507e614cddc7ded07e8a5fc41aa7c725a2852c9590d3df1834bf']['article_id'].tolist()

for id in pur_items:
    print(id, articles[articles['article_id'] == id]['product_type_name'].values[0])

673677027 Sweater


#### Evaluate on Test set

In [15]:
customers = test_data['customer_id'].unique()
batch_size=5000
# Create batches
n_batches = len(customers) // batch_size + (1 if len(customers) % batch_size != 0 else 0)
customer_batches = np.array_split(customers, n_batches)

all_map_scores = []

In [16]:
for batch in tqdm(customer_batches, desc="Processing customer batches"):
    batch_map_scores = []
    for customer in batch:
        recommendations = get_item_recommendations(customer, user_id_map,sparse_matrix, item_similarity)
        recommended_items = recommendations['article_id']

        customer_purchases = test_data[test_data['customer_id'] == customer]['article_id'].tolist()
        map_score = evaluate_recommendations(customer_purchases, recommended_items)
        batch_map_scores.append(map_score)

    # Extend the main list with batch results
    all_map_scores.extend(batch_map_scores)

    # Print intermediate results for monitoring
    batch_average = sum(batch_map_scores) / len(batch_map_scores) if batch_map_scores else 0
    print(f"Batch average MAP@12: {batch_average:.4f}")

Processing customer batches:   7%|▋         | 1/14 [02:58<38:37, 178.30s/it]

Batch average MAP@12: 0.0060


Processing customer batches:  14%|█▍        | 2/14 [06:14<37:43, 188.62s/it]

Batch average MAP@12: 0.0056


Processing customer batches:  21%|██▏       | 3/14 [09:11<33:39, 183.63s/it]

Batch average MAP@12: 0.0056


Processing customer batches:  29%|██▊       | 4/14 [12:33<31:47, 190.77s/it]

Batch average MAP@12: 0.0058


Processing customer batches:  36%|███▌      | 5/14 [15:42<28:30, 190.04s/it]

Batch average MAP@12: 0.0048


Processing customer batches:  43%|████▎     | 6/14 [18:46<25:04, 188.12s/it]

Batch average MAP@12: 0.0055


Processing customer batches:  50%|█████     | 7/14 [22:07<22:25, 192.26s/it]

Batch average MAP@12: 0.0047


Processing customer batches:  57%|█████▋    | 8/14 [25:08<18:51, 188.61s/it]

Batch average MAP@12: 0.0040


Processing customer batches:  64%|██████▍   | 9/14 [28:32<16:07, 193.48s/it]

Batch average MAP@12: 0.0045


Processing customer batches:  71%|███████▏  | 10/14 [31:32<12:37, 189.37s/it]

Batch average MAP@12: 0.0048


Processing customer batches:  79%|███████▊  | 11/14 [34:45<09:31, 190.42s/it]

Batch average MAP@12: 0.0052


Processing customer batches:  86%|████████▌ | 12/14 [37:52<06:18, 189.48s/it]

Batch average MAP@12: 0.0056


Processing customer batches:  93%|█████████▎| 13/14 [40:57<03:08, 188.08s/it]

Batch average MAP@12: 0.0062


Processing customer batches: 100%|██████████| 14/14 [44:15<00:00, 189.67s/it]

Batch average MAP@12: 0.0052





In [17]:
# Calculate final average MAP@12
average_map12 = sum(all_map_scores) / len(all_map_scores) if all_map_scores else 0
print(f"\nAverage MAP@12 score across test set: {average_map12:.4f}")


Average MAP@12 score across test set: 0.0052
