In [None]:
# Importing Libraries
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive so the project files stored there are
# reachable from this runtime. The google.colab module is only available on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Change the current working directory to the project folder on the mounted
# Drive; the relative 'data/...' paths used when loading the CSV files are
# resolved against this directory.
os.chdir("/content/drive/MyDrive/H&M_Personalized_Recommendations")

In [None]:
# Load the articles data (path is relative to the current working directory).
# Later cells read the 'article_id', 'detail_desc', 'prod_name' and
# 'product_type_name' columns of this frame.
articles = pd.read_csv('data/articles.csv')

In [None]:
# Load the transactions data (path is relative to the current working directory).
# Later cells read the 't_dat', 'customer_id' and 'article_id' columns.
# NOTE(review): 't_dat' is compared against 'YYYY-MM-DD' strings further down,
# which presumably relies on it being stored as ISO-formatted text — confirm.
transactions = pd.read_csv('data/transactions_train.csv')

In [None]:
# Load the customers data (path is relative to the current working directory).
customers = pd.read_csv('data/customers.csv')

## Content-Based Recommender using TF-IDF & cosine similarity

### Helper Functions

In [None]:
# Clean and preprocess text data
def preprocess_text(text):
    """Normalise a free-text value for vectorisation.

    Strings are lower-cased, stripped of every character that is not an ASCII
    letter or whitespace, and have runs of whitespace collapsed to a single
    space (leading/trailing whitespace is dropped). Any non-string input
    (e.g. NaN or None) yields an empty string.
    """
    # Guard clause: anything that is not a str is treated as "no text"
    if not isinstance(text, str):
        return ''

    # Lower-case first, then drop digits / punctuation / non-ASCII characters
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # split() + join collapses whitespace runs and trims both ends
    return ' '.join(letters_only.split())

In [None]:
# Prepare the description data, create TF-IDF matrix and calc. cosine similarity
def prepare_content_similarity(articles_df):
    """Return the pairwise cosine similarity of the articles' descriptions.

    The 'detail_desc' column of ``articles_df`` is cleaned with
    ``preprocess_text``, vectorised with TF-IDF (English stop words removed,
    vocabulary capped at 5000 terms) and compared against itself. Row/column
    ``i`` of the result corresponds to the ``i``-th row of ``articles_df``.
    """
    # Clean every description (non-string values become empty strings)
    cleaned_descriptions = articles_df['detail_desc'].map(preprocess_text)

    # Turn the cleaned descriptions into TF-IDF vectors
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    description_vectors = vectorizer.fit_transform(cleaned_descriptions)

    # Similarity of every article against every other article
    return cosine_similarity(description_vectors, description_vectors)

In [None]:
def get_customer_purchases(transactions_df,customer_id):
    """Return the distinct article ids bought by ``customer_id``.

    Rows of ``transactions_df`` whose 'customer_id' equals ``customer_id`` are
    selected and the unique values of their 'article_id' column are returned
    as an array, in order of first appearance. The array is empty when the
    customer has no rows.
    """
    # Boolean mask of the rows that belong to this customer
    is_customer = transactions_df['customer_id'] == customer_id
    return transactions_df.loc[is_customer, 'article_id'].unique()

In [None]:
# Calculate average similarity scores based on customer's purchase history
def calc_avg_similarity_score(customer_id, customer_purchases, articles_df, transactions_df, cosine_sim):
    """Average the similarity rows of the articles a customer has purchased.

    Parameters
    ----------
    customer_id : any
        Not used by the computation; kept so existing callers keep working.
    customer_purchases : sequence of article ids (or None)
        Articles bought by the customer.
    articles_df : pandas.DataFrame
        Must contain an 'article_id' column. ``cosine_sim`` is assumed to be
        ordered like the rows of this frame (row position, not index label).
    transactions_df : any
        Not used by the computation; kept so existing callers keep working.
    cosine_sim : 2-D array
        Article-to-article similarity matrix.

    Returns
    -------
    numpy.ndarray or None
        A 1-D array with one averaged score per row of ``cosine_sim``, or
        None when there are no purchases or none of them is present in
        ``articles_df``.
    """
    if customer_purchases is None or len(customer_purchases) == 0:
        return None

    # Plain array of catalogue ids: lets us resolve *positions* (which is what
    # cosine_sim is indexed by) independently of the DataFrame's index labels.
    catalogue_ids = articles_df['article_id'].to_numpy()

    # Running sum of the similarity rows of every purchase found in the catalogue
    similarity_scores = np.zeros(cosine_sim.shape[0])
    valid_purchases = 0

    for article_id in customer_purchases:
        positions = np.flatnonzero(catalogue_ids == article_id)
        if len(positions) > 0:
            # First match wins, mirroring a lookup of the first matching row
            similarity_scores += cosine_sim[positions[0]]
            valid_purchases += 1

    # Check before dividing so an unknown-articles-only history does not
    # trigger a division by zero (NaN array + RuntimeWarning).
    if valid_purchases == 0:
        return None

    return similarity_scores / valid_purchases


In [None]:
# Get top N recommendations based on similarity scores
def get_top_recommendations(customer_purchases, similarity_scores,transactions_df,articles_df,n_recommendations=12):
    """Return the top ``n_recommendations`` articles for a customer.

    Parameters
    ----------
    customer_purchases : sequence of article ids
        Articles the customer already bought; they are never recommended by
        the similarity branch.
    similarity_scores : 1-D array or None
        One score per row of ``articles_df`` (same row order). When None, the
        function falls back to the most purchased articles.
    transactions_df : pandas.DataFrame
        Transactions used for the popularity fallback ('article_id' column).
    articles_df : pandas.DataFrame
        Catalogue with 'article_id', 'prod_name' and 'product_type_name'.
    n_recommendations : int, default 12
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'article_id', 'prod_name', 'product_type_name'; the
        similarity branch additionally carries 'similarity_score'.
    """
    output_columns = ['article_id', 'prod_name', 'product_type_name']

    # No usable scores (no history, or none of the purchased articles is in
    # the catalogue): recommend the most purchased articles instead.
    if similarity_scores is None:
        # Count purchases in the frame we were given, not in a notebook global,
        # so the fallback cannot see transactions outside the caller's window.
        popular_items = (
            transactions_df['article_id']
            .value_counts()
            .rename_axis('article_id')
            .reset_index(name='purchase_count')
        )

        # Inner merge keeps the popularity order and drops unknown articles
        recommendations = (
            popular_items
            .merge(articles_df, on='article_id')
            .head(n_recommendations)
            .reset_index(drop=True)
        )
        return recommendations[output_columns]

    similarity_scores = np.asarray(similarity_scores)

    # Row positions ordered by similarity score (highest to lowest)
    ranked_positions = np.argsort(similarity_scores)[::-1]

    # Drop everything the customer already owns, then keep the best N
    ranked_article_ids = articles_df['article_id'].to_numpy()[ranked_positions]
    not_purchased = ~np.isin(ranked_article_ids, customer_purchases)
    top_positions = ranked_positions[not_purchased][:n_recommendations]

    # Create final recommendations dataframe
    recommendations = articles_df.iloc[top_positions][output_columns].copy()
    recommendations['similarity_score'] = similarity_scores[top_positions]

    return recommendations

In [None]:
# Using MAP@12 for evaluation
def evaluate_recommendations(true_purchases, recommendations, k=12):
    """Average precision at ``k`` for one customer.

    Parameters
    ----------
    true_purchases : iterable of article ids
        Ground-truth purchases. Duplicates are ignored.
    recommendations : iterable of article ids
        Ranked recommendations, best first (list, array or pandas Series).
        Only the first ``k`` are scored.
    k : int, default 12
        Cut-off rank.

    Returns
    -------
    float
        Sum of precision@i over the ranks ``i`` holding a not-yet-seen
        relevant item, divided by ``min(number of distinct true purchases, k)``.
        0.0 when there are no true purchases or ``k`` is not positive.
    """
    relevant = set(true_purchases)
    if not relevant or k <= 0:
        # Nothing to hit: avoid dividing by zero
        return 0.0

    # Materialise the *values* in rank order; membership tests on a pandas
    # Series would look at index labels instead of the recommended ids.
    ranked_items = list(recommendations)[:k]

    score = 0.0
    num_hits = 0.0
    already_seen = set()

    for rank, item in enumerate(ranked_items, 1):
        # A repeated recommendation only counts the first time it appears
        if item in relevant and item not in already_seen:
            num_hits += 1.0
            score += num_hits / rank
        already_seen.add(item)

    return score / min(len(relevant), k)

### Implementation

In [None]:
train_start_date = '2020-08-11'
train_end_date = '2020-09-15'

# Training window is (train_start_date, train_end_date]: the start date itself
# is excluded, the end date is included. Everything after the end date is test.
transaction_dates = transactions['t_dat']
in_train_window = (transaction_dates > train_start_date) & (transaction_dates <= train_end_date)
after_train_window = transaction_dates > train_end_date

train_data = transactions[in_train_window]
test_data = transactions[after_train_window]

In [None]:
# Build the article-to-article cosine-similarity matrix from the TF-IDF vectors
# of the article descriptions; its rows follow the row order of `articles`.
# NOTE(review): the matrix has one row and one column per article, so memory
# grows quadratically with the catalogue size — confirm it fits the runtime.
similarity_matrix = prepare_content_similarity(articles)

#### Get recommendations for an existing customer

In [None]:
# Distinct customer ids seen in the training window and in the test window
train_customers = set(train_data['customer_id'].unique())
test_customers = set(test_data['customer_id'].unique())

# Customers present in both windows (they have history AND ground truth)
common_customers = list(train_customers & test_customers)

In [None]:
# Show how many customers qualify plus a small sample; printing the whole list
# produces an output too large for the notebook to display.
print(f"{len(common_customers)} customers appear in both train and test")
print(common_customers[:5])

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Recommended items
# Name the customer once instead of repeating the literal in every call
existing_customer_id = '981aa7853b12c9a103d4bd269217ac4bdebd2f48b9b7b5baac6f9d8643d712b3'

train_purchases = get_customer_purchases(train_data, existing_customer_id)
test_purchases = get_customer_purchases(test_data, existing_customer_id)
similarity_scores = calc_avg_similarity_score(existing_customer_id, train_purchases, articles, train_data, similarity_matrix)
recommendations = get_top_recommendations(train_purchases, similarity_scores, train_data, articles, n_recommendations=12)
rec_items = recommendations['article_id']

# The recommendations frame already carries the product type, so no extra
# lookup in `articles` is needed; the loop variable avoids shadowing builtin id().
for article_id, product_type in zip(rec_items, recommendations['product_type_name']):
    print(article_id, product_type)

880312003 T-shirt
880312005 T-shirt
880238002 T-shirt
880238006 T-shirt
880238005 T-shirt
880238003 T-shirt
516614001 T-shirt
516614002 T-shirt
751387001 Sweater
679505001 Vest top
753906003 T-shirt
753906004 T-shirt


In [None]:
# Purchases made during the training period (one line per transaction row,
# so an article bought several times is printed several times)
pur_items = train_data[train_data['customer_id'] == '981aa7853b12c9a103d4bd269217ac4bdebd2f48b9b7b5baac6f9d8643d712b3']['article_id'].tolist()

# Loop variable is named article_id so the builtin id() is not shadowed
for article_id in pur_items:
    print(article_id, articles[articles['article_id'] == article_id]['product_type_name'].values[0])

880312004 T-shirt


In [None]:
# Actual purchases in the test period (one line per transaction row,
# so an article bought several times is printed several times)
pur_items = test_data[test_data['customer_id'] == '981aa7853b12c9a103d4bd269217ac4bdebd2f48b9b7b5baac6f9d8643d712b3']['article_id'].tolist()

# Loop variable is named article_id so the builtin id() is not shadowed
for article_id in pur_items:
    print(article_id, articles[articles['article_id'] == article_id]['product_type_name'].values[0])

547780040 Trousers
853589004 T-shirt
818754007 Top
806388018 T-shirt
855239003 Sweater
806388018 T-shirt
759814020 Trousers


#### Get recommendations for a new customer

In [None]:
# Distinct customer ids seen in the training window and in the test window
train_customers = set(train_data['customer_id'].unique())
test_customers = set(test_data['customer_id'].unique())

# Customers that only show up in the test window, i.e. with no training history
new_customers = list(test_customers.difference(train_customers))

In [None]:
# Show how many customers qualify plus a small sample; printing the whole list
# produces an output too large for the notebook to display.
print(f"{len(new_customers)} customers appear in test but not in train")
print(new_customers[:5])

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Recommended items
# Name the customer once instead of repeating the literal in every call
new_customer_id = '4e86b6c5680e507e614cddc7ded07e8a5fc41aa7c725a2852c9590d3df1834bf'

train_purchases = get_customer_purchases(train_data, new_customer_id)
test_purchases = get_customer_purchases(test_data, new_customer_id)
similarity_scores = calc_avg_similarity_score(new_customer_id, train_purchases, articles, train_data, similarity_matrix)
recommendations = get_top_recommendations(train_purchases, similarity_scores, train_data, articles, n_recommendations=12)
rec_items = recommendations['article_id']

# The recommendations frame already carries the product type, so no extra
# lookup in `articles` is needed; the loop variable avoids shadowing builtin id().
for article_id, product_type in zip(rec_items, recommendations['product_type_name']):
    print(article_id, product_type)

706016001 Trousers
706016002 Trousers
372860001 Socks
610776002 T-shirt
759871002 Vest top
464297007 Underwear bottom
372860002 Socks
610776001 T-shirt
399223001 Trousers
706016003 Trousers
720125001 Leggings/Tights
156231001 Underwear Tights


In [None]:
# Actual purchases in the test period (one line per transaction row)
pur_items = test_data[test_data['customer_id'] == '4e86b6c5680e507e614cddc7ded07e8a5fc41aa7c725a2852c9590d3df1834bf']['article_id'].tolist()

# Loop variable is named article_id so the builtin id() is not shadowed
for article_id in pur_items:
    print(article_id, articles[articles['article_id'] == article_id]['product_type_name'].values[0])

673677027 Sweater


#### Evaluate on test set

In [None]:
# Every customer with at least one purchase in the test window is evaluated.
# Stored under its own name so the `customers` DataFrame loaded from
# data/customers.csv is not overwritten by an array of ids.
test_customer_ids = test_data['customer_id'].unique()
batch_size = 5000

# Create batches: ceil(len / batch_size) of them, but never fewer than one
# because np.array_split rejects a section count of zero.
n_batches = max(1, -(-len(test_customer_ids) // batch_size))
customer_batches = np.array_split(test_customer_ids, n_batches)

all_map_scores = []

In [None]:
# Start from an empty accumulator so re-running this cell does not append the
# same customers' scores a second time.
all_map_scores = []

# Group the purchases once up front. Filtering the full transaction frames for
# every single customer re-scans them on each iteration, which dominates the
# runtime of this loop. Each value is the array of distinct article ids of
# that customer, in order of first appearance.
train_purchases_by_customer = train_data.groupby('customer_id')['article_id'].unique().to_dict()
test_purchases_by_customer = test_data.groupby('customer_id')['article_id'].unique().to_dict()

# Stand-in for customers without any row in a window
no_purchases = np.array([], dtype=train_data['article_id'].dtype)

for batch in tqdm(customer_batches, desc="Processing customer batches"):
    batch_map_scores = []
    for customer in batch:
        # Get training purchases for recommendations
        train_purchases = train_purchases_by_customer.get(customer, no_purchases)
        # Get test purchases for evaluation (ground truth)
        test_purchases = test_purchases_by_customer.get(customer, no_purchases)
        # Calculate similarity scores using training data
        similarity_scores = calc_avg_similarity_score(customer, train_purchases, articles, train_data, similarity_matrix)
        # Get recommendations based on training data
        recommendations = get_top_recommendations(train_purchases, similarity_scores, train_data, articles, n_recommendations=12)

        # Plain list of ids in rank order: the metric must compare values,
        # and membership tests on a pandas Series look at index labels.
        recommended_items = recommendations['article_id'].tolist()

        # Evaluate recommendations against test purchases
        map_score = evaluate_recommendations(test_purchases, recommended_items)
        batch_map_scores.append(map_score)

    # Extend the main list with batch results
    all_map_scores.extend(batch_map_scores)

    # Print intermediate results for monitoring
    batch_average = sum(batch_map_scores) / len(batch_map_scores) if batch_map_scores else 0
    print(f"Batch average MAP@12: {batch_average:.4f}")


Processing customer batches:   7%|▋         | 1/14 [1:01:09<13:15:03, 3669.48s/it]

Batch average MAP@12: 0.0063


Processing customer batches:  14%|█▍        | 2/14 [1:59:57<11:57:14, 3586.22s/it]

Batch average MAP@12: 0.0050


Processing customer batches:  21%|██▏       | 3/14 [3:02:02<11:09:06, 3649.69s/it]

Batch average MAP@12: 0.0064


Processing customer batches:  29%|██▊       | 4/14 [4:02:58<10:08:39, 3651.98s/it]

Batch average MAP@12: 0.0051


Processing customer batches:  36%|███▌      | 5/14 [5:07:50<9:20:48, 3738.72s/it] 

Batch average MAP@12: 0.0042


Processing customer batches:  43%|████▎     | 6/14 [6:11:31<8:22:13, 3766.71s/it]

Batch average MAP@12: 0.0046


Processing customer batches:  50%|█████     | 7/14 [7:17:34<7:26:55, 3830.74s/it]

Batch average MAP@12: 0.0055


Processing customer batches:  57%|█████▋    | 8/14 [8:26:07<6:32:03, 3920.56s/it]

Batch average MAP@12: 0.0041


Processing customer batches:  64%|██████▍   | 9/14 [9:34:34<5:31:34, 3978.99s/it]

Batch average MAP@12: 0.0053


Processing customer batches:  71%|███████▏  | 10/14 [10:45:06<4:30:28, 4057.21s/it]

Batch average MAP@12: 0.0038


Processing customer batches:  79%|███████▊  | 11/14 [11:57:37<3:27:21, 4147.07s/it]

Batch average MAP@12: 0.0050


Processing customer batches:  86%|████████▌ | 12/14 [13:08:50<2:19:30, 4185.15s/it]

Batch average MAP@12: 0.0037


Processing customer batches:  93%|█████████▎| 13/14 [14:20:09<1:10:13, 4213.73s/it]

Batch average MAP@12: 0.0051


Processing customer batches: 100%|██████████| 14/14 [15:32:34<00:00, 3996.74s/it]

Batch average MAP@12: 0.0036





In [None]:
# Final MAP@12: plain mean of the per-customer scores (0 when nothing was scored)
if all_map_scores:
    average_map12 = sum(all_map_scores) / len(all_map_scores)
else:
    average_map12 = 0

print(f"\nAverage MAP@12 score across test set: {average_map12:.4f}")


Average MAP@12 score across test set: 0.0048
