In [None]:
# Importing Libraries
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import re
from tqdm import tqdm

In [None]:
# Mount Google Drive into the runtime's filesystem at /content/drive so the
# project folder stored on Drive can be accessed like a local directory.
# NOTE(review): `google.colab` is presumably only importable inside Google
# Colab, so this cell is not expected to run in a local Jupyter session.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the project folder on the mounted Drive the working directory, so the
# relative 'data/...' paths passed to pd.read_csv below resolve against it.
os.chdir("/content/drive/MyDrive/H&M_Personalized_Recommendations")

In [None]:
# Load the article catalogue. The code below reads its 'article_id',
# 'prod_name', 'product_type_name' and 'detail_desc' columns.
articles = pd.read_csv('data/articles.csv')

In [None]:
# Load the purchase history. The code below reads its 't_dat', 'customer_id'
# and 'article_id' columns. No date parsing is requested here, so 't_dat' is
# presumably kept as text and later compared as strings — TODO confirm.
transactions = pd.read_csv('data/transactions_train.csv')

In [None]:
# Load the customer metadata table.
customers = pd.read_csv('data/customers.csv')

## Content-Based Filtering using TF-IDF & KNN

### Helper Functions

In [None]:
def preprocess_text(text):
    """Normalise a free-text description before vectorisation.

    Args:
        text: The raw description. Anything that is not a ``str`` (for
            example a NaN standing in for a missing description) is treated
            as "no text".

    Returns:
        The lower-cased text with every character other than ASCII letters
        and whitespace removed and runs of whitespace collapsed to single
        spaces; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    lowered = text.lower()
    # Digits and punctuation are deleted, not replaced by a space, so a
    # hyphenated word such as "t-shirt" becomes "tshirt".
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    # split()/join collapses any whitespace run and trims both ends.
    return ' '.join(letters_only.split())

In [None]:
def create_content_features(articles_df):
    """Turn each article's ``detail_desc`` text into a TF-IDF feature row.

    Args:
        articles_df: DataFrame with a ``detail_desc`` column.

    Returns:
        The matrix produced by ``TfidfVectorizer.fit_transform``: one row per
        row of ``articles_df`` (in the same order) and at most 5000 term
        columns. The fitted vectorizer itself is not returned.
    """
    # Missing descriptions become '' inside preprocess_text, so every article
    # still gets a (possibly all-zero) feature row.
    cleaned_descriptions = articles_df['detail_desc'].apply(preprocess_text)

    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    return vectorizer.fit_transform(cleaned_descriptions)

In [None]:
def get_customer_purchases(transactions_df, customer_id):
    """Return the distinct articles bought by one customer.

    Args:
        transactions_df: DataFrame with ``customer_id`` and ``article_id``
            columns.
        customer_id: The customer to look up.

    Returns:
        Array of unique ``article_id`` values in order of first appearance;
        empty when the customer has no rows in ``transactions_df``.
    """
    belongs_to_customer = transactions_df['customer_id'] == customer_id
    return transactions_df.loc[belongs_to_customer, 'article_id'].unique()

In [None]:
def get_popular_items(transactions_df, articles_df, n_recommendations=12):
    """Return the best-selling articles, used as the cold-start fallback.

    Args:
        transactions_df: DataFrame with an ``article_id`` column, one row per
            purchase.
        articles_df: Catalogue with ``article_id``, ``prod_name`` and
            ``product_type_name`` columns.
        n_recommendations: How many articles to return.

    Returns:
        DataFrame with columns ``article_id``, ``prod_name``,
        ``product_type_name`` and ``similarity_score``, ordered from most to
        least purchased. ``similarity_score`` is the purchase count divided
        by the largest purchase count among the returned rows.
    """
    output_columns = ['article_id', 'prod_name', 'product_type_name']

    # value_counts() is already sorted from most to least frequent.
    purchase_counts = transactions_df['article_id'].value_counts()
    ranked = pd.DataFrame({
        'article_id': purchase_counts.index,
        'purchase_count': purchase_counts.to_numpy(),
    })

    # Attach the catalogue details first (inner join drops articles missing
    # from the catalogue), then keep only the top rows.
    top_sellers = ranked.merge(articles_df[output_columns], on='article_id').head(n_recommendations)

    # Scale the counts so the best seller scores 1.0.
    top_sellers = top_sellers.assign(
        similarity_score=top_sellers['purchase_count'] / top_sellers['purchase_count'].max()
    )

    return top_sellers[output_columns + ['similarity_score']].copy()

In [None]:
def fit_knn_model(articles_df, n_neighbors=7):
    """Build the content features and fit a nearest-neighbour index on them.

    Args:
        articles_df: Catalogue passed through to ``create_content_features``.
        n_neighbors: Default number of neighbours returned per query.

    Returns:
        Tuple ``(content_features, knn)``: the feature matrix and the
        ``NearestNeighbors`` model (cosine metric) fitted on it.
    """
    features = create_content_features(articles_df)
    # fit() returns the estimator itself, so the call can be chained.
    model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine').fit(features)
    return features, model

In [None]:
def get_knn_recommendations(customer_purchases, popular_items, articles_df,content_features, knn, n_recommendations=12):
    """Recommend articles similar in content to what a customer has bought.

    Args:
        customer_purchases: Array-like of ``article_id`` values the customer
            already bought. When empty, the popularity fallback is returned.
        popular_items: DataFrame of fallback recommendations for customers
            without purchases.
        articles_df: Catalogue with ``article_id``, ``prod_name`` and
            ``product_type_name`` columns. Row *i* must correspond to row *i*
            of ``content_features``; its index labels are not used.
        content_features: Feature matrix, one row per row of ``articles_df``.
        knn: Fitted model exposing ``kneighbors(X) -> (distances, indices)``
            where ``indices`` are row positions in ``content_features``.
        n_recommendations: Maximum number of rows to return.

    Returns:
        DataFrame with columns ``article_id``, ``similarity_score``,
        ``prod_name`` and ``product_type_name``, best match first.
        ``similarity_score`` is ``1 - distance`` as reported by ``knn``
        (the highest value when an article neighbours several purchases).
        Already purchased articles are excluded. Fewer than
        ``n_recommendations`` rows are returned when the neighbour search
        yields fewer candidates.
    """
    if len(customer_purchases) == 0:
        # Cold start: fall back to popular items, honouring the requested size.
        return popular_items.head(n_recommendations)

    catalog_ids = articles_df['article_id'].to_numpy()

    # Positions (not index labels) of the purchased articles in the catalogue.
    # Working with positions keeps the lookup into content_features correct
    # even when articles_df does not carry a default 0..n-1 index. Purchases
    # absent from the catalogue are skipped.
    purchased_positions = np.flatnonzero(np.isin(catalog_ids, customer_purchases))

    if purchased_positions.size:
        # One batched neighbour query for all purchases instead of one query
        # (and one full catalogue scan) per purchased article.
        distances, indices = knn.kneighbors(content_features[purchased_positions])
        similar_items = catalog_ids[np.asarray(indices).ravel()]
        similarity_scores = 1 - np.asarray(distances).ravel()
    else:
        # None of the purchases is in the catalogue: no candidates.
        similar_items = catalog_ids[:0]
        similarity_scores = np.empty(0, dtype=float)

    recommendations = pd.DataFrame({
        'article_id': similar_items,
        'similarity_score': similarity_scores
    })

    # An article can neighbour several purchases; keep its best score.
    recommendations = (recommendations.groupby('article_id')['similarity_score']
                      .max()
                      .reset_index()
                      .sort_values('similarity_score', ascending=False))

    # Each purchase is its own nearest neighbour, so drop purchased articles.
    recommendations = recommendations[~recommendations['article_id'].isin(customer_purchases)]
    recommendations = recommendations.head(n_recommendations)

    # Attach the human-readable article details.
    final_recommendations = recommendations.merge(
        articles_df[['article_id', 'prod_name', 'product_type_name']],
        on='article_id'
    ).copy()

    return final_recommendations

In [None]:
def evaluate_recommendations(true_purchases, recommendations, k=12):
    """Average precision at ``k`` (AP@k) for one customer.

    Averaging this value over customers gives MAP@k; the notebook uses k=12.

    Args:
        true_purchases: Iterable of the articles the customer actually
            bought (duplicates are ignored).
        recommendations: Ordered iterable of recommended articles, best
            first. Lists, arrays and pandas Series are all read by value.
        k: Cutoff; only the first ``k`` recommendations are scored.

    Returns:
        The sum of precision@rank over the ranks of correct, first-time
        recommendations, divided by ``min(number of distinct purchases, k)``.
        ``0.0`` when there are no true purchases or ``k`` is not positive.
    """
    relevant = set(true_purchases)
    if not relevant or k <= 0:
        # Nothing to hit: avoid dividing by zero.
        return 0.0

    # Materialise as a list so membership and slicing operate on values.
    # On a pandas Series, `x in series` checks the index labels, not the
    # recommended article ids.
    ranked = list(recommendations)[:k]

    already_seen = set()
    num_hits = 0
    score = 0.0
    for rank, item in enumerate(ranked, 1):
        # A repeated recommendation only counts the first time it appears.
        if item in relevant and item not in already_seen:
            num_hits += 1
            score += num_hits / rank
        already_seen.add(item)

    return score / min(len(relevant), k)

### Implementation

In [None]:
# Time-based split: the training window is (train_start_date, train_end_date],
# and everything after train_end_date is held out for testing.
# The bounds are compared against 't_dat' as plain strings.
# NOTE(review): assumes 't_dat' holds ISO 'YYYY-MM-DD' text so that string
# order equals chronological order — confirm against the CSV.
train_start_date, train_end_date = '2020-08-11', '2020-09-15'

train_data = transactions.loc[
    transactions['t_dat'].gt(train_start_date) & transactions['t_dat'].le(train_end_date)
]
test_data = transactions.loc[transactions['t_dat'].gt(train_end_date)]

In [None]:
# Best sellers of the training window (default: top 12). Used as the fallback
# recommendation for customers with no purchases in the training window.
popular_items = get_popular_items(train_data, articles)

In [None]:
# Vectorise every article description and fit the nearest-neighbour model on
# the full catalogue (default: 7 neighbours per query).
content_features, knn = fit_knn_model(articles)

#### Get recommendations for an existing customer

In [None]:
# Distinct customers seen in the training window and in the test window
train_customers = set(train_data['customer_id'].unique())
test_customers = set(test_data['customer_id'].unique())

# "Existing" customers: present in both windows
common_customers = list(train_customers & test_customers)

In [None]:
# Show how many customers qualify plus a small sample, instead of dumping
# every customer ID into the cell output.
print(f"{len(common_customers)} customers purchased in both the training and test periods")
print(common_customers[:5])

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Recommended items for one customer who has purchases in the training window
existing_customer_id = '981aa7853b12c9a103d4bd269217ac4bdebd2f48b9b7b5baac6f9d8643d712b3'
train_purchases = get_customer_purchases(train_data, existing_customer_id)
recommendations = get_knn_recommendations(train_purchases, popular_items, articles,content_features, knn)
rec_items = recommendations['article_id']

# The recommendation frame already carries each article's product type, so no
# extra catalogue lookup is needed. The loop variable is named `article_id`
# rather than `id` so the builtin `id()` is not shadowed in later cells.
for article_id, product_type in zip(rec_items, recommendations['product_type_name']):
    print(article_id, product_type)

880312003 T-shirt
880312005 T-shirt
880238002 T-shirt
880238003 T-shirt
880238005 T-shirt
880238006 T-shirt


In [None]:
# Purchases this customer made during the training period (repeat purchases
# are listed once per transaction row)
pur_items = train_data[train_data['customer_id'] == '981aa7853b12c9a103d4bd269217ac4bdebd2f48b9b7b5baac6f9d8643d712b3']['article_id'].tolist()

# Loop variable is `article_id`, not `id`, so the builtin `id()` stays usable.
for article_id in pur_items:
    product_type = articles.loc[articles['article_id'] == article_id, 'product_type_name'].iloc[0]
    print(article_id, product_type)

880312004 T-shirt


In [None]:
# Purchases this customer actually made in the test period (repeat purchases
# are listed once per transaction row)
pur_items = test_data[test_data['customer_id'] == '981aa7853b12c9a103d4bd269217ac4bdebd2f48b9b7b5baac6f9d8643d712b3']['article_id'].tolist()

# Loop variable is `article_id`, not `id`, so the builtin `id()` stays usable.
for article_id in pur_items:
    product_type = articles.loc[articles['article_id'] == article_id, 'product_type_name'].iloc[0]
    print(article_id, product_type)

547780040 Trousers
853589004 T-shirt
818754007 Top
806388018 T-shirt
855239003 Sweater
806388018 T-shirt
759814020 Trousers


#### Get recommendations for a new customer

In [None]:
# Distinct customers seen in the training window and in the test window
train_customers = set(train_data['customer_id'].unique())
test_customers = set(test_data['customer_id'].unique())

# "New" customers: bought in the test window but never in the training window
new_customers = list(test_customers.difference(train_customers))

In [None]:
# Recommended items for a customer with no purchases in the training window;
# get_knn_recommendations falls back to the popular items in that case.
new_customer_id = '4e86b6c5680e507e614cddc7ded07e8a5fc41aa7c725a2852c9590d3df1834bf'
train_purchases = get_customer_purchases(train_data, new_customer_id)
recommendations = get_knn_recommendations(train_purchases, popular_items, articles,content_features, knn)
rec_items = recommendations['article_id']

# The recommendation frame already carries each article's product type, so no
# extra catalogue lookup is needed. The loop variable is named `article_id`
# rather than `id` so the builtin `id()` is not shadowed in later cells.
for article_id, product_type in zip(rec_items, recommendations['product_type_name']):
    print(article_id, product_type)

751471001 Trousers
706016001 Trousers
918292001 Leggings/Tights
916468003 Cardigan
915526001 Sweater
751471043 Trousers
896152002 T-shirt
915529003 Sweater
448509014 Trousers
863595006 Cardigan
915526002 Sweater
850917001 Shirt


In [None]:
# Purchases this customer actually made in the test period (repeat purchases
# are listed once per transaction row)
pur_items = test_data[test_data['customer_id'] == '4e86b6c5680e507e614cddc7ded07e8a5fc41aa7c725a2852c9590d3df1834bf']['article_id'].tolist()

# Loop variable is `article_id`, not `id`, so the builtin `id()` stays usable.
for article_id in pur_items:
    product_type = articles.loc[articles['article_id'] == article_id, 'product_type_name'].iloc[0]
    print(article_id, product_type)

673677027 Sweater


#### Evaluate on the test set

In [None]:
# Every customer with at least one purchase in the test window is evaluated.
# Stored under its own name so the `customers` DataFrame loaded earlier is not
# overwritten by an array of IDs.
test_customer_ids = test_data['customer_id'].unique()
batch_size = 5000

# Ceiling division gives the number of batches; keep at least one so that
# np.array_split is never asked for zero sections when there are no customers.
n_batches = max(1, -(-len(test_customer_ids) // batch_size))
customer_batches = np.array_split(test_customer_ids, n_batches)

all_map_scores = []

In [None]:
# Group each window's purchases by customer once. Looking a customer up in
# these dicts replaces a full scan of the transactions frame per customer,
# which is what dominated the runtime of this loop.
train_purchases_by_customer = train_data.groupby('customer_id')['article_id'].unique().to_dict()
test_purchases_by_customer = test_data.groupby('customer_id')['article_id'].unique().to_dict()
# Stand-in for customers without purchases in a window (same dtype as the
# article ids, zero length).
no_purchases = np.array([], dtype=train_data['article_id'].dtype)

# Start from a clean list so re-running this cell does not append the same
# scores a second time.
all_map_scores = []

for batch in tqdm(customer_batches, desc="Processing customer batches"):
    batch_map_scores = []
    for customer in batch:
        # Training purchases drive the recommendations
        train_purchases = train_purchases_by_customer.get(customer, no_purchases)
        # Test purchases are the ground truth
        test_purchases = test_purchases_by_customer.get(customer, no_purchases)

        recommendations = get_knn_recommendations(train_purchases, popular_items, articles,content_features, knn)

        # Pass a plain list: the metric checks membership by value, whereas
        # `in` on a pandas Series would look at its index labels.
        recommended_items = recommendations['article_id'].tolist()

        # Evaluate recommendations against test purchases
        map_score = evaluate_recommendations(test_purchases, recommended_items)
        batch_map_scores.append(map_score)

    # Extend the main list with batch results
    all_map_scores.extend(batch_map_scores)

    # Print intermediate results for monitoring
    batch_average = sum(batch_map_scores) / len(batch_map_scores) if batch_map_scores else 0
    print(f"Batch average MAP@12: {batch_average:.4f}")


Processing customer batches:   7%|▋         | 1/14 [14:21<3:06:45, 861.97s/it]

Batch average MAP@12: 0.0078


Processing customer batches:  14%|█▍        | 2/14 [28:54<2:53:38, 868.23s/it]

Batch average MAP@12: 0.0066


Processing customer batches:  21%|██▏       | 3/14 [43:03<2:37:34, 859.50s/it]

Batch average MAP@12: 0.0076


Processing customer batches:  29%|██▊       | 4/14 [57:01<2:21:49, 850.97s/it]

Batch average MAP@12: 0.0073


Processing customer batches:  36%|███▌      | 5/14 [1:10:29<2:05:20, 835.59s/it]

Batch average MAP@12: 0.0067


Processing customer batches:  43%|████▎     | 6/14 [1:23:54<1:49:59, 824.90s/it]

Batch average MAP@12: 0.0071


Processing customer batches:  50%|█████     | 7/14 [1:36:57<1:34:39, 811.40s/it]

Batch average MAP@12: 0.0061


Processing customer batches:  57%|█████▋    | 8/14 [1:49:35<1:19:25, 794.30s/it]

Batch average MAP@12: 0.0051


Processing customer batches:  64%|██████▍   | 9/14 [2:02:19<1:05:24, 784.88s/it]

Batch average MAP@12: 0.0067


Processing customer batches:  71%|███████▏  | 10/14 [2:14:57<51:46, 776.70s/it] 

Batch average MAP@12: 0.0051


Processing customer batches:  79%|███████▊  | 11/14 [2:27:13<38:12, 764.16s/it]

Batch average MAP@12: 0.0063


Processing customer batches:  86%|████████▌ | 12/14 [2:39:29<25:11, 755.65s/it]

Batch average MAP@12: 0.0061


Processing customer batches:  93%|█████████▎| 13/14 [2:51:44<12:29, 749.34s/it]

Batch average MAP@12: 0.0070


Processing customer batches: 100%|██████████| 14/14 [3:03:55<00:00, 788.25s/it]

Batch average MAP@12: 0.0055





In [None]:
# Mean of the per-customer scores = MAP@12 over the test set
# (reported as 0 when no customer was scored).
if all_map_scores:
    average_map12 = sum(all_map_scores) / len(all_map_scores)
else:
    average_map12 = 0
print(f"\nAverage MAP@12 score across test set: {average_map12:.4f}")


Average MAP@12 score across test set: 0.0065
