In [33]:
!pip install implicit



In [34]:
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'
import pandas as pd
import numpy as np
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
import time
from tqdm import tqdm

In [35]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
# change current directory
os.chdir("/content/drive/MyDrive/H&M_Personalized_Recommendations")

In [37]:
# Load the articles data
articles = pd.read_csv('data/articles.csv')

In [38]:
# Load the customers data
customers = pd.read_csv('data/customers.csv')

In [39]:
# Load the transactions data
transactions = pd.read_csv('data/transactions_train.csv')

## Implicit Alternating Least Squares

### Helper Functions

In [40]:
def create_csr_matrix(customers_df, articles_df, transactions_df):

    # Get unique users and items
    Users = customers_df.customer_id.unique().tolist()
    Items = articles_df.article_id.unique().tolist()

    # Create mappings
    user_ids = dict(list(enumerate(Users)))  # index to ID
    item_ids = dict(list(enumerate(Items)))  # index to ID

    # Create reverse mappings (ID to index)
    user_map = {uid: idx for idx, uid in user_ids.items()}
    item_map = {iid: idx for idx, iid in item_ids.items()}

    # Create matrix coordinates and data
    row = np.array([user_map[uid] for uid in transactions_df['customer_id']])
    col = np.array([item_map[iid] for iid in transactions_df['article_id']])
    data = np.ones(len(transactions_df))

    # Create sparse matrix
    csr = csr_matrix(
        (data, (row, col)),
        shape=(len(user_map), len(item_map))
    )


    return csr, user_map, item_map


In [41]:
def train_als_model(customers_df, articles_df, transactions_df):
    # Create interaction matrix
    csr, user_map, item_map = create_csr_matrix(
        customers_df,
        articles_df,
        transactions_df
    )

    # Initialize and train model
    model = AlternatingLeastSquares(
        factors=800,
        iterations=30,
        regularization=0.005,
        alpha=60
    )

    model.fit(csr)

    return model, csr, user_map, item_map

In [42]:
def get_recommendations(model, csr, user_id, user_map, item_map, n_items=12):
    # Convert user_id to matrix index
    user_idx = user_map[user_id]

    # Get recommendations
    item_indices, scores = model.recommend(
        user_idx,
        csr[user_idx],
        N=n_items,

    )

    # Create reverse mapping of item_map
    item_ids = {idx: item_id for item_id, idx in item_map.items()}

    # Convert indices to article_ids
    recommended_articles = [item_ids[idx] for idx in item_indices]

    return recommended_articles

In [43]:
def evaluate_map12(true_items, recommended_items):
    """
    Calculate MAP@12
    """
    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(recommended_items, 1):
        if p in true_items and p not in recommended_items[:i-1]:
            num_hits += 1.0
            score += num_hits / i

    return score / min(len(true_items), 12)

### Implement

In [44]:
train_start_date = '2020-08-11'
train_end_date = '2020-09-15'
train_data = transactions[(transactions['t_dat'] > train_start_date) &
        (transactions['t_dat'] <= train_end_date)
    ]
test_data = transactions[transactions['t_dat'] > train_end_date]

In [45]:
model, csr, user_map, item_map = train_als_model(customers, articles, train_data)

  0%|          | 0/30 [00:00<?, ?it/s]

#### Get recommendations for existing customer

In [46]:
# Get unique customers from both sets
train_customers = set(train_data['customer_id'].unique())
test_customers = set(test_data['customer_id'].unique())

# Find common customers
common_customers = list(train_customers.intersection(test_customers))

In [47]:
print(common_customers)

Output hidden; open in https://colab.research.google.com to view.

In [48]:
# Recommended items
rec_items = get_recommendations(
    model, csr, '981aa7853b12c9a103d4bd269217ac4bdebd2f48b9b7b5baac6f9d8643d712b3', user_map, item_map
)

for id in rec_items:
    print(id, articles[articles['article_id'] == id]['product_type_name'].values[0])

880312003 T-shirt
880312005 T-shirt
881691003 Trousers
916468002 Cardigan
874113004 Sweater
878499013 T-shirt
937915003 Sweater
894668003 Trousers
873217001 Sweater
860334001 T-shirt
880099002 T-shirt
902388002 Top


In [49]:
#purchases made during training period
pur_items = train_data[train_data['customer_id'] == '981aa7853b12c9a103d4bd269217ac4bdebd2f48b9b7b5baac6f9d8643d712b3']['article_id'].tolist()

for id in pur_items:
    print(id, articles[articles['article_id'] == id]['product_type_name'].values[0])

880312004 T-shirt


In [50]:
#actual purchases in test period
pur_items = test_data[test_data['customer_id'] == '981aa7853b12c9a103d4bd269217ac4bdebd2f48b9b7b5baac6f9d8643d712b3']['article_id'].tolist()

for id in pur_items:
    print(id, articles[articles['article_id'] == id]['product_type_name'].values[0])

547780040 Trousers
853589004 T-shirt
818754007 Top
806388018 T-shirt
855239003 Sweater
806388018 T-shirt
759814020 Trousers


#### Get recommendations for new customer

In [51]:
# Get unique customers from both sets
train_customers = set(train_data['customer_id'].unique())
test_customers = set(test_data['customer_id'].unique())

# Find customers that are in test but not in train
new_customers = list(test_customers - train_customers)

In [52]:
print(new_customers)

Output hidden; open in https://colab.research.google.com to view.

In [53]:
rec_items = get_recommendations(
    model, csr, '4e86b6c5680e507e614cddc7ded07e8a5fc41aa7c725a2852c9590d3df1834bf', user_map, item_map
)

for id in rec_items:
    print(id, articles[articles['article_id'] == id]['product_type_name'].values[0])

112679048 Sweater
111609001 Underwear Tights
111593001 Underwear Tights
111586001 Leggings/Tights
111565003 Socks
111565001 Underwear Tights
110065011 Bra
110065002 Bra
110065001 Bra
108775051 Vest top
108775044 Vest top
108775015 Vest top


In [54]:
#actual purchases in test period
pur_items = test_data[test_data['customer_id'] == '4e86b6c5680e507e614cddc7ded07e8a5fc41aa7c725a2852c9590d3df1834bf']['article_id'].tolist()

for id in pur_items:
    print(id, articles[articles['article_id'] == id]['product_type_name'].values[0])

673677027 Sweater


#### Evaluate on Test set

In [55]:
customers = test_data['customer_id'].unique()
batch_size=10000
# Create batches
n_batches = len(customers) // batch_size + (1 if len(customers) % batch_size != 0 else 0)
customer_batches = np.array_split(customers, n_batches)

all_map_scores = []

In [56]:
for batch in tqdm(customer_batches, desc="Processing customer batches"):
        batch_map_scores = []

        for customer in batch:
            # Get recommendations
            rec_articles = get_recommendations(
                model, csr, customer, user_map, item_map
            )

            if len(rec_articles) > 0:  # if we got recommendations
                # Get actual purchases
                true_items = test_data[test_data['customer_id'] == customer]['article_id'].tolist()

                # Calculate MAP@12
                if true_items:  # if customer made purchases in test period
                    map_score = evaluate_map12(true_items, rec_articles)
                    batch_map_scores.append(map_score)

        # Extend the main list with batch results
        all_map_scores.extend(batch_map_scores)

        # Print intermediate results for monitoring
        batch_average = sum(batch_map_scores) / len(batch_map_scores) if batch_map_scores else 0
        print(f"Batch average MAP@12: {batch_average:.4f}")

Processing customer batches:  14%|█▍        | 1/7 [07:12<43:17, 432.88s/it]

Batch average MAP@12: 0.0089


Processing customer batches:  29%|██▊       | 2/7 [14:19<35:45, 429.18s/it]

Batch average MAP@12: 0.0092


Processing customer batches:  43%|████▎     | 3/7 [21:32<28:44, 431.14s/it]

Batch average MAP@12: 0.0077


Processing customer batches:  57%|█████▋    | 4/7 [28:40<21:29, 429.74s/it]

Batch average MAP@12: 0.0069


Processing customer batches:  71%|███████▏  | 5/7 [35:47<14:17, 428.84s/it]

Batch average MAP@12: 0.0065


Processing customer batches:  86%|████████▌ | 6/7 [42:46<07:05, 425.42s/it]

Batch average MAP@12: 0.0057


Processing customer batches: 100%|██████████| 7/7 [49:57<00:00, 428.26s/it]

Batch average MAP@12: 0.0054





In [57]:
# Calculate final average MAP@12
average_map12 = sum(all_map_scores) / len(all_map_scores) if all_map_scores else 0
print(f"\nAverage MAP@12 score across test set: {average_map12:.4f}")


Average MAP@12 score across test set: 0.0072
