<a href="https://www.kaggle.com/code/uom23mlmsc229407x/dm-project-collaborative-filtering-229407x?scriptVersionId=152746300" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
import seaborn as sns
import datetime
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
from sklearn.metrics.pairwise import cosine_similarity

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Load the data

In [None]:
# Reference tables. `article_id` is read as a string so the identifiers are not
# coerced to integers and keep the same dtype as the `article_id` column of the
# transactions loaded in the pre-processing cell.
articles = pd.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/articles.csv",
    dtype={"article_id": str},
)
customers = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/customers.csv")

# transactions_train.csv is intentionally not read here: the pre-processing cell
# loads it (with `article_id` as a string) before it is first used, so a second
# read at this point only cost time and memory.

# Collaborative Filtering

**Pre processing**

We only care about which customers bought which articles, so the other transaction columns (price and sales channel) are dropped.

In [None]:
# Load the purchase log; article ids are kept as strings rather than parsed as numbers.
transactions_path = '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv'
transactions = pd.read_csv(transactions_path, dtype={'article_id': str})

# The sales channel and the price are not used further down, so discard them.
transactions = transactions.drop(columns=['sales_channel_id', 'price'])

# Every row of the log is an observed purchase, i.e. a positive example.
transactions['bought'] = 1

Drop transactions made before the start date, and drop articles that were sold n times or fewer in the remaining period (n = 10 in the cell below).

In [None]:
# Earliest purchase date that is kept.
start_date = datetime.datetime(2020, 9, 1)

# Parse the date column, then discard every purchase made before `start_date`.
transactions["t_dat"] = pd.to_datetime(transactions["t_dat"])
is_recent = transactions["t_dat"] >= start_date
transactions = transactions.loc[is_recent]

# Number of remaining purchases per article: one row per article_id, with the
# tally stored in a column named 'count'.
article_bought_count = (
    transactions
    .groupby('article_id')['t_dat']
    .count()
    .rename('count')
    .reset_index()
)

# Keep only the articles that were bought more than 10 times in that window.
is_popular = article_bought_count['count'] > 10
most_bought_articles = article_bought_count.loc[is_popular, 'article_id'].values
transactions = transactions[transactions['article_id'].isin(most_bought_articles)]

In [None]:
# Generate negative samples
np.random.seed(42)

# One random (article, customer) pair is drawn per observed transaction and
# labelled as "not bought". The pairs are drawn independently of each other and
# are not checked against the real purchases.
n_negative = transactions.shape[0]
candidate_articles = transactions.article_id.unique()
candidate_customers = transactions.customer_id.unique()

# Articles are drawn before customers so the seeded random stream is consumed
# in a fixed order.
sampled_articles = np.random.choice(candidate_articles, n_negative)
sampled_customers = np.random.choice(candidate_customers, n_negative)

negative_samples = pd.DataFrame({
    'article_id': sampled_articles,
    'customer_id': sampled_customers,
    'bought': np.zeros(n_negative),
})

**Train customer and article vector representations from whether or not a customer bought an article**

Variables

In [None]:
# Hyper-parameters

# Length of the latent vector that represents each customer and each article
num_components = 1000

# SGD step size
learning_rate = 0.001

# L2 regularisation strength applied to both latent vectors
lmbda = 0.1

# Number of passes over the training samples
n_epochs = 20

# Keep the observed purchases under their own name: the prediction cell needs
# them without the sampled negatives. Filtering on `bought` (rather than taking
# `transactions` as-is) keeps this cell idempotent, so re-running it does not
# stack a second copy of the negatives on top of the first.
positive_transactions = transactions[transactions['bought'] == 1].copy()

# Combine positive and negative samples into one training set. The negatives
# are created under the name `negative_samples`; the earlier reference to
# `negative_transactions` raised a NameError.
transactions = pd.concat([positive_transactions, negative_samples])
customers = transactions.customer_id.values
articles = transactions.article_id.values
bought = transactions.bought.values

# Sorted unique ids; the position of an id in these arrays is its row in the
# corresponding latent matrix.
unique_customers = np.unique(customers)
unique_articles = np.unique(articles)

# customer id -> row index in customers_latent_matrix
customer_id2index = {c: i for i, c in enumerate(unique_customers)}
# article id -> row index in articles_latent_matrix
article_id2index = {a: i for i, a in enumerate(unique_articles)}

# Filled with a fresh permutation at the start of every epoch
training_indices = None

n_samples = transactions.shape[0]

# Randomly initialised latent matrices, one row per unique id:
# (n_unique_customers, num_components) and (n_unique_articles, num_components)
customers_latent_matrix = np.random.normal(scale=1., size=(len(unique_customers), num_components))
articles_latent_matrix = np.random.normal(scale=1., size=(len(unique_articles), num_components))

Train

In [None]:
# Matrix factorisation trained with stochastic gradient descent: every sample
# nudges one customer vector and one article vector so that their (clipped) dot
# product moves towards the sample's `bought` label.
for epoch in range(n_epochs):
    print('Epoch: {}'.format(epoch))

    # Visit the samples in a fresh random order on every epoch
    training_indices = np.arange(n_samples)
    np.random.shuffle(training_indices)

    # tqdm renders a progress bar; nothing is printed per sample, because
    # printing inside this loop would flood the output and dominate the run time
    for idx in tqdm(training_indices):
        # Rows of the latent matrices that belong to this sample
        customer_index = customer_id2index[customers[idx]]
        article_index = article_id2index[articles[idx]]
        bought_val = bought[idx]

        # Copy the customer vector before it is updated, so that the article
        # update below is computed from the same values the error came from
        # (row indexing returns a view that would otherwise see the update)
        customer_vec = customers_latent_matrix[customer_index].copy()
        article_vec = articles_latent_matrix[article_index]

        # Prediction is the dot product of the two vectors, clipped to [0, 1]
        prediction = np.clip(np.dot(customer_vec, article_vec), 0, 1)
        error = bought_val - prediction

        # c = c + alpha * (e * a - lambda * c)
        customers_latent_matrix[customer_index] += learning_rate * \
                                (error * article_vec - lmbda * customer_vec)
        # a = a + alpha * (e * c - lambda * a), with c taken before its update
        articles_latent_matrix[article_index] += learning_rate * \
                                (error * customer_vec - lmbda * article_vec)

**Predictions**

In [None]:
# Number of articles recommended to every customer
n_recommendations = 12

# Customers that need a prediction
customers = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv').customer_id.unique()

recommendations = []

# Real purchases only: `transactions` may also hold the sampled negatives
# (bought == 0), which must not count towards popularity or purchase history.
positive_transactions = transactions[transactions['bought'] == 1].copy()

# Cosine similarity between every pair of article vectors, shape
# (n_articles, n_articles); entry (i, j) compares article i with article j.
similarity_matrix = cosine_similarity(articles_latent_matrix, articles_latent_matrix, dense_output=False)

# An article is always most similar to itself; mask the diagonal so an article
# is never returned as its own neighbour.
np.fill_diagonal(similarity_matrix, -np.inf)

# For every article, the row indices of its most similar articles, best first.
# argsort is ascending, so the tail is taken and then reversed.
similar_article_indices = np.argsort(similarity_matrix, axis=1)[:, -n_recommendations:][:, ::-1]
del similarity_matrix  # the full pairwise matrix is no longer needed

# Inverse of article_id2index: row index -> article id
index2article_id = np.empty(len(article_id2index), dtype=object)
for article_id, article_index in article_id2index.items():
    index2article_id[article_index] = article_id

# Default recommendation: time-decayed popularity. A purchase made d days before
# the reference date contributes 1/d, so recent purchases weigh more than old
# ones. d is floored at 1 so a purchase on the reference date cannot divide by zero.
reference_date = pd.Timestamp(2020, 9, 23)
days_before_reference = (reference_date - positive_transactions['t_dat']).dt.days.clip(lower=1)
positive_transactions['pop_factor'] = 1 / days_before_reference
transactions_by_article = positive_transactions[['article_id', 'pop_factor']].groupby('article_id').sum().reset_index()
default_recommendation = (
    transactions_by_article
    .sort_values(by='pop_factor', ascending=False)['article_id']
    .tolist()[:n_recommendations]
)

# Number of times each customer bought each article
transactions_by_customer = positive_transactions[['customer_id', 'article_id', 'bought']].groupby(['customer_id', 'article_id']).count().reset_index()
# customer id -> the article that customer bought most often. A dict keyed by
# customer id avoids relying on two separately built orderings lining up.
favourite_rows = transactions_by_customer.loc[transactions_by_customer.groupby('customer_id')['bought'].idxmax()]
most_bought_article = dict(zip(favourite_rows['customer_id'], favourite_rows['article_id']))

# Make predictions
for customer in tqdm(customers):
    favourite_article = most_bought_article.get(customer)
    favourite_index = article_id2index.get(favourite_article)

    if favourite_index is None:
        # No usable purchase history for this customer: fall back to the
        # popularity list
        recommendations.append(' '.join(default_recommendation))
        continue

    # Articles closest to the customer's most bought article, best first
    similar_articles = index2article_id[similar_article_indices[favourite_index]]

    # Alternate popular and similar articles. Without duplicates the first
    # n_recommendations entries are half popular and half similar; when both
    # lists share an article, the later entries fill the freed slot.
    candidates = []
    for popular_article, similar_article in zip(default_recommendation, similar_articles):
        candidates.append(popular_article)
        candidates.append(similar_article)

    # dict.fromkeys removes duplicates while keeping the first occurrence order
    unique_candidates = list(dict.fromkeys(candidates))
    recommendations.append(' '.join(unique_candidates[:n_recommendations]))

prediction_df =  pd.DataFrame({'customer_id': customers, 'prediction': recommendations})

In [None]:
# Write the predictions to the working directory, without the frame's index column.
submission_path = 'submission.csv'
prediction_df.to_csv(submission_path, index=False)

In [None]:
# Show the submission frame as this cell's output
prediction_df