In [1]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

In [2]:
transactions = pd.read_excel('InventoryAndSale_snapshot_data\Sales_snapshot_data\TT T01-2022_split_1.xlsx', dtype={'product_id':str}) # @param {type:"string"}
transactions.drop(['channel_id', 'cost_price', 'net_price'], inplace=True, axis=1)

In [3]:
transactions.info()
transactions.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81893 entries, 0 to 81892
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   month                      81893 non-null  int64 
 1   week                       81893 non-null  int64 
 2   site                       81893 non-null  int64 
 3   branch_id                  81893 non-null  int64 
 4   distribution_channel       81893 non-null  object
 5   distribution_channel_code  81893 non-null  object
 6   sold_quantity              81893 non-null  int64 
 7   customer_id                81893 non-null  object
 8   product_id                 81893 non-null  object
dtypes: int64(5), object(4)
memory usage: 5.6+ MB


Unnamed: 0,month,week,site,branch_id,distribution_channel,distribution_channel_code,sold_quantity,customer_id,product_id
0,2022001,202201,1800,1800,Online,ZF2,1,9847d4248,d77fdd34a14845db97837e059b0aca00TRG42
1,2022001,202204,1116,1100,Bán lẻ,FP,1,2384aef55,e485c0ab7b9b470cbddb80ea7367e734DEN40
2,2022001,202201,1134,1100,Bán lẻ,FP,1,20c3e0442,ac88f78262ee4b589bc93b106b67af1dDEN42
3,2022001,202204,1612,1600,Bán lẻ,FP,1,e8b42ff8f,920641c624934c4a8695347737f8f59dDEN35
4,2022001,202202,1511,1500,Bán lẻ,FP,1,b8d51499a,6764565f4bb141138af7d9cbf0905d0dHOL33


In [5]:
start_week = 202201	
# Filter transactions by date

transactions = transactions.loc[transactions["week"] >= start_week]

# Filter transactions by number of an article has been bought
products_bought_count = transactions[['product_id', 'week']].groupby('product_id').count().reset_index().rename(columns={'week': 'count'})
most_bought_products = products_bought_count[products_bought_count['count']>10]['product_id'].values
transactions = transactions[transactions['product_id'].isin(most_bought_products)]

In [6]:
np.random.seed(0)

negative_samples = pd.DataFrame({
    'product_id': np.random.choice(transactions.product_id.unique(), transactions.shape[0]),
    'customer_id': np.random.choice(transactions.customer_id.unique(), transactions.shape[0]),
    'sold_quantity': np.zeros(transactions.shape[0])
})

In [13]:
from sklearn.metrics.pairwise import cosine_similarity


class ItemBased_RecSys:
    ''' Collaborative filtering using a custom sim(u,u'). '''

    def __init__(self, positive_transactions, negative_transactions, num_components=10):
        ''' Constructor '''
        self.positive_transactions = positive_transactions
        self.transactions = pd.concat([positive_transactions, negative_transactions])
        self.customers = self.transactions.customer_id.values
        self.articles = self.transactions.product_id.values  # Change 'article_id' to 'product_id'
        self.sold_quantity = self.transactions.sold_quantity.values  # Change 'bought' to 'sold_quantity'
        self.num_components = num_components

        self.customer_id2index = {c: i for i, c in enumerate(np.unique(self.customers))}
        self.article_id2index = {a: i for i, a in enumerate(np.unique(self.articles))}

    def __sdg__(self):
        for idx in tqdm(self.training_indices):
            # Get the current sample
            customer_id = self.customers[idx]
            article_id = self.articles[idx]
            sold_quantity = self.sold_quantity[idx]  # Change 'bought' to 'sold_quantity'

            # Get the index of the user and the article
            customer_index = self.customer_id2index[customer_id]
            article_index = self.article_id2index[article_id]

            # Compute the prediction and the error
            prediction = self.predict_single(customer_index, article_index)
            error = (sold_quantity - prediction)  # Change 'bought' to 'sold_quantity'

            # Update latent factors in terms of the learning rate and the observed error
            self.customers_latent_matrix[customer_index] += self.learning_rate * \
                                                            (error * self.articles_latent_matrix[article_index] - \
                                                             self.lmbda * self.customers_latent_matrix[customer_index])
            self.articles_latent_matrix[article_index] += self.learning_rate * \
                                                           (error * self.customers_latent_matrix[customer_index] - \
                                                            self.lmbda * self.articles_latent_matrix[article_index])

    def fit(self, n_epochs=10, learning_rate=0.001, lmbda=0.1):
        ''' Compute the matrix factorization R = P x Q '''
        self.learning_rate = learning_rate
        self.lmbda = lmbda
        n_samples = self.transactions.shape[0]

        # Initialize latent matrices
        self.customers_latent_matrix = np.random.normal(scale=1., size=(len(np.unique(self.customers)),
                                                                       self.num_components))
        self.articles_latent_matrix = np.random.normal(scale=1., size=(len(np.unique(self.articles)),
                                                                      self.num_components))

        for epoch in range(n_epochs):
            print('Epoch: {}'.format(epoch))
            self.training_indices = np.arange(n_samples)

            # Shuffle training samples and follow stochastic gradient descent
            np.random.shuffle(self.training_indices)
            self.__sdg__()

    def predict_single(self, customer_index, article_index):
        ''' Make a prediction for a specific user and article '''
        prediction = np.dot(self.customers_latent_matrix[customer_index],
                           self.articles_latent_matrix[article_index])
        prediction = np.clip(prediction, 0, 1)

        return prediction

    def default_recommendation(self):
        ''' Calculate time decaying popularity '''
        # Calculate time decaying popularity. This leads to items bought more recently having more weight in the popularity list.
        # In simple words, item A bought 5 times on the first day of the train period is inferior than item B bought 4 times on the last day of the train period.
        self.positive_transactions['pop_factor'] = self.positive_transactions['week'].apply(lambda x: 202201 - x)
        transactions_by_article = self.positive_transactions[['product_id', 'pop_factor']].groupby(
            'product_id').sum().reset_index()  # Change 'article_id' to 'product_id'
        return transactions_by_article.sort_values(by='pop_factor', ascending=False)[
            'product_id'].values[:12]  # Change 'article_id' to 'product_id'

    def predict(self, customers):
        ''' Make recommendations '''
        recommendations = []

        # Compute similarity matrix (cosine)
        similarity_matrix = cosine_similarity(self.articles_latent_matrix, self.articles_latent_matrix,
                                              dense_output=False)

        # Convert similarity matrix into a matrix containing the 12 most similar items' index for each item
        similarity_matrix = np.argsort(similarity_matrix, axis=1)
        similarity_matrix = similarity_matrix[:, -12:]

        # Get default recommendation (time decay popularity)
        default_recommendation = self.default_recommendation()

        # Group articles by user and articles to compute the number of times each article has been bought by each user
        transactions_by_customer = self.positive_transactions[['customer_id', 'product_id', 'sold_quantity']].groupby(
            ['customer_id', 'product_id']).count().reset_index()  # Change 'article_id' to 'product_id' and 'bought' to 'sold_quantity'
        most_bought_article = transactions_by_customer.loc[
            transactions_by_customer.groupby('customer_id').sold_quantity.idxmax()][
            'product_id'].values  # Change 'article_id' to 'product_id' and 'bought' to 'sold_quantity'

        # Make predictions
        for customer in tqdm(customers):
            try:
                rec_aux1 = []
                rec_aux2 = []
                aux = []

                # Retrieve the most bought article by customer
                user_most_bought_article_id = most_bought_article[self.customer_id2index[customer]]

                # Using the similarity matrix, get the 6 most similar articles
                rec_aux1 = self.articles[similarity_matrix[self.article_id2index[user_most_bought_article_id]]]
                # Return the half of the default recommendation
                rec_aux2 = default_recommendation

                # Merge half of both recommendation lists
                for rec_idx in range(6):
                    aux.append(rec_aux2[rec_idx])
                    aux.append(rec_aux1[rec_idx])

                recommendations.append(' '.join(aux))
            except:
                # Return the default recommendation
                recommendations.append(' '.join(default_recommendation))

        return pd.DataFrame({
            'customer_id': customers,
            'prediction': recommendations,
        })


In [14]:
rec = ItemBased_RecSys(transactions, negative_samples, num_components=1000)
rec.fit(n_epochs=20)

Epoch: 0


0it [00:00, ?it/s]


Epoch: 1


0it [00:00, ?it/s]


Epoch: 2


0it [00:00, ?it/s]


Epoch: 3


0it [00:00, ?it/s]


Epoch: 4


0it [00:00, ?it/s]


Epoch: 5


0it [00:00, ?it/s]


Epoch: 6


0it [00:00, ?it/s]


Epoch: 7


0it [00:00, ?it/s]


Epoch: 8


0it [00:00, ?it/s]


Epoch: 9


0it [00:00, ?it/s]


Epoch: 10


0it [00:00, ?it/s]


Epoch: 11


0it [00:00, ?it/s]


Epoch: 12


0it [00:00, ?it/s]


Epoch: 13


0it [00:00, ?it/s]


Epoch: 14


0it [00:00, ?it/s]


Epoch: 15


0it [00:00, ?it/s]


Epoch: 16


0it [00:00, ?it/s]


Epoch: 17


0it [00:00, ?it/s]


Epoch: 18


0it [00:00, ?it/s]


Epoch: 19


0it [00:00, ?it/s]


In [15]:
customers = pd.read_excel('sales_and_inventory_mentor_data\MasterData\Distribution Channel.xlsx').customer_id.unique() # @param {type:"string"}

In [16]:
recommendations = rec.predict(customers)

ValueError: Found array with 0 sample(s) (shape=(0, 1000)) while a minimum of 1 is required by check_pairwise_arrays.