### Importing the libraries

In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix # Compressed Sparse Row matrix which is used to save memory by only storing non-zero elements
from sklearn.decomposition import TruncatedSVD # dimensionality reduction technique used to reduce the number of features in the data.
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

### Loading the data

In [3]:
data = pd.read_csv('final.csv')

In [4]:
data.head()

Unnamed: 0,Date,Customer_Name,Item_Name,Vrh_No,Quantity,Price_per_unit,Amount
0,2019-01-04,customer1,Item_1,1,200,20,4000
1,2019-01-04,customer7,Item_3,10,15,33,495
2,2019-01-04,customer7,Item_1,10,50,20,1000
3,2019-01-04,customer8,Item_11,11,80,30,2400
4,2019-01-04,customer8,Item_7,11,624,21,13416


### Creating a pivot table to represent the user-item matrix

In [5]:
# The resulting matrix represents the quantity of items bought by each customer. Any missing or NaN values are replaced with 0.
user_item_matrix = data.pivot_table(index='Customer_Name', columns='Item_Name', values='Quantity', fill_value=0)
user_item_matrix

Item_Name,Item_1,Item_10,Item_11,Item_12,Item_13,Item_14,Item_15,Item_16,Item_17,Item_18,...,Item_70,Item_71,Item_72,Item_73,Item_74,Item_75,Item_76,Item_77,Item_8,Item_9
Customer_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
customer1,125.0,0.0,0,0,0,0,0.0,0,0,0.000000,...,0,0,0,0,0.0,0.0,0,0.0,0.0,0.0
customer10,850.0,0.0,0,0,30,19,12.0,44,24,41.000000,...,0,0,0,0,0.0,0.0,0,0.0,0.0,0.0
customer100,0.0,0.0,0,0,0,0,0.0,12,0,0.000000,...,0,0,0,0,0.0,0.0,0,0.0,0.0,0.0
customer101,120.0,312.5,0,0,0,0,0.0,0,0,0.000000,...,0,0,0,0,0.0,0.0,0,0.0,0.0,0.0
customer102,0.0,0.0,0,0,0,0,0.0,0,0,0.000000,...,0,0,0,0,0.0,0.0,0,0.0,0.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
customer95,0.0,0.0,0,0,0,0,0.0,0,0,163.666667,...,0,0,0,0,0.0,0.0,0,0.0,0.0,0.0
customer96,0.0,0.0,0,0,0,0,0.0,24,0,0.000000,...,0,0,0,0,0.0,0.0,0,0.0,0.0,0.0
customer97,540.0,0.0,0,0,0,0,0.0,0,0,0.000000,...,0,0,0,0,0.0,0.0,0,0.0,0.0,0.0
customer98,0.0,75.0,0,0,0,0,0.0,0,0,62.750000,...,0,0,0,0,0.0,0.0,0,15.0,0.0,50.0


### Converting the user-item matrix to a sparse matrix

In [6]:
# Convert the user-item matrix to a sparse matrix using the csr_matrix() function from the Scipy library.
# The sparse matrix is used to store only non-zero elements, which helps save memory space.
sparse_matrix = csr_matrix(user_item_matrix.values)

### Split the data into training and testing sets

In [7]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=10)

### Perform matrix factorization using SVD on the training data

In [8]:
#The most common method for recommendation systems often comes with Collaborating Filtering (CF) where 
#it relies on the past user and item dataset. Two popular approaches of CF are latent factor models,
#which extract features from user and item matrices and neighborhood models, which finds similarities between products or users.
#The neighborhood model is an item-oriented approach to discover the user preference based on the ratings given by the user for similar items.
#On the other hand, latent factor models such as Singular Value Decomposition (SVD) extract features and correlation from the user-item matrix. For example, when items are movies in different categories. SVD would generate factors when looking into the dimension space like action vs comedy, Hollywood vs Bollywood, or Marvel vs Disney. Mainly, we will focus on the latent factor model for the Singular Value Decomposition (SVD) approach

In [9]:
# The code first determines the number of components to be used in the TruncatedSVD function by taking the minimum value of the length of user_item_matrix columns and user_item_matrix indices, subtracting 1.
# The TruncatedSVD function is then used to model the user-item matrix with the specified number of components.
# The resulting user_factors matrix represents the user embeddings, 
# while the item_factors matrix represents the item embeddings.

n_components = min(len(user_item_matrix.columns), len(user_item_matrix.index)) - 1
model = TruncatedSVD(n_components=n_components, random_state=42)
user_factors = model.fit_transform(sparse_matrix)
item_factors = model.components_

In [10]:
user_factors

array([[ 1.33737314e+02, -6.17330217e+01,  1.23692037e+02, ...,
        -4.75869639e-17, -3.93692897e-17,  1.93730856e-18],
       [ 6.07868494e+02,  8.00407666e+01,  1.15451478e+03, ...,
         2.91635788e-16, -5.08071221e-16, -4.81950720e-16],
       [ 6.85405595e+01, -1.64948361e+01,  6.22156855e+01, ...,
        -3.10852665e-17, -3.38817378e-17, -1.40285780e-17],
       ...,
       [ 9.14633703e+02, -3.23579331e+02,  1.12583846e+03, ...,
         4.19005533e-16,  9.47852346e-16, -1.29420507e-16],
       [ 2.54084830e+03, -1.57595914e+03, -4.78457510e+02, ...,
        -1.11717364e-16, -1.73311463e-16,  1.29895348e-18],
       [ 3.93806729e+01,  6.37548770e+00,  6.37920122e+01, ...,
         5.40318030e-16,  4.51981168e-16,  1.81042457e-16]])

In [11]:
item_factors

array([[ 1.58039899e-01,  3.50701150e-02,  6.92332743e-03, ...,
         4.87182311e-03,  6.15192070e-03,  1.53159850e-01],
       [ 3.05576417e-03,  2.11123229e-02,  4.54542580e-03, ...,
         3.32205663e-03, -8.86128707e-04,  1.04481620e-01],
       [ 3.63533486e-01,  3.39108880e-02, -2.97301547e-04, ...,
         1.42762689e-02, -1.06982735e-04,  5.17687963e-01],
       ...,
       [ 0.00000000e+00,  1.19077941e-19,  4.36743581e-19, ...,
         1.42109225e-18,  2.47342038e-18, -3.84630553e-20],
       [ 0.00000000e+00, -3.46806792e-19,  2.87277528e-18, ...,
        -2.99580344e-19, -2.57157739e-18, -4.55310044e-20],
       [ 0.00000000e+00, -1.89205056e-19,  8.11185964e-19, ...,
        -6.83159361e-20,  3.98209699e-19, -2.14250206e-19]])

### Function to recommend items based on the customer's purchase history

In [28]:
# Here is how the function works:

#The index of the customer in the user_item_matrix is determined using the get_loc method of the index object.
#The customer's ratings are retrieved from the user_factors matrix by accessing the row corresponding to the customer index using indexing.
#The item scores are calculated by taking the dot product of the customer ratings and the item_factors matrix.
#The indices of the n items with the highest scores are selected using argsort() and [::-1] to sort in descending order.
#The names of the top n items are looked up from the user_item_matrix columns using the top item indices.
#The function returns the names of the recommended items as an output.
n=5
def recommend_items(customer_name, n=n):
    customer_index = user_item_matrix.index.get_loc(customer_name)
    customer_ratings = user_factors[customer_index, :]
    item_scores = np.dot(customer_ratings, item_factors)
    top_item_indices = item_scores.argsort()[::-1][:n]
    top_items = user_item_matrix.columns[top_item_indices]
    return top_items

In [29]:
customer_ratings= user_factors[86, :]
item_scores = np.dot(customer_ratings, item_factors)
top_item_indices = item_scores.argsort()[::-1][:n]
print(top_item_indices)

[76  0 22 10 11]


### Example usage

In [30]:
customer_name = 'customer5'
recommended_items = recommend_items(customer_name)
print("Recommended items for", customer_name + ":", recommended_items)

Recommended items for customer5: Index(['Item_9', 'Item_1', 'Item_3', 'Item_19', 'Item_2'], dtype='object', name='Item_Name')


### Calculating training accuracy

In [10]:
train_accuracies = []
train_customer_names = train_data['Customer_Name'].unique()

for customer_name in train_customer_names:
    customer_data = train_data[train_data['Customer_Name'] == customer_name]
    true_items = customer_data['Item_Name'].unique()
    recommended_items = recommend_items(customer_name, n)

    num_correct = len(set(true_items) & set(recommended_items))
    accuracy = num_correct / min(n, len(true_items))  # Top-N accuracy
    train_accuracies.append(accuracy)

train_average_accuracy = np.mean(train_accuracies)
print("Training Average Top-{} Accuracy: {:.2f}%".format(n, train_average_accuracy * 100))

Training Average Top-5 Accuracy: 95.77%


### Calculating testing accuracy

In [11]:
test_accuracies = []
test_customer_names = test_data['Customer_Name'].unique()

for customer_name in test_customer_names:
    customer_data = test_data[test_data['Customer_Name'] == customer_name]
    true_items = customer_data['Item_Name'].unique()
    recommended_items = recommend_items(customer_name, n)

    num_correct = len(set(true_items) & set(recommended_items))
    accuracy = num_correct / min(n, len(true_items))  # Top-N accuracy
    test_accuracies.append(accuracy)

test_average_accuracy = np.mean(test_accuracies)
print("Testing Average Top-{} Accuracy: {:.2f}%".format(n, test_average_accuracy * 100))

Testing Average Top-5 Accuracy: 84.55%


### Taking the input of customer number from the user to make final recommendations

In [31]:
customer_num = int(input("Enter the customer number: "))
if customer_num>0 and customer_num<=78:
    recommended_items = recommend_items("customer"+str(customer_num))
    print("\nRecommended items for customer"+str(customer_num) + ":\n")
    for i in recommended_items.values:
        print(i)
else:
    print('Customer number should be between 1 and 78')

Enter the customer number: 10

Recommended items for customer10:

Item_2
Item_19
Item_20
Item_1
Item_3
