# Recommendation using matrix factorization

**Note: Because of limited RAM (an M2 MacBook Pro with 16 GB), only the transactions of 1,000 customers were used.**

The 1,000 customers were randomly sampled from the 3,000 customers with the highest total transaction counts.

## Import modules

In [143]:
import random

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds

## Load dataset

In [2]:
# Article metadata; `article_id` is used further down to look up product details.
articles = pd.read_csv("datasets/articles.csv")
# Purchase log; `customer_id` and `article_id` are the columns used below.
transaction = pd.read_csv("datasets/transactions_train.csv")
# Customer table (not referenced in the cells visible here).
customer = pd.read_csv("datasets/customers.csv")

## Choose 1000 customers and filter only transactions from those customers

In [13]:
# Rank customers by their number of transaction rows (value_counts sorts in
# descending order), keep the 3000 most active, then draw a reproducible
# random sample of 1000 of their ids.
customer1000 = (
    transaction["customer_id"]
    .value_counts()
    .iloc[:3000]
    .sample(n=1000, random_state=1234)
    .index
)

In [14]:
# Keep only the transaction rows that belong to one of the sampled customers.
transaction_filtered = transaction.loc[transaction["customer_id"].isin(customer1000)]

In [15]:
# Number of transaction rows left after filtering to the sampled customers.
len(transaction_filtered)

418739

## Create a pivot table of customer_id X article_id

Before creating the pivot table, I built a customer-to-article rating table by counting how many times each customer bought each article in their transaction history.

The dataset contains no explicit rating information. To compensate, the purchase count is used as an implicit rating, on the assumption that a customer who bought the same item many times probably liked it.

The per-customer item counts were then squared. (Note: the code cell below uses the raw counts; no squaring step appears in it.)

In [164]:
# Implicit "rating": number of transaction rows per (customer, article) pair,
# pivoted into a customer x article table with 0 for pairs never bought.
# NOTE(review): the accompanying text mentions squaring the counts, but no
# squaring is applied here -- confirm which is intended.
customer_article_df = (
    transaction_filtered[["customer_id", "article_id"]]
    .value_counts()
    # Name the counts column explicitly: pandas < 2.0 leaves the value_counts
    # result unnamed (column 0) while pandas >= 2.0 calls it "count", and the
    # pivot below needs a column called "count" either way.
    .reset_index(name="count")
    .pivot_table(
        index="customer_id", columns="article_id", values="count", fill_value=0.0
    )
)

In [165]:
# Display the customer x article count table (0 where no purchase was recorded).
customer_article_df

article_id,108775015,108775044,108775051,110065001,110065002,110065011,111565001,111586001,111593001,111609001,...,946827001,946827002,947060001,947168001,947509001,949198001,949551001,949551002,953763001,956217002
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00357b192b81fc83261a45be87f5f3d59112db7d117513c1e908e6a7021edc35,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00754012108569f9c99871720111a2b50aa7b6ebebe2a415914df8b8e5e120ff,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
0077922bc342aca5f3ad0a67198ce22779eeb89f9861b9634370a8dc14c18e79,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0083ee250b3845008465de0e938d0ed2ae4f5bfde8b56ee9b59e6619d899e332,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00e59bc10e162c83758a8ece0d6536d96fe2c7afdae9d5f97e58e16bd2a32619,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ff639548ca0e8864613d7be6f397125a9f7aa5913165f8e9fa2fe94d44b35d4b,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ff7313f354e805f6ace690ddaa753d302341dd810a4a59c79f08211bd498cbc6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ffc247b933f175b37fccbb4f71c0479d6625e703b36f637be643afc224a8977f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ffe6376eb6b854d842e5a7714ea758de127f086a60d67d5cf425ef20361acea1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [166]:
# NumPy array of the pivot table values; rows are customers, columns are articles.
customer_article_mat = customer_article_df.to_numpy()

In [167]:
# (number of customers, number of distinct articles)
customer_article_mat.shape

(1000, 56415)

## Do a Truncated Singular Value Decomposition

In [168]:
# Centre every customer's row on its own mean (taken over all articles, kept
# as a column vector so it broadcasts across the row).
mean_per_user = customer_article_mat.mean(axis=1, keepdims=True)
mat_del_mean = customer_article_mat - mean_per_user

# Truncated SVD keeping 100 singular triplets; the 1-D array of singular values
# is expanded into a diagonal matrix so the factors can be multiplied back.
U, sigma, Vt = svds(mat_del_mean, k=100)
sigma = np.diagflat(sigma)

In [169]:
# (number of customers, k): one row of latent factors per customer.
U.shape

(1000, 100)

In [170]:
# (k, k): singular values laid out on the diagonal by np.diag.
sigma.shape

(100, 100)

In [171]:
# (k, number of articles): one column of latent factors per article.
Vt.shape

(100, 56415)

## Predict the User x Item transaction

In [172]:
# Multiply the truncated factors back together and add each customer's mean
# again, undoing the centring applied before the decomposition.
predicted_user_trans_mat = U @ sigma @ Vt + mean_per_user

In [173]:
# Same shape as the original customer x article matrix.
predicted_user_trans_mat.shape

(1000, 56415)

In [174]:
# Re-attach the customer ids (rows) and article ids (columns) from the pivot
# table so predicted scores can be looked up by label.
predicted_user_trans_df = pd.DataFrame(
    predicted_user_trans_mat,
    index=customer_article_df.index,
    columns=customer_article_df.columns,
)

## Create a simple recommendation function

Used the prediction dataframe from above

This function recommends items that the user has not bought yet. Recommendations can only be produced for the 1,000 sampled customers, and only articles bought by at least one of those customers can appear in the recommendation list.

In [200]:
def print_item_info(item_no, detail_output=False):
    """Print a one-line description of an article.

    The article is looked up by ``article_id`` in the module-level
    ``articles`` DataFrame.

    Args:
        item_no: Value matched against ``articles["article_id"]``.
        detail_output: If true, print the long form, which additionally
            includes the section name and the detail description.

    Returns:
        None. The description is written to stdout.
    """
    columns_to_print = [
        "prod_name",
        "product_type_name",
        "graphical_appearance_name",
        "colour_group_name",
        "section_name",
        "detail_desc",
    ]

    matches = articles.loc[articles["article_id"].eq(item_no), columns_to_print]

    if matches.empty:
        # squeeze() on zero rows would hand back an empty DataFrame and the
        # f-strings below would print empty-Series reprs; report it instead.
        print(f"- (no article information found for {item_no})")
        return

    # Take the first matching row explicitly: squeeze() only yields a Series
    # when exactly one row matches.
    item_info = matches.iloc[0]

    if detail_output:
        print(
            f'- "{item_info["prod_name"]}" which is a '
            f'"{item_info["product_type_name"]}" with a color pattern of '
            f'"{item_info["graphical_appearance_name"]} '
            f'{item_info["colour_group_name"]}", it is a '
            f'"{item_info["section_name"]}" and a detail description is '
            f'"{item_info["detail_desc"]}"'
        )
    else:
        print(
            f'- {item_info["prod_name"]} : '
            f'{item_info["graphical_appearance_name"]} '
            f'{item_info["colour_group_name"]} '
            f'{item_info["product_type_name"]}'
        )


def recommend_item_to_user(customer_id, detail_output=False, top_n=10):
    """Print a customer's purchase history and their top predicted new articles.

    Args:
        customer_id: Row label in ``predicted_user_trans_df``.
        detail_output: Forwarded to ``print_item_info`` to select the long
            or short description format.
        top_n: Maximum number of recommended articles to print. Expected to
            be a non-negative integer; defaults to 10.

    Returns:
        None. All results are written to stdout.

    Raises:
        AssertionError: If ``customer_id`` is not in the prediction DataFrame.
    """
    # Raised explicitly rather than through an `assert` statement so the check
    # is still enforced when Python runs with -O (same exception and message).
    if customer_id not in predicted_user_trans_df.index:
        raise AssertionError("Customer id not in prediction DataFrame.")

    bought_items = transaction.loc[
        transaction["customer_id"].eq(customer_id), "article_id"
    ].unique()

    print("<Bought products info>")
    for item in bought_items:
        print_item_info(item, detail_output)

    scores = predicted_user_trans_df.loc[customer_id]
    # Drop articles the customer has already bought, then keep the top_n
    # articles with the highest predicted score.
    not_yet_bought = scores[~scores.index.isin(bought_items)]
    top_items = not_yet_bought.sort_values(ascending=False).head(top_n)

    if top_items.empty:
        print("There is no items to recommend")
        return

    print("\n\n<Recommended Items>")
    for item in top_items.index:
        print_item_info(item, detail_output)

In [201]:
# Run the recommender for one randomly picked customer (unseeded, so the pick
# changes between runs). The function only prints and returns None, so `test`
# is None.
test = recommend_item_to_user(random.choice(predicted_user_trans_df.index))

<Bought products info>
- Shaping Skinny R.W. : Denim Grey Trousers
- Artic : Melange Yellow Sweater
- Honey : Colour blocking Black Leggings/Tights
- DORIS CREW : Sequin Black Sweater
- BASIC HOOD : Melange Light Pink Hoodie
- DORIS CREW : Embroidery Grey Sweater
- Eclipse Hood : Colour blocking Dark Blue Hoodie
- EUCALYPTUS TM 19.99 : Colour blocking Dark Blue Sweater
- HOLLY : Colour blocking Black Leggings/Tights
- Perfect hood : Colour blocking Light Pink Sweater
- HERBAL HOOD TVP : Placement print Light Grey Hoodie
- HERBAL HOOD TVP : Placement print Dark Blue Hoodie
- Milk RW slacks : Mixed solid/pattern Black Trousers
- Greyworm tapered trouser RW : Solid Black Trousers
- SPEED WANDA cropped ls : Solid White Top
- Russel hood : Front print Black Sweater
- SUZY HALF ZIP LOW LOW : Front print Yellow Top
- GWEN ss tee : Solid Yellow T-shirt
- Lolly skirt velour : Other structure Dark Blue Skirt
- LARISSA hood : Solid White Hoodie
- GWEN ss tee : Solid Dark Blue T-shirt
- TUXEDO pan