# Recommendation using content based filtering

**Note: Because of limited RAM (an M2 MacBook Pro with 16 GB of RAM), the features and the items used for content-based filtering are reduced.**

## Import modules

In [1]:
import re

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

## Load dataset

In [115]:
# Article catalogue: one row per article_id with its categorical attributes and description.
articles = pd.read_csv("datasets/articles.csv")
# Purchase log; used below for per-article purchase counts and per-customer lookups.
transaction = pd.read_csv("datasets/transactions_train.csv")
# Customer table; a customer_id is sampled from it when checking the recommender.
customer = pd.read_csv("datasets/customers.csv")

In [3]:
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [4]:
articles.columns

Index(['article_id', 'product_code', 'prod_name', 'product_type_no',
       'product_type_name', 'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc'],
      dtype='object')

### Due to limited RAM, use only the 5,000 articles with the most transactions

In [5]:
top5000 = transaction["article_id"].value_counts().head(5000).index

In [6]:
top5000

Index([706016001, 706016002, 372860001, 610776002, 759871002, 464297007,
       372860002, 610776001, 399223001, 706016003,
       ...
       632307011, 775825001, 838321001, 717490081, 695420001, 580482011,
       786657003, 657165002, 822115001, 693479002],
      dtype='int64', name='article_id', length=5000)

In [7]:
top5000_articles = articles[articles["article_id"].isin(top5000)]

## Choose which columns to use as a feature of each item

In [8]:
# Categorical article attributes that are one-hot encoded below as item features.
columns_used = [
    "product_type_name",
    "product_group_name",
    "graphical_appearance_name",
    "colour_group_name",
    "perceived_colour_value_name",
    "department_name",
    "index_name",
    "index_group_name",
    "section_name",
    "garment_group_name",
]

No NA values among these columns

In [9]:
top5000_articles[columns_used].isna().sum()

product_type_name              0
product_group_name             0
graphical_appearance_name      0
colour_group_name              0
perceived_colour_value_name    0
department_name                0
index_name                     0
index_group_name               0
section_name                   0
garment_group_name             0
dtype: int64

In [10]:
# Cardinality of every candidate feature column.
# Note: this is measured on the full `articles` catalogue, not on the
# `top5000_articles` subset that is actually encoded further down.
unique_counts = articles[columns_used].nunique()  # NaN is excluded by default
for column, n_unique in unique_counts.items():
    print(f"Unique value count on column {column} : {n_unique}")

Unique value count on column product_type_name : 131
Unique value count on column product_group_name : 19
Unique value count on column graphical_appearance_name : 30
Unique value count on column colour_group_name : 50
Unique value count on column perceived_colour_value_name : 8
Unique value count on column department_name : 250
Unique value count on column index_name : 10
Unique value count on column index_group_name : 5
Unique value count on column section_name : 56
Unique value count on column garment_group_name : 21


## Create a One hot encoding Dataframe

In [11]:
# Start from the article ids and append one indicator block per categorical feature.
onehot_df = top5000_articles.loc[:, ["article_id"]]

for column in columns_used:
    # Suffix each indicator with its source column so that equal values coming
    # from different columns (e.g. "Trousers") stay distinct features.
    indicators = pd.get_dummies(top5000_articles[column]).add_suffix("__" + column)
    onehot_df = onehot_df.merge(
        indicators,
        left_index=True,
        right_index=True,
        suffixes=("_onehot", f"_{column}"),
    )

## Check whether a detail desc can be used as a feature column

In [12]:
top5000_articles["detail_desc"].isna().sum()

19

In [13]:
# Count how often each whitespace-separated token occurs over all descriptions.
# Missing descriptions are skipped.
word_dict = {}
for description in top5000_articles["detail_desc"].dropna():
    # Drop a "." or "," that directly follows a letter.
    without_punct = re.sub(r"([a-zA-Z])([\.\,])", r"\1", description)
    for word in without_punct.split():
        word_dict[word] = word_dict.get(word, 0) + 1

In [14]:
pd.Series(word_dict).sort_values(ascending=False).head(50)

and            7042
a              6741
with           6611
the            5716
in             4477
at             3717
back           2171
waist          1845
sleeves        1409
top            1362
front          1272
hem            1229
soft           1096
jersey         1085
pockets         967
cuffs           923
straps          823
long            767
shoulder        760
legs            742
cotton          725
fabric          717
weave           694
V-neck          669
an              668
wide            650
lined           641
zip             624
fly             576
neckline        554
narrow          553
Short           552
button          548
buttons         546
bikini          539
that            531
cups            525
side            516
of              511
adjustable      511
viscose         511
elasticated     489
ribbing         489
denim           477
stretch         476
dress           470
high            459
short           455
concealed       452
bust            442


Using the detail desc after removing the stop words may be a good option

Also, there are some NA values, meaning that the detail-description features of those items will all be 0

We will use cosine similarity between items, which compares only the angle between their feature vectors. So we will only record the occurrence of a word (1 if it is present, 0 otherwise)

**Note: Because of limited RAM, only the top 500 words are used**

## Check words after removing the stop words

In [15]:
# Load the stop-word list (one word per line).
# Strip whitespace rather than slicing off the last character: `word[:-1]`
# truncates the final word when the file has no trailing newline and leaves a
# stray "\r" on files with Windows line endings. Blank lines are skipped.
with open("stopwords.txt", encoding="utf8") as f:
    stop_words = [line.strip() for line in f if line.strip()]

In [16]:
# Drop every stop word from the vocabulary counts; words that never occurred are ignored.
for stop_word in stop_words:
    word_dict.pop(stop_word, None)

# The 500 most frequent remaining tokens.
pd.Series(word_dict).sort_values(ascending=False).head(500)

back        2171
waist       1845
sleeves     1409
top         1362
front       1272
            ... 
Buttons       12
Playsuit      12
frayed        12
outer         12
Diagonal      12
Length: 500, dtype: int64

## Create a one hot encoding DataFrame using detail desc

In [17]:
top500_words = pd.Series(word_dict).sort_values(ascending=False).head(500).index

In [18]:
top500_words

Index(['back', 'waist', 'sleeves', 'top', 'front', 'hem', 'soft', 'jersey',
       'pockets', 'cuffs',
       ...
       'Zipped', 'peplum', 'showcase', 'round-necked', '20', 'Buttons',
       'Playsuit', 'frayed', 'outer', 'Diagonal'],
      dtype='object', length=500)

In [19]:
len(top500_words)

500

In [20]:
detail_one_hot = pd.DataFrame(0, columns=top500_words, index=top5000_articles.index)

In [21]:
# Flag, per article, which of the selected top words occur in its description.
# Articles without a description keep their all-zero row.
for article_idx, description in top5000_articles["detail_desc"].dropna().items():
    # Same punctuation clean-up as used when the vocabulary was counted.
    without_punct = re.sub(r"([a-zA-Z])([\.\,])", r"\1", description)
    for word in without_punct.split():
        if word in top500_words:
            detail_one_hot.loc[article_idx, word] = 1

In [22]:
detail_one_hot.head()

Unnamed: 0,back,waist,sleeves,top,front,hem,soft,jersey,pockets,cuffs,...,Zipped,peplum,showcase,round-necked,20,Buttons,Playsuit,frayed,outer,Diagonal
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## merge two one hot DataFrame

In [23]:
onehot_df = onehot_df.merge(detail_one_hot, left_index=True, right_index=True)

In [24]:
onehot_df.set_index("article_id", inplace=True)

In [25]:
onehot_df.head()

Unnamed: 0_level_0,Bag__product_type_name,Ballerinas__product_type_name,Beanie__product_type_name,Belt__product_type_name,Bikini top__product_type_name,Blazer__product_type_name,Blouse__product_type_name,Bodysuit__product_type_name,Boots__product_type_name,Bra__product_type_name,...,Zipped,peplum,showcase,round-necked,20,Buttons,Playsuit,frayed,outer,Diagonal
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
108775015,False,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
108775044,False,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
111565001,False,False,False,False,False,False,False,False,False,False,...,0,0,0,0,1,0,0,0,0,0
111586001,False,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
111593001,False,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0


In [26]:
onehot_df.shape

(5000, 829)

## Calculate Cosine Similarity

In [33]:
# Pairwise cosine similarity between all article feature vectors.
# The row/column labels are taken from `onehot_df` itself (its index is
# `article_id`), so they are guaranteed to match the row order of the matrix
# that is actually being compared.
article_ids = onehot_df.index.to_list()
similarity = pd.DataFrame(
    cosine_similarity(onehot_df),
    columns=article_ids,
    index=article_ids,
)

In [34]:
similarity.head()

Unnamed: 0,108775015,108775044,111565001,111586001,111593001,111609001,120129001,123173001,129085001,146730001,...,915526002,915529001,915529003,916468003,918292001,918292004,918292011,918522001,918547001,924243001
108775015,1.0,0.866667,0.322749,0.266667,0.25049,0.298142,0.269191,0.313112,0.516398,0.276026,...,0.34641,0.235702,0.282843,0.198762,0.092748,0.0,0.0,0.243432,0.110096,0.198762
108775044,0.866667,1.0,0.193649,0.133333,0.125245,0.149071,0.161515,0.187867,0.387298,0.138013,...,0.23094,0.188562,0.188562,0.248452,0.0,0.0,0.092748,0.30429,0.0,0.198762
111565001,0.322749,0.193649,1.0,0.580948,0.606339,0.793857,0.260643,0.606339,0.25,0.601338,...,0.223607,0.136931,0.182574,0.096225,0.134704,0.044901,0.044901,0.117851,0.1066,0.096225
111586001,0.266667,0.133333,0.580948,1.0,0.500979,0.67082,0.323029,0.688847,0.387298,0.759072,...,0.23094,0.141421,0.188562,0.099381,0.185496,0.092748,0.092748,0.121716,0.165145,0.099381
111593001,0.25049,0.125245,0.606339,0.500979,1.0,0.770154,0.303433,0.647059,0.30317,0.583383,...,0.21693,0.132842,0.177123,0.093352,0.174243,0.087121,0.087121,0.114332,0.155126,0.093352


## Checking the result

### Top 10 similarity

In [51]:
similarity.loc[706016001].sort_values(ascending=False).head(10)

706016025    1.00
539723001    1.00
539723042    1.00
706016001    1.00
539723003    0.96
673901001    0.96
706016019    0.96
706016006    0.96
706016015    0.96
539723038    0.96
Name: 706016001, dtype: float64

In [50]:
articles[
    articles["article_id"].isin(
        similarity.loc[706016001].sort_values(ascending=False).head(10).index
    )
][columns_used + ["detail_desc"]]

Unnamed: 0,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,department_name,index_name,index_group_name,section_name,garment_group_name,detail_desc
10285,Trousers,Garment Lower body,Solid,Black,Dark,Trousers,Divided,Divided,Divided Collection,Trousers,High-waisted jeans in washed superstretch deni...
10287,Trousers,Garment Lower body,Solid,Dark Grey,Dark,Trousers,Divided,Divided,Divided Collection,Trousers,High-waisted jeans in washed superstretch deni...
10300,Trousers,Garment Lower body,Solid,Greenish Khaki,Dark,Trousers,Divided,Divided,Divided Collection,Trousers,High-waisted jeans in washed superstretch deni...
10303,Trousers,Garment Lower body,Solid,Black,Dark,Trousers,Divided,Divided,Divided Collection,Trousers,High-waisted jeans in washed superstretch deni...
42718,Trousers,Garment Lower body,Solid,Black,Dark,Trousers,Divided,Divided,Divided Collection,Trousers,High-waisted jeans in washed superstretch deni...
53892,Trousers,Garment Lower body,Solid,Black,Dark,Trousers,Divided,Divided,Divided Collection,Trousers,High-waisted jeans in washed superstretch deni...
53896,Trousers,Garment Lower body,Solid,Dark Blue,Dark,Trousers,Divided,Divided,Divided Collection,Trousers,High-waisted jeans in washed superstretch deni...
53902,Trousers,Garment Lower body,Solid,Dark Grey,Dark,Trousers,Divided,Divided,Divided Collection,Trousers,High-waisted jeans in washed superstretch deni...
53904,Trousers,Garment Lower body,Solid,Dark Blue,Dark,Trousers,Divided,Divided,Divided Collection,Trousers,High-waisted jeans in washed superstretch deni...
53907,Trousers,Garment Lower body,Solid,Black,Dark,Trousers,Divided,Divided,Divided Collection,Trousers,High-waisted jeans in washed superstretch deni...


Because there are many clothes with the same design but a different pattern or color, many of the high-similarity items have a lot in common

To reduce the redundant items that differ only in color or graphics, high-similarity items should be removed. This may also remove some items that are genuinely different but happen to have the same similarity. For clothes, however, that is unlikely to be the case

### Items with similarity of at most 0.8

In [60]:
sims = similarity.loc[706016001].sort_values(ascending=False)
sims_filtered = sims[sims.le(0.8)]

In [65]:
sims_filtered

621381009    0.775672
621381001    0.775672
743626001    0.734847
589222001    0.734847
554450001    0.734130
               ...   
828114002    0.000000
763280008    0.000000
763280006    0.000000
723353002    0.000000
690713010    0.000000
Name: 706016001, Length: 4969, dtype: float64

In [62]:
articles[articles["article_id"].isin(sims_filtered.head(10).index)][
    columns_used + ["detail_desc"]
]

Unnamed: 0,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,department_name,index_name,index_group_name,section_name,garment_group_name,detail_desc
2192,Trousers,Garment Lower body,Solid,Black,Dark,Denim Trousers,Divided,Divided,Ladies Denim,Trousers Denim,"5-pocket, low-rise jeans in washed, superstret..."
12585,Trousers,Garment Lower body,Solid,Black,Dark,Trousers,Divided,Divided,Divided Collection,Trousers,5-pocket jeans in superstretch washed denim wi...
17128,Trousers,Garment Lower body,Solid,Black,Dark,Trousers,Divided,Divided,Divided Collection,Trousers,"5-pocket, ankle-length jeans in washed denim w..."
20458,Trousers,Garment Lower body,Denim,Black,Dark,Denim Trousers,Divided,Divided,Ladies Denim,Trousers Denim,High-waisted jeans in stretch denim with a zip...
27898,Trousers,Garment Lower body,Solid,Light Blue,Light,Trousers,Divided,Divided,Divided Collection,Trousers,High-waisted jeans in washed superstretch deni...
27902,Trousers,Garment Lower body,Solid,Light Blue,Dusty Light,Trousers,Divided,Divided,Divided Collection,Trousers,High-waisted jeans in washed superstretch deni...
40339,Trousers,Garment Lower body,Solid,Black,Dark,Trousers,Divided,Divided,Divided Collection,Trousers,5-pocket jeans in washed stretch denim with a ...
42708,Trousers,Garment Lower body,Solid,Black,Dark,Trousers,Divided,Divided,Divided Collection,Trousers,5-pocket jeans in washed stretch denim with a ...
64999,Trousers,Garment Lower body,Denim,Black,Dark,Denim Trousers,Divided,Divided,Ladies Denim,Trousers Denim,High-waisted jeans in stretch denim with a zip...
77253,Trousers,Garment Lower body,Solid,Light Blue,Light,Trousers,Divided,Divided,Divided Collection,Trousers,"Jeans in washed, stretch denim with a high wai..."


By lowering the similarity cutoff, we can see that the diversity of the items has increased

## Create a simple recommendation function

Using the similarity dataframe from above, I've created a function that takes a list of bought item ids as input and prints up to 10 items that the user might be interested in (sampled at random from the 30 highest-scoring candidates)

Note that I will use the transaction data; if a bought item is not present in the similarity dataframe, it will be ignored

In [94]:
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [130]:
import random


def print_item_info(item_no, detail_output=False):
    """Print a one-line, human-readable summary of an article.

    Args:
        item_no: ``article_id`` to look up in the global ``articles`` frame.
        detail_output: When True, print the long form that also includes the
            section name and the detail description; otherwise print the
            short ``name : appearance colour type`` form.

    Returns:
        None. The summary is written to stdout.
    """
    columns_to_print = [
        "prod_name",
        "product_type_name",
        "graphical_appearance_name",
        "colour_group_name",
        "section_name",
        "detail_desc",
    ]

    matches = articles.loc[articles["article_id"].eq(item_no), columns_to_print]

    # Guard against ids missing from the catalogue: squeezing an empty
    # selection yields an empty frame, which would print Series reprs
    # instead of field values.
    if matches.empty:
        print(f"- (no article info found for id {item_no})")
        return

    # Take the first match so a duplicated id still gives scalar fields.
    item_info = matches.iloc[0]

    if detail_output:
        print(
            f'- "{item_info["prod_name"]}" which is a "{item_info["product_type_name"]}" with a color pattern of "{item_info["graphical_appearance_name"]} {item_info["colour_group_name"]}", it is a "{item_info["section_name"]}" and a detail description is "{item_info["detail_desc"]}"'
        )
    else:
        print(
            f'- {item_info["prod_name"]} : {item_info["graphical_appearance_name"]} {item_info["colour_group_name"]} {item_info["product_type_name"]}'
        )


def recommend_top10(
    bought_items,
    detail_output=False,
    *,
    max_similarity=0.8,
    per_item=10,
    pool_size=30,
    n_recommend=10,
    seed=None,
):
    """Print the bought items followed by a sampled list of similar items.

    For every bought item that is present in the global ``similarity`` frame,
    the ``per_item`` most similar not-yet-bought articles whose similarity is
    at most ``max_similarity`` are collected. The similarities of an article
    proposed by several bought items are summed, so repeatedly proposed
    articles rank higher. Finally ``n_recommend`` articles are drawn at random
    from the ``pool_size`` highest-scoring candidates and printed.

    Args:
        bought_items: Iterable of article ids the customer already bought.
        detail_output: Forwarded to ``print_item_info`` (long vs. short form).
        max_similarity: Upper similarity bound (inclusive) for a candidate.
        per_item: Number of candidates taken per bought item.
        pool_size: Number of top-scoring candidates that are sampled from.
        n_recommend: Number of recommendations to print.
        seed: Optional seed for reproducible sampling; ``None`` uses the
            global ``random`` state.

    Returns:
        None. All output is printed.
    """
    scores = {}

    print("<Bought products info>")
    for bought in bought_items:
        print_item_info(bought, detail_output)
        # Items outside the similarity matrix cannot contribute candidates.
        if bought not in similarity.index:
            continue

        sims = similarity.loc[bought].sort_values(ascending=False)
        sims = sims[~sims.index.isin(bought_items)]  # never re-recommend a purchase
        candidates = sims[sims.le(max_similarity)].head(per_item)

        # A distinct loop name keeps the outer loop variable from being shadowed.
        for candidate, sim in candidates.items():
            # Accumulate similarity so candidates proposed more than once weigh more.
            scores[candidate] = scores.get(candidate, 0.0) + sim

    # Highest accumulated score first; ties keep their insertion order.
    ranked = sorted(scores, key=scores.get, reverse=True)
    pool = ranked[:pool_size]
    rng = random if seed is None else random.Random(seed)

    print("\n\n<Recommended Items>")
    # Sample from the pool rather than taking the head, for some variety.
    for candidate in rng.sample(pool, k=min(n_recommend, len(pool))):
        print_item_info(candidate, detail_output)


def recommend_product_to_customer(customer_id):
    """Print recommendations for a customer based on their purchase history.

    Looks up the customer's rows in the global ``transaction`` frame and
    passes the distinct purchased article ids to ``recommend_top10``. If the
    customer has no transactions, a message is printed instead.

    Args:
        customer_id: Value to match against ``transaction["customer_id"]``.

    Returns:
        None. All output is printed.
    """
    customer_transaction = transaction[transaction["customer_id"].eq(customer_id)]

    if customer_transaction.empty:
        print("No transaction made from the customer")
        return

    # Reuse the rows filtered above so the transaction log is scanned only once.
    recommend_top10(customer_transaction["article_id"].unique())

## Check function

In [135]:
recommend_product_to_customer(customer.sample(1)['customer_id'].iloc[0])

<Bought products info>
- Jaycee dress : All over pattern Dark Blue Dress
- Polly Dress : Solid Light Blue Dress
- Hijack Top : Solid Blue Top
- Fakir : Solid Light Beige Top
- Aster AG : Solid White Blouse
- Campari : Solid Dark Turquoise Dress
- Lee (1) : Solid White Top
- Tilly (1) : Solid White T-shirt
- Agnes LS R-neck : Solid White T-shirt
- Claudine rib t-shirt : Solid White T-shirt
- Lima SS. : Solid White T-shirt
- Claudine t-shirt : Solid White T-shirt
- Becka hoodie : Embroidery White Hoodie
- Cat cotton tee : Solid White T-shirt
- Sage frill playsuit : Solid Light Beige Trousers
- Maria : All over pattern Green Dress


<Recommended Items>
- Simba Tencel : Solid Light Beige Top
- AGNES isw 45 : Solid Black T-shirt
- Cat Tee. : Solid White T-shirt
- Jackey top : Solid Light Beige Top
- Moa tank : Solid White Vest top
- Therese tee : Solid Greenish Khaki T-shirt
- Therese tee : Solid Beige T-shirt
- Therese tee : Solid Light Orange T-shirt
- Lee longsleeve : Solid Black Top
- B