# Recommender System 

 https://developers.google.com/machine-learning/recommendation/overview/terminology

https://www.datacamp.com/tutorial/recommender-systems-python

# 1. Data Acquisition

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import os
directory_path = "C:\\FinalYearProject\\code\\code\\data"

In [2]:
df = pd.read_csv(r'C:\FinalYearProject\code\code\data\rawData\clustered.csv')

In [3]:
df.head()

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,state,...,category,sub_category,product_name,sales,quantity,discount,profit,cluster,latitude,longitude
0,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,...,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136,1,32.182598,-95.789318
1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,...,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582,1,32.182598,-95.789318
2,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,...,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714,4,34.053691,-118.242766
3,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,...,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031,3,26.122308,-80.143379
4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,...,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164,3,26.122308,-80.143379


# 2. Segregating Data

We will try to perform content based filtering for product recommendations.

In [4]:
#seperating relevant columns for recommendation
product_df = df[['product_id','category','sub_category','product_name']]
product_df

Unnamed: 0,product_id,category,sub_category,product_name
0,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase
1,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,..."
2,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...
3,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table
4,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System
...,...,...,...,...
9889,FUR-FU-10001889,Furniture,Furnishings,Ultra Door Pull Handle
9890,FUR-FU-10000747,Furniture,Furnishings,Tenex B1-RE Series Chair Mats for Low Pile Car...
9891,TEC-PH-10003645,Technology,Phones,Aastra 57i VoIP phone
9892,OFF-PA-10004041,Office Supplies,Paper,"It's Hot Message Books with Stickers, 2 3/4"" x 5"""


In [5]:
product_df['overview'] = product_df['category']+ ' '+  product_df['sub_category']+ ' '+product_df['product_name']
product_df

Unnamed: 0,product_id,category,sub_category,product_name,overview
0,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,Furniture Bookcases Bush Somerset Collection B...
1,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",Furniture Chairs Hon Deluxe Fabric Upholstered...
2,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,Office Supplies Labels Self-Adhesive Address L...
3,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,Furniture Tables Bretford CR4500 Series Slim R...
4,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,Office Supplies Storage Eldon Fold 'N Roll Car...
...,...,...,...,...,...
9889,FUR-FU-10001889,Furniture,Furnishings,Ultra Door Pull Handle,Furniture Furnishings Ultra Door Pull Handle
9890,FUR-FU-10000747,Furniture,Furnishings,Tenex B1-RE Series Chair Mats for Low Pile Car...,Furniture Furnishings Tenex B1-RE Series Chair...
9891,TEC-PH-10003645,Technology,Phones,Aastra 57i VoIP phone,Technology Phones Aastra 57i VoIP phone
9892,OFF-PA-10004041,Office Supplies,Paper,"It's Hot Message Books with Stickers, 2 3/4"" x 5""",Office Supplies Paper It's Hot Message Books w...


In [6]:
product_df = product_df.drop_duplicates()

In [7]:
product_df = product_df.drop(columns="category")

In [8]:
product_df['overview'] = product_df['overview'].apply(lambda x:x.lower())

In [9]:
product_df.head()

Unnamed: 0,product_id,sub_category,product_name,overview
0,FUR-BO-10001798,Bookcases,Bush Somerset Collection Bookcase,furniture bookcases bush somerset collection b...
1,FUR-CH-10000454,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",furniture chairs hon deluxe fabric upholstered...
2,OFF-LA-10000240,Labels,Self-Adhesive Address Labels for Typewriters b...,office supplies labels self-adhesive address l...
3,FUR-TA-10000577,Tables,Bretford CR4500 Series Slim Rectangular Table,furniture tables bretford cr4500 series slim r...
4,OFF-ST-10000760,Storage,Eldon Fold 'N Roll Cart System,office supplies storage eldon fold 'n roll car...


# 3. Vectorization

a. TF IDF

TF IDF vectorizer stands for Term Frequency- Inverse document Frequency.

https://www.datacamp.com/tutorial/recommender-systems-python :
the TF-IDF score is the frequency of a word occurring in a document, down-weighted by the number of documents in which it occurs.

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')

product_df['overview'] = product_df['overview'].fillna('')

tfidf_matrix = tfidf.fit_transform(product_df['overview'])

tfidf_matrix.shape

(1880, 2475)

Since you have used the TF-IDF vectorizer, calculating the dot product between each vector will directly give you the cosine similarity score. Therefore, you will use sklearn's linear_kernel() instead of cosine_similarities() since it is faster.

In [11]:
from sklearn.metrics.pairwise import  linear_kernel

cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)

In [12]:
cosine_sim.shape

(1880, 1880)

In [13]:
def recommend_product(productName, cosine_sim,product_df):
    idx = product_df[product_df['product_name']==productName].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x:x[1],reverse=True)
    sim_scores = sim_scores[1:11]
    product_indices = [i[0] for i in sim_scores]

    return product_df['product_name'].iloc[product_indices]

In [14]:
recommend_product("Bush Somerset Collection Bookcase",cosine_sim,product_df)

3476     Bush Birmingham Collection Bookcase, Dark Cherry
896     Bush Westfield Collection Bookcases, Fully Ass...
288     Bush Westfield Collection Bookcases, Medium Ch...
2518    Bush Westfield Collection Bookcases, Dark Cher...
3070     Bush Cubix Collection Bookcases, Fully Assembled
1100    Bush Saratoga Collection 5-Shelf Bookcase, Han...
7887    Bush Westfield Collection Bookcases, Dark Cher...
678      Bush Andora Bookcase, Maple/Graphite Gray Finish
3874    Bush Heritage Pine Collection 5-Shelf Bookcase...
1231                              Bestar Classic Bookcase
Name: product_name, dtype: object

https://medium.com/@maziarizadi/pickle-your-model-in-python-2bbe7dba2bbb

In [15]:
import pickle
data = (tfidf_matrix, cosine_sim, product_df)
with open(os.path.join(directory_path, "TFrecommender.pkl"), 'wb') as f:
    pickle.dump(data, f)

b. Count vectorizer 

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=3000,stop_words='english')
X = vectorizer.fit_transform(product_df['overview']).toarray()

In [17]:
vectorizer.get_feature_names_out()

array(['002', '002974', '05222', ..., 'zipper', 'zl1810', 'zoom'],
      dtype=object)

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_mat = cosine_similarity(X)


In [26]:
d = pd.DataFrame(similarity_mat)
d.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1870,1871,1872,1873,1874,1875,1876,1877,1878,1879
0,1.0,0.123091,0.0,0.144338,0.0,0.117851,0.0,0.0,0.0,0.0,...,0.0,0.136083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.123091,1.0,0.0,0.1066,0.0,0.087039,0.0,0.0,0.0,0.0,...,0.0,0.301511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.227921,0.0,0.26968,0.0,0.174078,0.227921,...,0.870388,0.100504,0.213201,0.0,0.0,0.0,0.0,0.227921,0.0,0.181818
3,0.144338,0.1066,0.0,1.0,0.0,0.102062,0.0,0.0,0.0,0.0,...,0.0,0.117851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.227921,0.0,1.0,0.109109,0.338062,0.0,0.218218,0.285714,...,0.218218,0.125988,0.267261,0.0,0.0,0.0,0.0,0.285714,0.0,0.455842


In [20]:
similarity_mat[1]

array([0.12309149, 1.        , 0.        , ..., 0.        , 0.        ,
       0.        ])

In [21]:
sorted(list(enumerate(similarity_mat[0])),reverse=True,key=lambda x:x[1])[1:10]

[(1554, np.float64(0.7216878364870323)),
 (712, np.float64(0.6454972243679029)),
 (1495, np.float64(0.6454972243679029)),
 (265, np.float64(0.6154574548966638)),
 (830, np.float64(0.6154574548966638)),
 (1373, np.float64(0.6154574548966638)),
 (1847, np.float64(0.5661385170722979)),
 (897, np.float64(0.5477225575051662)),
 (573, np.float64(0.5443310539518174))]

In [22]:
def product_recommend(prod_name):
    prodIndex  = product_df[product_df['product_name']==prod_name].index[0]
    distances_ = similarity_mat[prodIndex]
    prod_list = sorted(list(enumerate(distances_)),reverse=True,key=lambda x:x[1])[1:10]
    for i in prod_list:
        print(product_df.iloc[i[0]].product_name)
    

In [23]:
product_recommend("Bush Somerset Collection Bookcase")

Bush Birmingham Collection Bookcase, Dark Cherry
Bush Westfield Collection Bookcases, Fully Assembled
Bush Cubix Collection Bookcases, Fully Assembled
Bush Westfield Collection Bookcases, Medium Cherry Finish
Bush Saratoga Collection 5-Shelf Bookcase, Hanover Cherry, *Special Order
Bush Westfield Collection Bookcases, Dark Cherry Finish
Bush Westfield Collection Bookcases, Dark Cherry Finish, Fully Assembled
Bestar Classic Bookcase
Bush Andora Bookcase, Maple/Graphite Gray Finish


In [24]:
with open(os.path.join(directory_path, "prodList.pkl"), 'wb') as f:
    pickle.dump(product_df.to_dict(), f)

with open(os.path.join(directory_path, "similarity.pkl"), 'wb') as f:
    pickle.dump(similarity_mat, f)