In [None]:
!pip install turicreate

Collecting turicreate
[?25l  Downloading https://files.pythonhosted.org/packages/e4/76/76c624d7ae1116b22cd559288596a1f9aa7a50f8f43f4481033fc047f5e9/turicreate-6.3-cp36-cp36m-manylinux1_x86_64.whl (91.9MB)
[K     |████████████████████████████████| 91.9MB 89kB/s 
Collecting coremltools==3.3
[?25l  Downloading https://files.pythonhosted.org/packages/77/19/611916d1ef326d38857d93af5ba184f6ad7491642e0fa4f9082e7d82f034/coremltools-3.3-cp36-none-manylinux1_x86_64.whl (3.4MB)
[K     |████████████████████████████████| 3.4MB 40.9MB/s 
Collecting tensorflow<=2.0.1,>=2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/43/16/b07e3f7a4a024b47918f7018967eb984b0c542458a6141d8c48515aa81d4/tensorflow-2.0.1-cp36-cp36m-manylinux2010_x86_64.whl (86.3MB)
[K     |████████████████████████████████| 86.3MB 50kB/s 
Collecting resampy==0.2.1
[?25l  Downloading https://files.pythonhosted.org/packages/14/b6/66a06d85474190b50aee1a6c09cdc95bb405ac47338b27e9b21409da1760/resampy-0.2.1.tar.gz (322kB)


In [None]:
import pandas as pd
import turicreate as tc
from turicreate import SFrame, SArray
from turicreate import ranking_factorization_recommender
import numpy as np
from sklearn.decomposition import *

In [None]:
# Work from the shared data folder so the relative file names used by the
# load/save cells resolve.
# NOTE(review): the path is under /content/drive, so this presumably needs
# Google Drive to be mounted first; no mount call is visible here — confirm.
%cd "/content/drive/My Drive/Recommender Systems/Data"

/content/drive/My Drive/Recommender Systems/Data


## Preprocessing

In [None]:
# Main table; later cells read its categories, embedding, item_id, user_id,
# rating, price, title and brand columns.
# NOTE(review): read_pickle can execute arbitrary code while loading — only
# use it on files from a trusted source.
df = pd.read_pickle("final_dataset_merged_S2V.pkl")
# Tag table; its "Unnamed: 0" column is renamed to item_id before the merge
# (presumably it supplies the occasion_tag column used later — confirm).
tags = pd.read_csv("occasion_tags.csv")

In [None]:
def transform_cat(x):
  """Parse a stringified category list and drop the generic top-level labels.

  Args:
    x: String of the form "['A', 'B', ...]".

  Returns:
    List of category names with the first occurrence of "Clothing" and of
    "Shoes & Jewelry" removed when present. An empty list string ("[]")
    yields an empty list.
  """
  # Strip the surrounding brackets/quotes, leaving "A', 'B', ..., 'Z".
  inner = x.strip("['").strip("]'")
  if not inner:
    # "[]" would otherwise split into [''].
    return []
  y = inner.split("', '")
  # list.remove raises ValueError for a missing value, so only remove labels
  # that are actually present.
  for generic in ("Clothing", "Shoes & Jewelry"):
    if generic in y:
      y.remove(generic)
  return y

In [None]:
# Turn each stringified category list into a real list without the generic labels.
# NOTE: not idempotent — transform_cat calls str.strip, so re-running this cell
# on the already-converted (list-valued) column raises AttributeError.
df["categories"] = df["categories"].apply(transform_cat)
# Give the "Unnamed: 0" column the name item_id so it can serve as the join key.
tags = tags.rename(columns={"Unnamed: 0": "item_id"})

In [None]:
# Left join: keep every row of df and attach the columns of tags by item_id.
# NOTE(review): a duplicated item_id in tags would duplicate rows of df —
# confirm that tags has one row per item.
df = pd.merge(df, tags, how="left", on="item_id")

## PCA

In [None]:
# (embedding, item_id) records, one per row of df, consumed by the
# de-duplication loop that follows.
records = df[["embedding", "item_id"]].to_records(index=False)

In [None]:
# e collects one embedding vector per distinct item; i_id holds the matching
# item_ids in the same (first-seen) order.
e = []
i_id = []

# The same item_id can occur in many rows of records; keep only the first
# embedding seen for each one (23033 distinct items in the recorded run).
# Membership is tracked in a set so each check is O(1) instead of scanning
# the growing i_id list.
seen_item_ids = set()
for row in records:
  if row[1] not in seen_item_ids:
    seen_item_ids.add(row[1])
    i_id.append(row[1])
    e.append(np.array(row[0]))

e = np.array(e)
print(f"There are {len(e)} items and each item has {len(e[0])} embeddings")

There are 23033 items and each item has 2048 embeddings


In [None]:
# Project every item embedding onto 200 components with a cosine-kernel PCA.
pca1 = KernelPCA(n_components=200, kernel="cosine")
e3 = pca1.fit_transform(e)

# temp holds one dict per item, mapping component index -> component value,
# in the same row order as e3.
temp = [dict(enumerate(components)) for components in e3]

# Ranking Factorisation Machine

In [None]:
# Fill missing values column by column: a missing price becomes 0 and missing
# text fields become the literal string "missing".
fill_values = {"price": 0, "title": "missing", "brand": "missing", "occasion_tag": "missing"}
for column, fill_value in fill_values.items():
  df[column] = df[column].fillna(fill_value)

## Main Model (with Style2Vec Embeddings + Occasion & Mood)

In [None]:
# Convert the interaction columns of df into turicreate SArrays.
user_id = SArray(df["user_id"].tolist())
item_id = SArray(df["item_id"].tolist())
label = SArray(df["rating"].tolist())
occasion_tag = SArray(df["occasion_tag"].tolist())

In [None]:
# Interaction SFrame for the main model: user, item and rating, plus
# occasion_tag as an additional column.
sf_w_occasion = SFrame({
    "user_id": user_id,
    "item_id": item_id,
    "rating": label,
    "occasion_tag": occasion_tag,
})

In [None]:
# Item table: one row per distinct item, pairing its id with the dict of
# reduced embedding components (i_id and temp share the same row order).
item_dict = {'item_id': i_id, "embed": SArray(temp)}
item_sf = SFrame(item_dict)

In [None]:
# Preview the item table.
item_sf

item_id,embed
0000031887,"{0: -0.41994696855545044, 1: 0.035373806953430176, ..."
0123456479,"{0: -0.2087264508008957, 1: -0.04172654449939728, ..."
1608299953,"{0: 0.4019678831100464, 1: 0.6715244054794312 ..."
1617160377,"{0: 0.40174588561058044, 1: 0.6386615633964539 ..."
B00001W0KA,"{0: -0.29907581210136414, 1: -0.03397293761372566, ..."
B00001WRHJ,"{0: -0.370714396238327, 1: -0.005181174725294 ..."
B00004SR8W,"{0: 0.22734412550926208, 1: 0.22161072492599487, ..."
B00004SR8Z,"{0: -0.330423504114151, 1: -0.05777539312839508, ..."
B00004SR9P,"{0: -0.1753242015838623, 1: -0.1168707087635994, ..."
B00004U1J2,"{0: 0.048044417053461075, 1: -0.20292368531227112, ..."


In [None]:
# Create a recommender-friendly train-test split of the provided data set,
# using item_test_proportion=0.2 and a fixed random seed.
sf_train, sf_test = tc.recommender.util.random_split_by_user(sf_w_occasion,item_test_proportion=0.2, 
                                                             random_seed =10)

### Training 

In [None]:
# Fit the main model on the training interactions, passing the per-item
# embedding table as item side data.
# NOTE(review): unlike the comparison model (m2), no solver or random_seed is
# passed here, so the library defaults apply — confirm this is intended.
m1 = ranking_factorization_recommender.create(
    sf_train,
    user_id="user_id",
    item_id="item_id",
    item_data=item_sf,
    target="rating",
    side_data_factorization=True,
    max_iterations=50,
    num_factors=64,
    ranking_regularization=0.1,
)

### Score

In [None]:
# Per-user precision and recall of m1 on the held-out split at each cutoff.
# NOTE: sf_test is rebound in the comparison-model section, so run this cell
# before those cells to score m1 on its own split.
m1.evaluate_precision_recall(sf_test, cutoffs=[10,20,30,40,50])

{'precision_recall_by_user': Columns:
 	user_id	str
 	cutoff	int
 	precision	float
 	recall	float
 	count	int
 
 Rows: 2580
 
 Data:
 +----------------+--------+-----------+--------+-------+
 |    user_id     | cutoff | precision | recall | count |
 +----------------+--------+-----------+--------+-------+
 | A2L7K5U87UVHJ7 |   10   |    0.0    |  0.0   |   1   |
 | A2L7K5U87UVHJ7 |   20   |    0.0    |  0.0   |   1   |
 | A2L7K5U87UVHJ7 |   30   |    0.0    |  0.0   |   1   |
 | A2L7K5U87UVHJ7 |   40   |    0.0    |  0.0   |   1   |
 | A2L7K5U87UVHJ7 |   50   |    0.0    |  0.0   |   1   |
 | A279F26TAKYWMA |   10   |    0.0    |  0.0   |   1   |
 | A279F26TAKYWMA |   20   |    0.0    |  0.0   |   1   |
 | A279F26TAKYWMA |   30   |    0.0    |  0.0   |   1   |
 | A279F26TAKYWMA |   40   |    0.0    |  0.0   |   1   |
 | A279F26TAKYWMA |   50   |    0.0    |  0.0   |   1   |
 +----------------+--------+-----------+--------+-------+
 [2580 rows x 5 columns]
 Note: Only the head of the SF

### Recommendation

In [None]:
# Save the trained model under "final_model" (relative to the working directory).
m1.save("final_model")

In [None]:
# Load the saved model back from disk; m1 now refers to the loaded copy.
m1 = tc.load_model('final_model')

In [None]:
# Query for m1: one user id paired with an occasion tag.
obs_dict = {'user_id': SArray(["A2NOW4U7W3F7RI"]),'occasion_tag': SArray(["occasion_neutral_sports"])}
obs = SFrame(obs_dict)

In [None]:
# Ask m1 for k=10 recommendations for the query in obs.
m1.recommend(obs, k=10, diversity=0)

user_id,item_id,score,rank
A2NOW4U7W3F7RI,B001Q5QLP6,5.92846608351257,1
A2NOW4U7W3F7RI,B00178UOLY,5.717870354995661,2
A2NOW4U7W3F7RI,B002NY1OJW,5.6348506544272965,3
A2NOW4U7W3F7RI,B000EX15NY,5.571941606512072,4
A2NOW4U7W3F7RI,B004NDOIJ4,5.571692540372526,5
A2NOW4U7W3F7RI,B00BNB36ZG,5.544816355288212,6
A2NOW4U7W3F7RI,B002JCSX4M,5.532332963165972,7
A2NOW4U7W3F7RI,B00874LMGW,5.513080568282296,8
A2NOW4U7W3F7RI,B00AW80P28,5.454299512365724,9
A2NOW4U7W3F7RI,B005GYGD7O,5.445043895746476,10


In [None]:
# Query for m1: a second user id paired with a different occasion tag.
obs_dict = {'user_id': SArray(["A3NHUQ33CFH3VM"]),'occasion_tag': SArray(["occasion_neutral_casual"])}
obs = SFrame(obs_dict)

In [None]:
# Ask m1 for k=10 recommendations for the query in obs.
m1.recommend(obs, k=10, diversity=0)

user_id,item_id,score,rank
A3NHUQ33CFH3VM,B004LKXG9W,4.607024920371797,1
A3NHUQ33CFH3VM,B005UVM368,4.29332266097938,2
A3NHUQ33CFH3VM,B007RXN6J0,4.2368252267428455,3
A3NHUQ33CFH3VM,B003H10ORI,4.224797956459736,4
A3NHUQ33CFH3VM,B002IY1WEE,4.124827817978886,5
A3NHUQ33CFH3VM,B000U83I1A,4.099490174740814,6
A3NHUQ33CFH3VM,B008O7UPKG,4.052719112381711,7
A3NHUQ33CFH3VM,B005KSZHHI,3.983750823936118,8
A3NHUQ33CFH3VM,B0072J72AI,3.9665752048170777,9
A3NHUQ33CFH3VM,B001LNIL7E,3.938680668092707,10


In [None]:
# Query for m1: a third user id paired with a different occasion tag.
obs_dict = {'user_id': SArray(["A1KLRMWW2FWPL4"]),'occasion_tag': SArray(["occasion_happy_casual"])}
obs = SFrame(obs_dict)

In [None]:
# Ask m1 for k=10 recommendations for the query in obs.
m1.recommend(obs, k=10, diversity=0)

user_id,item_id,score,rank
A1KLRMWW2FWPL4,B008B0ZWTA,5.407314690571187,1
A1KLRMWW2FWPL4,B000ARPN5A,5.262056657456979,2
A1KLRMWW2FWPL4,B002SG8E98,5.1906631018972895,3
A1KLRMWW2FWPL4,B0001DYW64,5.10275008800193,4
A1KLRMWW2FWPL4,B001CGW432,5.056280234511952,5
A1KLRMWW2FWPL4,B002Q7CDR8,5.05239823865958,6
A1KLRMWW2FWPL4,B0009GA6SQ,4.998737648908038,7
A1KLRMWW2FWPL4,B0021ID9IY,4.992569272110135,8
A1KLRMWW2FWPL4,B008BTO8RI,4.972753321505589,9
A1KLRMWW2FWPL4,B00DV9XRPM,4.933755069133988,10


## Comparison Model (without Style2Vec Embeddings + Occasion & Mood)

In [None]:
# Interactions for the comparison model: user, item and rating only
# (no occasion_tag column).
user_id = SArray(df["user_id"].tolist())
item_id = SArray(df["item_id"].tolist())
label = SArray(df["rating"].tolist())
sf = SFrame({"user_id": user_id, "item_id": item_id, "rating": label})

In [None]:
# Create a recommender-friendly train-test split of the provided data set.
# NOTE(review): this uses item_test_proportion=0.1 while the main model's split
# uses 0.2, so the two models are scored on different held-out sets — confirm
# this is intended. This cell also rebinds sf_train/sf_test, which the main
# model's training and scoring cells read.
sf_train, sf_test = tc.recommender.util.random_split_by_user(sf,item_test_proportion=0.1, random_seed =10)

### Training

In [None]:
# Fit the comparison model on the plain interactions (no item side data),
# with an explicit solver and random seed.
m2 = tc.ranking_factorization_recommender.create(
    sf_train,
    user_id="user_id",
    item_id="item_id",
    target="rating",
    max_iterations=50,
    num_factors=64,
    ranking_regularization=0.1,
    solver="adagrad",
    random_seed=10,
)

### Score

In [None]:
# Per-user precision and recall of m2 on the held-out split at each cutoff.
m2.evaluate_precision_recall(sf_test, cutoffs=[10,20,30,40,50])

{'precision_recall_by_user': Columns:
 	user_id	str
 	cutoff	int
 	precision	float
 	recall	float
 	count	int
 
 Rows: 2580
 
 Data:
 +----------------+--------+-----------+--------+-------+
 |    user_id     | cutoff | precision | recall | count |
 +----------------+--------+-----------+--------+-------+
 | A2L7K5U87UVHJ7 |   10   |    0.0    |  0.0   |   1   |
 | A2L7K5U87UVHJ7 |   20   |    0.0    |  0.0   |   1   |
 | A2L7K5U87UVHJ7 |   30   |    0.0    |  0.0   |   1   |
 | A2L7K5U87UVHJ7 |   40   |    0.0    |  0.0   |   1   |
 | A2L7K5U87UVHJ7 |   50   |    0.0    |  0.0   |   1   |
 | A279F26TAKYWMA |   10   |    0.0    |  0.0   |   1   |
 | A279F26TAKYWMA |   20   |    0.0    |  0.0   |   1   |
 | A279F26TAKYWMA |   30   |    0.0    |  0.0   |   1   |
 | A279F26TAKYWMA |   40   |    0.0    |  0.0   |   1   |
 | A279F26TAKYWMA |   50   |    0.0    |  0.0   |   1   |
 +----------------+--------+-----------+--------+-------+
 [2580 rows x 5 columns]
 Note: Only the head of the SF

### Recommendation

In [None]:
# Save the trained model under "baseline_factorisation" (relative to the
# working directory).
m2.save("baseline_factorisation")

In [None]:
# Load the saved model back from disk; m2 now refers to the loaded copy.
m2 = tc.load_model("baseline_factorisation")

In [None]:
# Query for m2: a user id only, since m2 was trained without occasion_tag.
obs_dict = {'user_id': SArray(["A2NOW4U7W3F7RI"])}
obs = SFrame(obs_dict)

In [None]:
# Ask m2 for k=10 recommendations for the user in obs.
m2.recommend(obs, k=10, diversity=0)

user_id,item_id,score,rank
A2NOW4U7W3F7RI,B000O32MLI,5.540872793948092,1
A2NOW4U7W3F7RI,B000AYYIYU,4.881638270174898,2
A2NOW4U7W3F7RI,B000AOZJZM,4.713524442469515,3
A2NOW4U7W3F7RI,B00075ZYRW,4.6987402255887165,4
A2NOW4U7W3F7RI,B00DMZOSXK,4.669031601702608,5
A2NOW4U7W3F7RI,B009ZQ3XE0,4.630698662554659,6
A2NOW4U7W3F7RI,B004AFNM7O,4.621831637179293,7
A2NOW4U7W3F7RI,B00401VJ32,4.607583623682894,8
A2NOW4U7W3F7RI,B009OKPR96,4.599393111025728,9
A2NOW4U7W3F7RI,B0011ZX636,4.591355782305636,10


In [None]:
# Query for m2: a second user id only, since m2 was trained without occasion_tag.
obs_dict = {'user_id': SArray(["A3NHUQ33CFH3VM"])}
obs = SFrame(obs_dict)

In [None]:
# Ask m2 for k=10 recommendations for the user in obs.
m2.recommend(obs, k=10, diversity=0)

user_id,item_id,score,rank
A3NHUQ33CFH3VM,B00422MCUS,5.597339373385347,1
A3NHUQ33CFH3VM,B000ARPN46,5.191772204195894,2
A3NHUQ33CFH3VM,B005VNFCZO,5.139370184695162,3
A3NHUQ33CFH3VM,B0051U15E4,5.011927347933687,4
A3NHUQ33CFH3VM,B007WNWEFC,4.827659350191988,5
A3NHUQ33CFH3VM,B0047N0ZYW,4.8116740997189655,6
A3NHUQ33CFH3VM,B0030CM2YC,4.779813032900728,7
A3NHUQ33CFH3VM,B0010ODJNK,4.751229744707979,8
A3NHUQ33CFH3VM,B0015MN8QU,4.648667317187227,9
A3NHUQ33CFH3VM,B00BZHXRPC,4.647022705828585,10
