In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [32]:
import re    # for regular expressions
from string import punctuation
from nltk.stem import SnowballStemmer    #if you are brave enough to do stemming
from nltk.corpus import stopwords      #if you want to remove stopwords
from nltk.tokenize import word_tokenize
import string
from collections import Counter

### Data visualization

In [33]:
# Import products

products = pd.read_csv('products_train.csv')
products.head()

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc
0,B005ZSSN10,DE,RED DRAGON Amberjack 3 - Steel Tip 22 Gramm Wo...,30.95,RED DRAGON,,,RDD0089,,,Amberjacks Steel Dartpfeile sind verfügbar in ...
1,B08PRYN6LD,DE,Simply Keto Lower Carb* Schokodrops ohne Zucke...,17.9,Simply Keto,,750 g (1er Pack),,,,🌱 NATÜRLICHE SÜSSE DURCH ERYTHRIT - Wir stelle...
2,B09MBZJ48V,DE,"Sennheiser 508377 PC 5.2 Chat, Stilvolles Mult...",68.89,Sennheiser,Multi-Colour,One size,508377,Kunstleder,,3.5 MM BUCHSE - Kann problemlos an Geräte mit ...
3,B08ZN6F26S,DE,AmyBenton Auto ab 1 2 3 ahre - Baby Aufziehbar...,18.99,Amy & Benton,Animal Car,,2008B,aufziehauto 1 jahr,,【Auto aufziehbar】: Drücken Sie einfach leicht ...
4,B094DGRV7D,DE,PLAYMOBIL - 70522 - Cavaliere mit grauem Pony,7.17,PLAYMOBIL,Nicht Zutreffend.,OneSize,70522,Polypropylen,,Inhalt: 1 Stück


In [34]:
# Import sessions

sessions = pd.read_csv('sessions_train.csv')
sessions.head()

Unnamed: 0,prev_items,next_item,locale
0,['B09W9FND7K' 'B09JSPLN1M'],B09M7GY217,DE
1,['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...,B001B4THSA,DE
2,['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...,B0767DTG2Q,DE
3,['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM...,B0B4R9NN4B,DE
4,['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8'],B0BGVBKWGZ,DE


In [35]:
# temp_session = sessions.copy()

In [36]:
sessions_eng = sessions[sessions['locale']=='UK']

In [37]:
sessions_eng.shape

(1182181, 3)

In [7]:
def clean_items(text):

    text = text[1:-1]
    text = re.findall(r"'([^']*)'", text)

    return text

In [8]:
sessions_eng['prev_items'] = sessions_eng['prev_items'].apply(clean_items)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sessions_eng['prev_items'] = sessions_eng['prev_items'].apply(clean_items)


In [9]:
interacted_items = list(sessions_eng['prev_items'])
# interacted_items

In [10]:
unique_items = list(set(i for j in interacted_items for i in j))
# unique_items

In [11]:
len(unique_items)

470148

In [12]:
# Check unique locations

products['locale'].unique()

array(['DE', 'JP', 'UK', 'ES', 'FR', 'IT'], dtype=object)

In [13]:
# Exctract the English products only (UK)

products_eng = products[products['locale']=='UK']

In [14]:
products_eng = products_eng[products_eng['id'].isin(unique_items)]

In [15]:
# from sklearn.model_selection import train_test_split

In [16]:
# products_eng_used, products_eng_unused = train_test_split(products_eng, train_size=0.5, random_state=42)

In [17]:
# Check the number of products in the English set

# products_eng.size

In [18]:
# Exctract the English sessions only (UK)

# sessions_eng = sessions[sessions['locale']=='UK']

In [19]:
# Check the number of sessions in the English set

# sessions_eng.size

We see that we have more items than distinct sessions

### Clean

In [15]:
import re
import nltk

In [16]:
titles = np.array(products_eng['title'])
titles = " ".join(titles)

In [17]:
words = re.findall(r'\w+', titles)

In [18]:
word_counter = Counter(words)

In [19]:
word_frequencies = np.array(list(word_counter.values()))

In [20]:
word_quartiles = np.quantile(word_frequencies, [0,.45,0.5])
word_quartiles

array([1., 1., 2.])

In [21]:
len(word_frequencies)

176991

In [22]:
word_freq_2_more = [k for k,v in word_counter.items() if float(v) >= 2]

In [23]:
len(word_freq_2_more)

94355

In [24]:
big_count = 0

def clean(text, stem_words=True):
    
#     text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)
#     print(text)
#     text = re.sub('^(?!.[a-zA-Z])(?!.\d)[a-zA-Z0-9]+$', "", text)
    text = re.findall(r'\b(?:[a-zA-Z]+|\d+)\b', text)

    new_text = []
    for word in text:
        if word_counter[word] > 1:
            new_text.append(word)
    text = new_text

#     global big_count
#     big_count += 1
#     print(big_count)
    
    text = " ".join(text)
    
    return text

In [25]:
products_eng['title'] = products_eng['title'].apply(clean)

### Model creation

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

Problems:
- items with no description

Need to do:
- normalize text (make undercase, no italic, no bold)

In [27]:
# Train a model that finds similar items based on the title of the items only
# https://www.datacamp.com/tutorial/recommender-systems-python

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# #Replace NaN with an empty string
# products_eng['title'] = products_eng['title'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(products_eng['title'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(470148, 62117)

In [34]:
#Array mapping from feature integer indices to feature name

# tfidf.get_feature_names_out()[-1000:]

In [29]:
from sklearn.neighbors import NearestNeighbors

In [None]:
model = NearestNeighbors(n_neighbors=n_neighbor,
                         metric='cosine',
                         algorithm='brute',
                         n_jobs=-1)

In [116]:
# # https://github.com/AlexanderNixon/Machine-learning-reads/blob/master/Movie-content-based-recommender-using-tf-idf.ipynb

# cosine_sim = cosine_similarity(tfidf_matrix)

In [117]:
# tfidf_matrix = tfidf_matrix.astype('float32')

In [42]:
item_index_to_matrix_index = {k: v for v, k in enumerate(products_eng.index)}

In [45]:
matrix_index_to_item_index = {v: k for k, v in item_index_to_matrix_index.items()}

In [112]:
def get_closest_item(item_id, n):
    
    """
    Input: id of the item corresponding to the original dataframe
    Output: list of top n most similar items, excluding the input item, indexes from the original dataframe
    """
    
    matrix_item_id = item_index_to_matrix_index[item_id]
    item_vector = tfidf_matrix.getrow(matrix_item_id)
    similarity_vector = cosine_similarity(item_vector, tfidf_matrix)[0]
    
    ind = similarity_vector.argsort()[-(n+1):][::-1]
    ind = np.delete(ind, np.where(ind == matrix_item_id))
    ind = ind[:n]
    
    dataframe_idexes = []
    similarities = []
    
    for index in ind:
        similarities.append(similarity_vector[index])
        dataframe_idexes.append(matrix_index_to_item_index[index])
        
    return dataframe_idexes, similarities

In [278]:
def get_closest_item(item_id):
    
    """
    Input: id of the item corresponding to the original dataframe
    Output: list of top n most similar items, excluding the input item, indexes from the original dataframe
    """
    n = 1
    
    matrix_item_id = item_index_to_matrix_index[item_id]
    item_vector = tfidf_matrix.getrow(matrix_item_id)
    similarity_vector = linear_kernel(item_vector, tfidf_matrix)[0]
    
    ind = similarity_vector.argsort()[-2:][::-1]
    ind = np.delete(ind, np.where(ind == matrix_item_id))
    ind = ind[:1]
    
    dataframe_idexes = []
    similarities = []
    
    for index in ind:
        similarities.append(similarity_vector[index])
        dataframe_idexes.append(matrix_index_to_item_index[index])
        
    return matrix_index_to_item_index[ind[0]]

In [219]:
get_closest_item(913340)

1263071

In [113]:
get_closest_item(913336,2)

([1339655, 1051538], [0.9542421315568143, 0.9146442927057199])

### Test recommendations

In [164]:
test_sessions_eng = sessions_eng.copy()

In [165]:
comercial_id_to_title = dict(zip(products_eng.id, products_eng.title))

In [166]:
def from_commercial_id_to_title(commercial_id):
    
    if commercial_id not in comercial_id_to_title:
        return np.nan
        
    dataframe_row = comercial_id_to_title[commercial_id]
    dataframe_row
    
    return dataframe_row

In [167]:
test_sessions_eng['next_item'] = test_sessions_eng['next_item'].apply(from_commercial_id_to_title)

In [168]:
test_sessions_eng

Unnamed: 0,prev_items,next_item,locale
2090535,"[B0BFDL54Y7, B0BFDR9X13, B07J4WF8VH, B07Y21LDJX]",ADOV Vegetable Chopper 14 in 1 Multi Function ...,UK
2090536,"[B07FM2GLNQ, B07GZW3P4W]",Bedsure White Voile Sheer Curtains 2 Panels Wi...,UK
2090537,"[B0021L95HU, B07DDL77RY, B07DDL77RY]",Seven Seas Omega 3 FISH Oil Plus Cod Liver Oil...,UK
2090538,"[B0B2WSZYL2, B000I8XZ7O]",Rubie s Official Halloween Haunted House Skele...,UK
2090539,"[B00LW1APOC, B00LW1APOC, B00OI6NQUI, B09HLDN8W1]",Command Assorted Picture Hanging Strips White ...,UK
...,...,...,...
3272711,"[B06XK89969, B01NGT5NF4, B00D5Z89C8, B07ZVGCHR...",MYCARBON Travel Money Belt RFID Against Invisi...,UK
3272712,"[B076M85W1K, B07L8792Q9, B095RW318L, B095RVVX9T]",Laptop Backpack Womens 15 6 Inch School Bag wi...,UK
3272713,"[B00JQDIQRQ, B001O59QQE]",Zinc Carbon Triple AAA Batteries 1 Heavy Duty ...,UK
3272714,"[B07QMHMLJZ, B07FPYYMC4]",El Malbec Argentinian Red Wine 1 x,UK


In [169]:
item_index_to_item_commercial_id = dict(zip(products_eng.index, products_eng.id))

In [171]:
item_commercial_id_to_item_index = {v: k for k, v in item_index_to_item_commercial_id.items()}

In [279]:
def recommend_most_similar_for_last_item(item_list):
    
    last_item_commercial_id = item_list[-1]
    last_item_commercial_index = item_commercial_id_to_item_index[last_item_commercial_id]
#     recommended_items = get_closest_item(last_item_commercial_index, 1)
#     recommended_items = [[913336]]
#     recommended_items = recommended_items[0][0]
    recommended_items = get_closest_item(last_item_commercial_index)
    recommended_items = item_index_to_item_commercial_id[recommended_items]
#     print(recommended_items)
    if recommended_items not in comercial_id_to_title:
        return np.nan
    title = comercial_id_to_title[recommended_items]
    
    global count
    count += 1
    print(count, end='\r')
    
    return title

In [330]:
train_test_sessions_eng, test_test_sessions_eng = train_test_split(test_sessions_eng, train_size=0.001)
train_test_sessions_eng

Unnamed: 0,prev_items,next_item,locale
3143235,"[B08TCCVS28, B09QMJ87N7]",Busy B Mid Year Busy Life Diary August 2022 Au...,UK
2980385,"[B00EVNXR66, B07GZD6DVQ, B00EVNXR66, B00EVNXR66]",RPM Power Door Pull Up Bars Heavy Duty Door Pu...,UK
2622482,"[B0B18XBHS4, B089KVXX4B, B0B2ZZ3L8Q, B07BTR535...",Utopia Bedding Pillowcases Soft Brushed Microf...,UK
2329429,"[B09VBJ1HSM, B091CNZYZZ]",LED Desk Lamp Reading Lamp with Touch Control ...,UK
2511697,"[B08299JR38, B094KNYPVT]",Godmorn Humane Mouse Trap 2 Pack with Cleaning...,UK
...,...,...,...
2939488,"[B07CL2RHX5, B075V5JK36, B083KQNHQH]",Amazon Basics USB C to HDMI adapter cable Thun...,UK
2826050,"[B09ZTKZ3LN, B08JJ3VCXL, B08SHYRYZV, B08JJ51D7R]",Bra Extender 3 Hook Stretchy Bra Strap Extensi...,UK
2832100,"[B08DCTDWTF, B091XQZL4X, B091XWLZ7B, B091Y3T2P...",Micoden for iPhone 12 Case Cute Girls Silicone...,UK
2555307,"[B004RDYI04, B004RDYI04]",2 x Nose Spray Long Lasting Fast Acting,UK


In [331]:
count = 0
train_test_sessions_eng['prev_items'] = train_test_sessions_eng['prev_items'].apply(recommend_most_similar_for_last_item)

1182

In [332]:
train_test_sessions_eng

Unnamed: 0,prev_items,next_item,locale
3143235,Busy B Mid Year Busy Life Diary August 2022 Au...,Busy B Mid Year Busy Life Diary August 2022 Au...,UK
2980385,adidas Duffle XS Gym Bag Black Black White NS,RPM Power Door Pull Up Bars Heavy Duty Door Pu...,UK
2622482,Silentnight Deep Sleep Pillow Pack of 6 Soft H...,Utopia Bedding Pillowcases Soft Brushed Microf...,UK
2329429,LED Desk Lamp Dimmable Eye Caring Table Lamp w...,LED Desk Lamp Reading Lamp with Touch Control ...,UK
2511697,KEPLIN Humane Mouse Trap No Kill the Mice Pets...,Godmorn Humane Mouse Trap 2 Pack with Cleaning...,UK
...,...,...,...
2939488,Amazon Basics USB C to HDMI adapter cable Thun...,Amazon Basics USB C to HDMI adapter cable Thun...,UK
2826050,Bra Extender 3 Hook Elastic Bra Strap Extensio...,Bra Extender 3 Hook Stretchy Bra Strap Extensi...,UK
2832100,Micoden for iPhone 12 Case Cute Girls Silicone...,Micoden for iPhone 12 Case Cute Girls Silicone...,UK
2555307,Sudafed Blocked Nose Spray x 4 Multipack,2 x Nose Spray Long Lasting Fast Acting,UK


In [333]:
train_test_sessions_eng.dropna(subset=['next_item'], inplace=True)

In [334]:
train_test_sessions_eng.dropna(subset=['prev_items'], inplace=True)

In [335]:
train_test_sessions_eng

Unnamed: 0,prev_items,next_item,locale
3143235,Busy B Mid Year Busy Life Diary August 2022 Au...,Busy B Mid Year Busy Life Diary August 2022 Au...,UK
2980385,adidas Duffle XS Gym Bag Black Black White NS,RPM Power Door Pull Up Bars Heavy Duty Door Pu...,UK
2622482,Silentnight Deep Sleep Pillow Pack of 6 Soft H...,Utopia Bedding Pillowcases Soft Brushed Microf...,UK
2329429,LED Desk Lamp Dimmable Eye Caring Table Lamp w...,LED Desk Lamp Reading Lamp with Touch Control ...,UK
2511697,KEPLIN Humane Mouse Trap No Kill the Mice Pets...,Godmorn Humane Mouse Trap 2 Pack with Cleaning...,UK
...,...,...,...
2939488,Amazon Basics USB C to HDMI adapter cable Thun...,Amazon Basics USB C to HDMI adapter cable Thun...,UK
2826050,Bra Extender 3 Hook Elastic Bra Strap Extensio...,Bra Extender 3 Hook Stretchy Bra Strap Extensi...,UK
2832100,Micoden for iPhone 12 Case Cute Girls Silicone...,Micoden for iPhone 12 Case Cute Girls Silicone...,UK
2555307,Sudafed Blocked Nose Spray x 4 Multipack,2 x Nose Spray Long Lasting Fast Acting,UK


In [336]:
from nltk.translate.bleu_score import corpus_bleu

In [337]:
hypotheses = list(train_test_sessions_eng['prev_items'])
hypotheses = [k.split() for k in hypotheses]

In [338]:
list_of_references = list(train_test_sessions_eng['next_item'])
list_of_references = [[k.split()] for k in list_of_references]

In [339]:
corpus_bleu(list_of_references, hypotheses)

0.3114371278747136

<p>Cosine + whole dataset item x item + 0.001 test set from eng sessions<br><br>
    0.3345777847638878<br>
    0.2944351033248738<br>
    0.31359431393622406<br>
    0.30993238217722233<br>
    0.29221329254907863<br>
    0.2931361051935785<br>
    0.286349284129298<br>
    0.3306727165724064<br>
    0.33464147988807214<br>
    0.3114371278747136<br>
</p>

In [39]:
(0.309932+0.334577+0.294435+0.313594+0.292213+0.293136+0.286349+0.330672+0.334641+0.311437)/10

0.3100986

For report:

- more than half of words used only once
- a lot of products which are not rented
- hard to create a matrix of similarity with whole product dataset -> hard to test on the sessions data