In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import re    # for regular expressions
from string import punctuation
from nltk.stem import SnowballStemmer    #if you are brave enough to do stemming
from nltk.corpus import stopwords      #if you want to remove stopwords
from nltk.tokenize import word_tokenize
import string
from collections import Counter

In [4]:
import gensim
import pprint
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim import models
from gensim.similarities.annoy import AnnoyIndexer

### Data visualization

In [5]:
# Import products

products = pd.read_csv('products_train.csv')
products.head()

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc
0,B005ZSSN10,DE,RED DRAGON Amberjack 3 - Steel Tip 22 Gramm Wo...,30.95,RED DRAGON,,,RDD0089,,,Amberjacks Steel Dartpfeile sind verfügbar in ...
1,B08PRYN6LD,DE,Simply Keto Lower Carb* Schokodrops ohne Zucke...,17.9,Simply Keto,,750 g (1er Pack),,,,🌱 NATÜRLICHE SÜSSE DURCH ERYTHRIT - Wir stelle...
2,B09MBZJ48V,DE,"Sennheiser 508377 PC 5.2 Chat, Stilvolles Mult...",68.89,Sennheiser,Multi-Colour,One size,508377,Kunstleder,,3.5 MM BUCHSE - Kann problemlos an Geräte mit ...
3,B08ZN6F26S,DE,AmyBenton Auto ab 1 2 3 ahre - Baby Aufziehbar...,18.99,Amy & Benton,Animal Car,,2008B,aufziehauto 1 jahr,,【Auto aufziehbar】: Drücken Sie einfach leicht ...
4,B094DGRV7D,DE,PLAYMOBIL - 70522 - Cavaliere mit grauem Pony,7.17,PLAYMOBIL,Nicht Zutreffend.,OneSize,70522,Polypropylen,,Inhalt: 1 Stück


In [6]:
# Import sessions

sessions = pd.read_csv('sessions_train.csv')
sessions.head()

Unnamed: 0,prev_items,next_item,locale
0,['B09W9FND7K' 'B09JSPLN1M'],B09M7GY217,DE
1,['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...,B001B4THSA,DE
2,['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...,B0767DTG2Q,DE
3,['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM...,B0B4R9NN4B,DE
4,['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8'],B0BGVBKWGZ,DE


In [7]:
sessions_eng = sessions[sessions['locale']=='UK']

In [8]:
def clean_items(text):

    text = text[1:-1]
    text = re.findall(r"'([^']*)'", text)

    return text

In [9]:
sessions_eng['prev_items'] = sessions_eng['prev_items'].apply(clean_items)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sessions_eng['prev_items'] = sessions_eng['prev_items'].apply(clean_items)


In [10]:
interacted_items = list(sessions_eng['prev_items'])
# interacted_items

In [11]:
unique_items = list(set(i for j in interacted_items for i in j))
# unique_items

In [12]:
len(unique_items)

470148

In [13]:
# Check unique locations

products['locale'].unique()

array(['DE', 'JP', 'UK', 'ES', 'FR', 'IT'], dtype=object)

In [14]:
# Exctract the English products only (UK)

products_eng = products[products['locale']=='UK']

In [15]:
products_eng = products_eng[products_eng['id'].isin(unique_items)]

We see that we have more items than distinct sessions

### Clean

In [16]:
import re
import nltk

In [17]:
titles = np.array(products_eng['title'])
titles = " ".join(titles)

In [18]:
words = re.findall(r'\w+', titles)

In [19]:
word_counter = Counter(words)

In [20]:
word_frequencies = np.array(list(word_counter.values()))

In [21]:
word_quartiles = np.quantile(word_frequencies, [0,.45,0.5])
word_quartiles

array([1., 1., 2.])

In [22]:
len(word_frequencies)

176991

In [23]:
word_freq_2_more = [k for k,v in word_counter.items() if float(v) >= 2]

In [24]:
len(word_freq_2_more)

94355

In [25]:
big_count = 0

def clean(text, stem_words=True):
    
#     text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)
#     print(text)
#     text = re.sub('^(?!.[a-zA-Z])(?!.\d)[a-zA-Z0-9]+$', "", text)
    text = re.findall(r'\b(?:[a-zA-Z]+|\d+)\b', text)

    new_text = []
    for word in text:
        if word_counter[word] > 1:
            new_text.append(word)
    text = new_text

#     global big_count
#     big_count += 1
#     print(big_count)
    
    text = " ".join(text)
    
    return text

In [26]:
products_eng['title'] = products_eng['title'].apply(clean)

### Model creation

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

Problems:
- items with no description

Need to do:
- normalize text (make undercase, no italic, no bold)

In [28]:
# Train a model that finds similar items based on the title of the items only
# https://www.datacamp.com/tutorial/recommender-systems-python

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# #Replace NaN with an empty string
# products_eng['title'] = products_eng['title'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(products_eng['title'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(470148, 62117)

In [29]:
item_index_to_tfidf_index = {k: v for v, k in enumerate(products_eng.index)}

In [30]:
tfidf_index_to_item_index = {v: k for k, v in item_index_to_tfidf_index.items()}

In [31]:
import pysparnn.cluster_index as ci

In [32]:
features_vec = tfidf.transform(products_eng['title'])

In [33]:
cp = ci.MultiClusterIndex(features_vec, products_eng['title'])

  magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square)


In [50]:
search_features_vec = tfidf_matrix[2]
# search_features_vec

In [55]:
cp.search(search_features_vec, k=2, k_clusters=2, return_distance=False)

[['500 Easy Recipes for Every Machine Both Stovetop and Electric A Cookbook',
  'An Cookbook']]

In [52]:
products_eng.iloc[2].title

'500 Easy Recipes for Every Machine Both Stovetop and Electric A Cookbook'

### Test

In [56]:
item_index_to_item_commercial_id = dict(zip(products_eng.index, products_eng.id))

In [57]:
item_commercial_id_to_item_index = {v: k for k, v in item_index_to_item_commercial_id.items()}

In [108]:
def recommend_most_similar_for_last_item(item_list):
    
    last_item_commercial_id = item_list[-1]
#     print(last_item_commercial_id)
    last_item_commercial_index = item_commercial_id_to_item_index[last_item_commercial_id]
    
    item_index = item_index_to_tfidf_index[last_item_commercial_index]
    item_vector = tfidf_matrix[item_index]
    
    recommenders = cp.search(item_vector, k=100, k_clusters=2, return_distance=False)
#     print(recommenders)
    recommended_item = recommenders[0][1]
#     print()
    
#     recommended_items = get_closest_item(last_item_commercial_index)
#     recommended_items = item_index_to_item_commercial_id[recommended_items]
    
#     if recommended_items not in comercial_id_to_title:
#         return np.nan
#     title = comercial_id_to_title[recommended_items]
    
    global count
    count += 1
    print(count, end='\r')
    
    return recommended_item

In [109]:
count = 0
sessions_eng['prev_items'] = sessions_eng['prev_items'].apply(recommend_most_similar_for_last_item)

1182181

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sessions_eng['prev_items'] = sessions_eng['prev_items'].apply(recommend_most_similar_for_last_item)


In [112]:
comercial_id_to_title = dict(zip(products_eng.id, products_eng.title))

In [113]:
def from_commercial_id_to_title(commercial_id):
    
    if commercial_id not in comercial_id_to_title:
        return np.nan
        
    dataframe_row = comercial_id_to_title[commercial_id]
    dataframe_row
    
    return dataframe_row

In [115]:
sessions_eng['next_item'] = sessions_eng['next_item'].apply(from_commercial_id_to_title)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sessions_eng['next_item'] = sessions_eng['next_item'].apply(from_commercial_id_to_title)


In [121]:
sessions_eng

Unnamed: 0,prev_items,next_item,locale
2090535,ADOV Vegetable Chopper 14 in 1 Multi Function ...,ADOV Vegetable Chopper 14 in 1 Multi Function ...,UK
2090536,BT Home Phone with Nuisance Call Blocking and ...,Bedsure White Voile Sheer Curtains 2 Panels Wi...,UK
2090537,Seven Seas Cod Liver Oil Tablets Plus Multivit...,Seven Seas Omega 3 FISH Oil Plus Cod Liver Oil...,UK
2090538,Rubie s Official Halloween Haunted House Skele...,Rubie s Official Halloween Haunted House Skele...,UK
2090539,Command Decorating Clips Value Pack of 40 Mini...,Command Assorted Picture Hanging Strips White ...,UK
...,...,...,...
3272711,WILLWELL SPORT Money belt RFID Blocking Waterp...,MYCARBON Travel Money Belt RFID Against Invisi...,UK
3272712,Backpack Womens Waterproof School Backpack Gif...,Laptop Backpack Womens 15 6 Inch School Bag wi...,UK
3272713,Regatta unisex child Kids Stormbrk O T Trouser...,Zinc Carbon Triple AAA Batteries 1 Heavy Duty ...,UK
3272714,19 Crimes Cabernet Sauvignon 6 x,El Malbec Argentinian Red Wine 1 x,UK


In [129]:
sessions_eng.dropna(subset=['next_item'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [130]:
sessions_eng.dropna(subset=['prev_items'], inplace=True)

In [117]:
from nltk.translate.bleu_score import corpus_bleu

In [133]:
hypotheses = list(sessions_eng['prev_items'])
hypotheses = [k.split() for k in hypotheses]

In [131]:
list_of_references = list(sessions_eng['next_item'])
list_of_references = [[sent.split()] for sent in list_of_references]

In [134]:
corpus_bleu(list_of_references, hypotheses)

0.29126463627091104