###### Getting Started: Loading Libraries

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Loading the Dataset


In [None]:
df_j = pd.read_csv('process.csv')


We have our dataframe ready, so let`s visualize it

In [None]:
# display the the five first rows
df_j.head()

Unnamed: 0,id,Name,Brand,Product_Type,Search_Price,Gender,AW_Deep_Link,Merchant_Image,Color
0,65187,Bauchnabelpiercing Strass Kreuz mit Gothic Flügel,Crysal Jewelry,Bauchnabel Piercing,15.0,Unisex,https://t.adcell.com/p/click?promoId=153907&sl...,https://cdn.shopify.com/s/files/1/0525/0809/87...,silber
1,65186,Bauchnabelpiercing aus Chirurgenstahl mit 925e...,Crysal Jewelry,Bauchnabel Piercing,13.5,Unisex,https://t.adcell.com/p/click?promoId=153907&sl...,https://cdn.shopify.com/s/files/1/0525/0809/87...,weiß
2,65185,Bauchnabelpiercing Titan Zirkonia CZ 925er Sil...,Crysal Jewelry,Bauchnabel Piercing,14.99,Unisex,https://t.adcell.com/p/click?promoId=153907&sl...,https://cdn.shopify.com/s/files/1/0525/0809/87...,weiß
3,65184,Bauchnabelpiercing Bioflex PTFE flexibel Schwa...,Crysal Jewelry,Bauchnabel Piercing,3.99,Herren,https://t.adcell.com/p/click?promoId=153907&sl...,https://cdn.shopify.com/s/files/1/0525/0809/87...,gold
4,65183,Elegantes Bauchnabelpiercing Anhänger mit *Zir...,Crysal Jewelry,Bauchnabel Piercing,7.99,Unisex,https://t.adcell.com/p/click?promoId=153907&sl...,https://cdn.shopify.com/s/files/1/0525/0809/87...,weiß


In [None]:
# check the data type and if there is any missing value
df_j.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23966 entries, 0 to 23965
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              23966 non-null  int64  
 1   Name            23966 non-null  object 
 2   Brand           23966 non-null  object 
 3   Product_Type    23948 non-null  object 
 4   Search_Price    23966 non-null  float64
 5   Gender          23966 non-null  object 
 6   AW_Deep_Link    23966 non-null  object 
 7   Merchant_Image  23962 non-null  object 
 8   Color           23954 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 1.6+ MB


In [None]:
# print all columns of the dataset
print(df_j.columns.values)

['id' 'Name' 'Brand' 'Product_Type' 'Search_Price' 'Gender' 'AW_Deep_Link'
 'Merchant_Image' 'Color']


### Onvisualizing the dataset, you may have noticed that it has many extra info about the data. We don’t need all of them. So, we choose keywords, cast, genres, director and title column to use as our feature set.

### for this project we decide to use some columns instead of all the columns 

In [6]:
## cols to be used for further analysis

df_j = df_j[['Name','Brand','Product_Type','Color']]

In [7]:
df_j.head(2)


Unnamed: 0,Name,Brand,Product_Type,Color
0,Bauchnabelpiercing Strass Kreuz mit Gothic Flügel,Crysal Jewelry,Bauchnabel Piercing,silber
1,Bauchnabelpiercing aus Chirurgenstahl mit 925e...,Crysal Jewelry,Bauchnabel Piercing,weiß


In [8]:
# check missing data 
df_j.isnull().sum()

Name             0
Brand            0
Product_Type    18
Color           12
dtype: int64

As you may can noticed that some columns have NaN data points that will create a problem for us, so what we will do is instead of NaN values we will replace it with empty string ('').

In [9]:

# remove nan value
df_j.dropna(inplace = True)

In [10]:
# check duplicated 
df_j.duplicated().sum()


9805

Let combine  some  columns 


In [11]:
df_j.columns

Index(['Name', 'Brand', 'Product_Type', 'Color'], dtype='object')

In [16]:
def combine_features(row):
    return row['Name']+' '+row['Brand']+' '+row['Product_Type']+'  '+row['Color']

Now, we need to call this function over each row of our dataframe. 

applying combine_feature method over each row of Dataframe and storing the combined string in "combined_features" column

In [17]:
df_j['combined_features'] = df_j.apply(combine_features, axis = 1)

In [18]:
print(df_j.loc[0, 'combined_features'])

Bauchnabelpiercing Strass Kreuz mit Gothic Flügel Crysal Jewelry Bauchnabel Piercing  silber


In [19]:
df_j.head(2)

Unnamed: 0,Name,Brand,Product_Type,Color,combined_features
0,Bauchnabelpiercing Strass Kreuz mit Gothic Flügel,Crysal Jewelry,Bauchnabel Piercing,silber,Bauchnabelpiercing Strass Kreuz mit Gothic Flü...
1,Bauchnabelpiercing aus Chirurgenstahl mit 925e...,Crysal Jewelry,Bauchnabel Piercing,weiß,Bauchnabelpiercing aus Chirurgenstahl mit 925e...



Now that we have obtained the combined strings, we can now feed these strings to a CountVectorizer() object for getting the count matrix.

In [20]:
##Convert columns values in lowercase
df_j['Name']=df_j['Name'].str.lower()
df_j['Brand']=df_j['Brand'].str.lower()
df_j['Product_Type']=df_j['Product_Type'].str.lower()
df_j['Color']=df_j['Color'].str.lower()
df_j['combined_features']=df_j['combined_features'].str.lower()

In [21]:
df_j.head(2)

Unnamed: 0,Name,Brand,Product_Type,Color,combined_features
0,bauchnabelpiercing strass kreuz mit gothic flügel,crysal jewelry,bauchnabel piercing,silber,bauchnabelpiercing strass kreuz mit gothic flü...
1,bauchnabelpiercing aus chirurgenstahl mit 925e...,crysal jewelry,bauchnabel piercing,weiß,bauchnabelpiercing aus chirurgenstahl mit 925e...


In [22]:
from nltk.stem import WordNetLemmatizer

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer

from sklearn.metrics.pairwise import cosine_similarity
import re




In [23]:
import string
#REMOVE PUNCTUATIONS
def rem_punctuations(text):
    for punctuation in string.punctuation: #string.punctuation gives all set of punctuations
        text = text.replace(punctuation, '')
    return text

df_j['combined_features'] = df_j['combined_features'].apply(rem_punctuations)


In [24]:
df_j.columns

Index(['Name', 'Brand', 'Product_Type', 'Color', 'combined_features'], dtype='object')

In [25]:
# REMOVE NON ASCII CHARACTERS
def remove_non_ascii(string):
    return "".join(c for c in string if ord(c) < 128)

df_j['Name'] = df_j['Name'].apply(remove_non_ascii)
df_j['Brand'] = df_j['Brand'].apply(remove_non_ascii)
df_j['Product_Type'] = df_j['Product_Type'].apply(remove_non_ascii)
df_j['Color'] = df_j['Color'].apply(remove_non_ascii)
df_j['combined_features'] = df_j['combined_features'].apply(remove_non_ascii)

# REMOVE HTML CODES
def rem_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

df_j['Name'] = df_j['Name'].apply(rem_html)
df_j['Brand'] = df_j['Brand'].apply(rem_html)
df_j['Product_Type'] = df_j['Product_Type'].apply(rem_html)
df_j['Color'] = df_j['Color'].apply(rem_html)
df_j['combined_features'] = df_j['combined_features'].apply(rem_html)


In [26]:
from nltk.corpus import stopwords
print(stopwords.fileids())

['arabic', 'azerbaijani', 'basque', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']


In [27]:
#remove stopwords
from nltk.corpus import stopwords

def rem_stopwords(text):
    return " ".join(word for word in str(text).split() if word not in set(stopwords.words('german')))


df_j['Name'] = df_j['Name'].apply(rem_stopwords)
df_j['Brand'] = df_j['Brand'].apply(rem_stopwords)
df_j['Product_Type'] = df_j['Product_Type'].apply(rem_stopwords)
df_j['combined_features'] = df_j['combined_features'].apply(rem_stopwords)


In [28]:
import nltk
nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('omw-1.4')

#from collections import Counter 

#from nltk.corpus import wordnet # To get words in dictionary with their parts of speech
from nltk.stem import WordNetLemmatizer

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [94]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akwow\AppData\Roaming\nltk_data...


True

In [29]:
wln = WordNetLemmatizer()

def lem_words(text):
    return " ".join(wln.lemmatize(word) for word in text.split())


df_j['Name'] = df_j['Name'].apply(lem_words)
df_j['Brand'] = df_j['Brand'].apply(lem_words)
df_j['Product_Type'] = df_j['Product_Type'].apply(lem_words)
df_j['Color'] = df_j['Color'].apply(lem_words)
df_j['combined_features'] = df_j['combined_features'].apply(lem_words)

In [30]:

df_j['Name'] = df_j['Name'].str.strip()
df_j['Brand'] = df_j['Brand'].str.strip()
df_j['Product_Type'] = df_j['Product_Type'].str.strip()
df_j['Color'] = df_j['Product_Type'].str.strip()
df_j['combined_features'] = df_j['combined_features'].str.strip()


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
import string
def tostr(text):
    return ''.join(text)
df_j['Name'] = df_j['Name'].apply(tostr)
df_j['Brand'] = df_j['Brand'].apply(tostr)
df_j['Product_Type'] = df_j['Product_Type'].apply(tostr)
df_j['Color'] = df_j['Color'].apply(tostr)
df_j['combined_features'] = df_j['combined_features'].apply(tostr)
print(df_j['combined_features'][0])
type(df_j['combined_features'][0])

bauchnabelpiercing strass kreuz gothic flgel crysal jewelry bauchnabel piercing silber


str

In [102]:
#fetch category inside function- working recommend function
def recommend(name):
    name= name.lower()
    
    #Fetch the category of our title
    titlerow = df_j.loc[df_j['Name']== name].iloc[0]
    category=titlerow['Brand']

    # MATCH THE CATEGORY WITH THE COLUMN "CATEGORIES" OF THE DATASET
    data = df_j.loc[df_j['Brand'] == category].copy()

    if len(data)<=10: ##As our dataset is unbalanced, if the matching category contains less than 10 book titles
      data=df_j.copy()          #,then we ommit the category filtering


    # RESET INDEX
    data.reset_index(level = 0, inplace = True, drop=True) 
    
    # INDEX TO A PANDAS SERIES
    indices = pd.Series(data.index, index = data['Name'])
    
    # CONVERT THE BOOK TITLE INTO VECTORS AND USE BIGRAM
    tf = TfidfVectorizer(analyzer='word', ngram_range=(2, 2), min_df = 1, stop_words='english',sublinear_tf=True)
    
    # tfidf_matrix = tf.fit_transform(data['title'])
    #sublinear_tfbool, default=False
    #Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf)
    
    tfidf_matrix = tf.fit_transform(data['combined_features'])
    
    # CALCULATE THE SIMILARITY MEASURE
    similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # GET THE INDEX OF ORIGINAL TITLE
    title_index = indices[Brand].tolist()
    print("title_index",title_index)
    if not(type(title_index) is int): 
        
        title_index=title_index[0]    #if more than one matching index exists, take the 1st one
        inds=indices[Brand].tolist()
        for i in inds:   #to drop other rows with the same title
          if i!=title_index:
            print('index dropped:',i)
            data.drop(i,inplace=True)
    
    # PAIRWISE SIMILARITY SCORES
    similarity = list(enumerate(similarity[title_index]))
    
    # SORT THE BOOKS
    similarity = sorted(similarity, key=lambda x: x[1], reverse=True)

    # GET TOP 10 MOST SIMILAR BOOKS
    similarity  = similarity [1:11]
    print("similarity:",similarity,"\n")
    book_indices = [i[0] for i in similarity]

    #Weighted Rating method
    top10_rated = data['Search_Price'].iloc[book_indices]
    wsort = top10_rated.sort_values(ascending = False)
    wsort_top5 = wsort[:6]
    wsort_top5.to_frame()

    # INDICES OF TOP 5
    wsort_indices = wsort_top5.index
    #print(wsort_indices)

    # TOP 5 RECOMMENDATION
    rec = data[['Brand']].iloc[wsort_indices]
    
    # PRINT THE BOOKS TITLE
    print("\n","The recommendations are:")
    print(rec['Brand'])

In [32]:
df_j.head(2)

Unnamed: 0,Name,Brand,Product_Type,Color,combined_features
0,bauchnabelpiercing strass kreuz gothic flgel,crysal jewelry,bauchnabel piercing,silber,bauchnabelpiercing strass kreuz gothic flgel c...
1,bauchnabelpiercing chirurgenstahl 925er silber...,crysal jewelry,bauchnabel piercing,wei,bauchnabelpiercing chirurgenstahl 925er silber...


In [103]:
recommend("bauchnabel piercing")

IndexError: single positional indexer is out-of-bounds

In [33]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df_j['combined_features'])

Now, we need to obtain the cosine similarity matrix from the count matrix.

In [34]:
cosine_sim = cosine_similarity(count_matrix)

Now, we will define two helper functions to get dataset  brand  from dataset  index and vice-versa.

In [35]:
cols=df_j.columns

In [48]:
df_j.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            23956, 23957, 23958, 23959, 23960, 23961, 23962, 23963, 23964,
            23965],
           dtype='int64', length=23936)

In [36]:
def get_title_from_index(index):
    return df_j[df_j.index == index]["Name"].values[0]
def get_index_from_brand(Name):
    return df_j[df_j.Name == Name].index.values[0]


In [47]:
df_j[df_j.Name == 'Bauchnabelpiercing Strass Kreuz mit Gothic Flügel'].index.values

array([], dtype=int64)

Our next step is to get the title of the movie that the user currently likes. Then we will find the index of that movie. After that, we will access the row corresponding to this movie in the similarity matrix. Thus, we will get the similarity scores of all other movies from the current movie. Then we will enumerate through all the similarity scores of that movie to make a tuple of movie index and similarity score. This will convert a row of similarity scores like this- [1 0.5 0.2 0.9] to this- [(0, 1) (1, 0.5) (2, 0.2) (3, 0.9)] . Here, each item is in this form- (movie index, similarity score)

In [39]:
df_j.head(2)

Unnamed: 0,Name,Brand,Product_Type,Color,combined_features
0,bauchnabelpiercing strass kreuz gothic flgel,crysal jewelry,bauchnabel piercing,silber,bauchnabelpiercing strass kreuz gothic flgel c...
1,bauchnabelpiercing chirurgenstahl 925er silber...,crysal jewelry,bauchnabel piercing,wei,bauchnabelpiercing chirurgenstahl 925er silber...


In [50]:
jewlry_user_likes = "bauchnabelpiercing strass kreuz gothic flgel"
df_j_brand = get_index_from_brand(jewlry_user_likes)
similar_jewlry = list(enumerate(cosine_sim[df_j_brand])) #accessing the row corresponding to given movie to find all the similarity scores for that movie and then enumerating over it

We will sort the list similar_movies according to similarity scores in descending order. Since the most similar movie to a given movie will be itself, we will discard the first element after sorting the movies.

In [51]:
sorted_similar_movies = sorted(similar_jewlry,key=lambda x:x[1],reverse=True)[1:]

Then, we will run a loop to print first `10 entries from sorted_similar_movies list.

In [53]:
i=0
print("Top 10 similar jewlry  to "+jewlry_user_likes+" are:\n")
for element in sorted_similar_movies:
    print(get_title_from_index(element[0]))
    i=i+1
    if i>10:
        break

Top 10 similar jewlry  to bauchnabelpiercing strass kreuz gothic flgel are:

reverse bauchnabelpiercing gothic kreuz kristallstein
bauchnabelpiercing fantasy kreuz flgel kristall chirurgenstahl
bauchnabelpiercing bunter schmetterling c316l
reverse bauchnabelpiercing tribal-style zirkonia
bauchnabelpiercing tribal-style zirkonia steinchen
bauchnabel piercing "love" herz strass *zirkonia stein
bauchnabelpiercing tribal gothic kreuz schwarzer kristall stein c316l
bauchnabel piercing 3 schmetterlingen chirurgenstahl
reverse bauchnabelpiercing schnem schmetterling butterfly
bauchnabelpiercing gepunkteter glitzer schmetterling c316l
bauchnabelpiercing tribal motive zirkonia steinchen
