# Importing libraries

In [1]:
#Importing the necessary packages
import requests
from PIL import Image
from io import BytesIO
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
import scipy.sparse
from numpy import savez_compressed
from numpy import load
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import gridspec
import warnings
warnings.filterwarnings("ignore")
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import math

# Result visualization for IDF,TF-IDF, Bag of words vectorization

In [3]:
#Utility function for results 
def display_img(url):
    """Download the image at ``url`` and draw it on the current matplotlib axes.

    Args:
        url: address of the product image to download.

    Raises:
        requests.RequestException: if the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled server from hanging the notebook indefinitely.
    response = requests.get(url, timeout=10)
    # Surface the HTTP error itself instead of letting PIL fail on an error page.
    response.raise_for_status()
    img = Image.open(BytesIO(response.content))
    plt.imshow(img)

def heatmap_image(keys,values,labels,url,text):
    """Show a one-row word heatmap next to the product image.

    Args:
        keys: words of the recommended title, used as x tick labels.
        values: one number per word; they colour the heatmap cells.
        labels: one annotation per word, printed inside the cells.
        url: address of the product image drawn in the right-hand panel.
        text: title written above the heatmap.
    """
    # Two panels side by side: a wide one for the heatmap, a narrow one for the image.
    layout = gridspec.GridSpec(1, 2, width_ratios=[4, 1])
    plt.figure(figsize=(25, 3))

    # Left panel: single-row heatmap of the words.
    plt.subplot(layout[0])
    heat_ax = sns.heatmap(np.array([values]), annot=np.array([labels]))
    heat_ax.set_xticklabels(keys)
    heat_ax.set_title(text)

    # Right panel: the product image, with the axis ticks hidden.
    image_ax = plt.subplot(layout[1])
    image_ax.set_xticks([])
    image_ax.set_yticks([])
    display_img(url)

    # Render the combined figure.
    plt.show()

def heatmap_image_plot(doc_id,vec1,vec2,url,text,model,tfidf_title_vectorizer,tfidf_title_features,idf_title_vectorizer,idf_title_features):
    """Build heatmap values and annotations for a recommended title and plot them.

    Args:
        doc_id: row of the recommended title in the vectorized feature matrix.
        vec1: word -> count mapping of the input title.
        vec2: word -> count mapping of the recommended title (left unmodified).
        url: address of the recommended product's image.
        text: title written above the heatmap.
        model: 'bag_of_words', 'Tfidf' or 'idf'; selects what the cell
            annotations show.
        tfidf_title_vectorizer: fitted vectorizer exposing ``vocabulary_``;
            only read when ``model == 'Tfidf'``.
        tfidf_title_features: feature matrix indexed as ``[doc_id, column]``;
            only read when ``model == 'Tfidf'``.
        idf_title_vectorizer: fitted vectorizer exposing ``vocabulary_``;
            only read when ``model == 'idf'``.
        idf_title_features: feature matrix indexed as ``[doc_id, column]``;
            only read when ``model == 'idf'``.

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    # Validate first so an unknown model fails clearly instead of reaching
    # the plotting call with no labels defined.
    if model not in ('bag_of_words', 'Tfidf', 'idf'):
        raise ValueError(
            "model must be 'bag_of_words', 'Tfidf' or 'idf', got {!r}".format(model)
        )

    # One heatmap cell per word of the recommended title.
    keys = list(vec2)

    # A word shared with the input title keeps its count in the recommended
    # title; any other word gets 0. Built without mutating the caller's vec2.
    values = [vec2[word] if word in vec1 else 0 for word in keys]

    if model == 'bag_of_words':
        labels = values
    else:
        if model == 'Tfidf':
            vectorizer, features = tfidf_title_vectorizer, tfidf_title_features
        else:
            vectorizer, features = idf_title_vectorizer, idf_title_features
        # vocabulary_ maps each corpus word to its column in the feature matrix;
        # words the vectorizer does not know are annotated with 0.
        vocabulary = vectorizer.vocabulary_
        labels = [
            features[doc_id, vocabulary[word]] if word in vocabulary else 0
            for word in keys
        ]

    heatmap_image(keys,values,labels,url,text)
                     
                     
def text_vector(sentence):
    """Count how often each whitespace-separated token occurs in ``sentence``.

    Returns:
        A ``Counter`` mapping token -> number of occurrences.
    """
    return Counter(sentence.split())


def results(doc_id,sentence1,sentence2,url,model,tfidf_title_vectorizer,tfidf_title_features,idf_title_vectorizer,idf_title_features):
    """Plot the word heatmap comparing an input title with a recommended title.

    Both titles are turned into word-count vectors and handed to
    ``heatmap_image_plot`` together with the remaining arguments; the
    recommended title (``sentence2``) is also used as the plot title.
    """
    input_counts = text_vector(sentence1)
    recommended_counts = text_vector(sentence2)

    heatmap_image_plot(
        doc_id,
        input_counts,
        recommended_counts,
        url,
        sentence2,
        model,
        tfidf_title_vectorizer,
        tfidf_title_features,
        idf_title_vectorizer,
        idf_title_features,
    )

# Result visualization for Avg Word2Vec and Weighted Word2Vec Vectorization:

In [4]:
#utility functions to better visualize and understand results

def get_word_vec(sentence,doc_id,model_name,idf_title_vectorizer,idf_title_features):
    """Return one 300-dimensional vector per word of ``sentence``.

    Args:
        sentence: title of the product.
        doc_id: row of this title in ``idf_title_features``.
        model_name: 'avg' uses the plain Word2Vec vector of each word;
            'weighted' multiplies it by the word's value in
            ``idf_title_features``.
        idf_title_vectorizer: fitted vectorizer exposing ``vocabulary_``;
            only read when ``model_name == 'weighted'``.
        idf_title_features: matrix indexed as ``[doc_id, column]``; only read
            when ``model_name == 'weighted'``.

    Returns:
        Array with exactly one row per word of ``sentence``, in order. A word
        that has no Word2Vec vector, or that cannot be weighted because it is
        missing from the IDF vocabulary, gets an all-zero row.
    """
    vec = []
    for word in sentence.split():
        # Start from the zero vector so every word contributes exactly one row;
        # callers label heatmap rows/columns with sentence.split(), so skipping
        # a word would shift every label after it.
        row = np.zeros(shape=(300,))
        if word in vocab:
            if model_name == 'avg':
                row = modl[word]
            elif model_name == 'weighted' and word in idf_title_vectorizer.vocabulary_:
                column = idf_title_vectorizer.vocabulary_[word]
                row = idf_title_features[doc_id, column] * modl[word]
        vec.append(row)
    return np.array(vec)
def get_distance(vec1,vec2):
    """Compute the Euclidean distance between every row of ``vec1`` and ``vec2``.

    Args:
        vec1: iterable of word vectors for title 1 (one row per word).
        vec2: iterable of word vectors for title 2 (one row per word).

    Returns:
        Array whose entry ``[i][j]`` is the norm of ``vec1[i] - vec2[j]``.
    """
    rows = [
        np.array([np.linalg.norm(left - right) for right in vec2])
        for left in vec1
    ]
    return np.array(rows)

def results_Word2Vec(sentence1,sentence2,url,doc_id1,doc_id2,model_name,idf_title_vectorizer,idf_title_features):
    """Plot word-to-word distances between two titles next to the product image.

    Args:
        sentence1: title of the input product (heatmap rows).
        sentence2: title of the recommended product (heatmap columns and
            plot title).
        url: address of the recommended product's image.
        doc_id1: row of ``sentence1`` in the vectorized array.
        doc_id2: row of ``sentence2`` in the vectorized array.
        model_name: passed through to ``get_word_vec``.
        idf_title_vectorizer: passed through to ``get_word_vec``.
        idf_title_features: passed through to ``get_word_vec``.
    """
    input_vectors = get_word_vec(sentence1, doc_id1, model_name, idf_title_vectorizer, idf_title_features)
    recommended_vectors = get_word_vec(sentence2, doc_id2, model_name, idf_title_vectorizer, idf_title_features)

    # Entry [i][j] is the Euclidean distance between word i of title 1 and
    # word j of title 2.
    word_distances = get_distance(input_vectors, recommended_vectors)

    # Two panels side by side: a wide one for the heatmap, a narrow one for the image.
    layout = gridspec.GridSpec(1, 2, width_ratios=[4, 1])
    plt.figure(figsize=(25, 25))

    plt.subplot(layout[0])
    heat_ax = sns.heatmap(np.round(word_distances, 3), annot=True)
    # Columns are the recommended title's words, rows the input title's words.
    heat_ax.set_xticklabels(sentence2.split())
    heat_ax.set_yticklabels(sentence1.split())
    heat_ax.set_title(sentence2)

    # Re-apply the tick labels with the desired font size and rotation.
    heat_ax.set_xticklabels(heat_ax.get_xmajorticklabels(), fontsize=12, rotation=90)
    heat_ax.set_yticklabels(heat_ax.get_ymajorticklabels(), fontsize=12, rotation=45)

    # Right panel: the product image, with the axis ticks hidden.
    image_ax = plt.subplot(layout[1])
    image_ax.set_xticks([])
    image_ax.set_yticks([])
    display_img(url)

    # Render the combined figure.
    plt.show()

# Functions for Vectorization :

#### Function for IDF vectorization

In [5]:
#define functions for IDF
def containing(word,df):
    """Return how many titles in ``df['title']`` contain ``word`` as a whitespace-separated token."""
    doc_count = 0
    for title in df['title']:
        if word in title.split():
            doc_count += 1
    return doc_count
def idf(word,df):
    """Return the inverse document frequency of ``word`` over the titles in ``df``.

    The value is ``log(number_of_rows / number_of_titles_containing_word)``.

    Args:
        word: token to score.
        df: object with a ``shape`` attribute (row count first) and a
            ``'title'`` column of strings.

    Returns:
        The IDF as a float; ``0.0`` when no title contains ``word``.
    """
    doc_count = containing(word,df)
    # containing() matches whitespace-split tokens, so a word coming from a
    # differently tokenized vocabulary may match no title at all. Give it no
    # weight instead of dividing by zero.
    if doc_count == 0:
        return 0.0
    return math.log(df.shape[0]/doc_count)

####  Function for Word2Vec vectorization
We perform Word2Vec vectorization in advance and use the stored vectorized array directly in the distance-based similarity recommendation, because Word2Vec vectorization is computationally intensive.

In [6]:
def avg_word_vec(sentence,no_features,id_,model_name,idf_title_vectorizer,idf_title_features):
    """Return the (optionally IDF-weighted) average Word2Vec vector of a title.

    Args:
        sentence: title of the apparel.
        no_features: length of a Word2Vec vector (300 for the callers in this
            file).
        id_: row of this title in ``idf_title_features``.
        model_name: 'avg' sums the plain vector of each word; 'weighted' sums
            each vector multiplied by the word's value in
            ``idf_title_features``.
        idf_title_vectorizer: fitted vectorizer exposing ``vocabulary_``; only
            read for 'weighted' (callers pass 0 for 'avg').
        idf_title_features: matrix indexed as ``[id_, column]``; only read for
            'weighted' (callers pass 0 for 'avg').

    Returns:
        Vector of length ``no_features``: the sum of the selected word vectors
        divided by the total number of words in the title. Words without a
        vector add nothing to the sum but still count in the divisor. An empty
        title yields the zero vector.
    """
    # Running sum of the word vectors, starting from zeros.
    featureVec = np.zeros(shape=(no_features,), dtype="float32")

    words = sentence.split()
    for word in words:
        if word not in vocab:
            continue
        if model_name == 'avg':
            featureVec = np.add(featureVec,modl[word])
        elif model_name == 'weighted' and word in idf_title_vectorizer.vocabulary_:
            column = idf_title_vectorizer.vocabulary_[word]
            featureVec = np.add(featureVec, modl[word] * idf_title_features[id_,column])

    # Divide once, after the whole sum is built. Dividing inside the loop would
    # shrink the earlier words' contribution again on every later iteration.
    if words:
        featureVec = np.divide(featureVec,len(words))

    return featureVec

#### Downloading the google Word2Vec library 

In [None]:
#### Loading Google's pretrained Word2Vec model, used by all Word2Vec-based models
# Requires the pretrained file "GoogleNews-vectors-negative300.bin"
# to be downloaded beforehand and reachable under that relative path.

modl = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# vocab = all the words known to the Word2Vec model.
# key_to_index is a dict, so each `word in vocab` check is a hash lookup;
# index_to_key is a list, which turns every check into a linear scan over
# the whole vocabulary.
vocab = modl.key_to_index

# Vectorization : 

In [1]:
def Vectorization(data,model):
    """Vectorize the titles in ``data['title']`` with the requested method.

    Args:
        data: data set whose ``'title'`` column holds the text.
        model: one of
            'bag_of_words' - CountVectorizer counts;
            'Tfidf'        - TfidfVectorizer weights;
            'idf'          - count matrix whose nonzero entries are replaced
                             by the word's ``idf`` value; also saved to
                             'Pickle/idf_title_features.npz';
            'avg'          - average Word2Vec vector per title; also saved to
                             'Pickle/Word2Vec_aveg.npz';
            'weighted'     - IDF-weighted Word2Vec vector per title, using the
                             matrix previously saved by the 'idf' model; also
                             saved to 'Pickle/Word2Vec_weighted.npz'.

    Returns:
        'bag_of_words', 'Tfidf', 'idf': ``(features, fitted_vectorizer)``.
        'avg': list with one vector per title.
        'weighted': array with one row per title.

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    if model not in ('bag_of_words', 'Tfidf', 'idf', 'avg', 'weighted'):
        raise ValueError(
            "model must be 'bag_of_words', 'Tfidf', 'idf', 'avg' or 'weighted', "
            "got {!r}".format(model)
        )

    if model == 'bag_of_words':
        title_vectorizer = CountVectorizer()
        title_features = title_vectorizer.fit_transform(data['title'])
        return title_features,title_vectorizer

    if model == 'Tfidf':
        tfidf_title_vectorizer = TfidfVectorizer()
        tfidf_title_features = tfidf_title_vectorizer.fit_transform(data['title'])
        return tfidf_title_features,tfidf_title_vectorizer

    if model == 'idf':
        idf_title_vectorizer = CountVectorizer()
        # Sparse matrix of shape (#titles, #words in corpus).
        idf_title_features = idf_title_vectorizer.fit_transform(data['title'])

        # Counts are integers; convert so the float idf values can be stored.
        # (np.float64 rather than the np.float alias, which NumPy 1.24 removed.)
        idf_title_features = idf_title_features.astype(np.float64)

        for word, column in idf_title_vectorizer.vocabulary_.items():
            idf_value = idf(word,data)
            # Overwrite every nonzero entry of this word's column with its idf.
            for row in idf_title_features[:,column].nonzero()[0]:
                idf_title_features[row,column] = idf_value

        scipy.sparse.save_npz('Pickle/idf_title_features.npz', idf_title_features)

        return idf_title_features,idf_title_vectorizer

    if model == 'avg':
        # One averaged vector per title; the idf arguments are unused for 'avg'.
        w2vec_title_features = [
            avg_word_vec(title,300,0,'avg',idf_title_vectorizer=0,idf_title_features=0)
            for title in data['title']
        ]

        # Store as a (#titles, 300) array for later reuse.
        savez_compressed("Pickle/Word2Vec_aveg.npz",np.array(w2vec_title_features))

        return w2vec_title_features

    # model == 'weighted'
    # Reuse the idf matrix saved by the 'idf' model instead of recomputing it.
    # NOTE(review): assumes it was built from the same `data`, in the same row
    # order - confirm before mixing data sets.
    idf_title_features = scipy.sparse.load_npz('Pickle/idf_title_features.npz')

    # Only the fitted vocabulary (word -> column) is needed here.
    idf_title_vectorizer = CountVectorizer()
    idf_title_vectorizer.fit(data['title'])

    w2vec_title_weight = np.array([
        avg_word_vec(title,300,row,'weighted',idf_title_vectorizer=idf_title_vectorizer,idf_title_features=idf_title_features)
        for row, title in enumerate(data['title'])
    ])

    savez_compressed("Pickle/Word2Vec_weighted.npz",w2vec_title_weight)

    return w2vec_title_weight

# Recommendations

In [1]:
def _load_saved_idf(data):
    """Load the saved IDF matrix and fit a CountVectorizer on the titles for its vocabulary."""
    # NOTE(review): assumes the saved matrix was built from this same `data`,
    # so the freshly fitted vocabulary matches its columns - confirm.
    idf_title_features = scipy.sparse.load_npz('Pickle/idf_title_features.npz')
    idf_title_vectorizer = CountVectorizer()
    idf_title_vectorizer.fit(data['title'])
    return idf_title_features, idf_title_vectorizer


def _load_saved_word2vec(path):
    """Read the array stored under 'arr_0' in an .npz archive and close the file."""
    with load(path) as archive:
        return archive['arr_0']


def _closest_rows(features, query, cut_off, metric):
    """Return the positions of the ``cut_off`` rows of ``features`` nearest to ``query``."""
    pairwise_dist = pairwise_distances(features, query, metric=metric)
    # argsort orders positions from the smallest distance upwards.
    return np.argsort(pairwise_dist.flatten())[:cut_off]


def distance_similarity(doc_id,data,model,cut_off):
    """Plot and print the products whose titles are closest to a given product.

    Args:
        doc_id: positional row of the input product in ``data`` (and in the
            vectorized feature matrix).
        data: data set with 'title', 'medium_image_url' and 'asin' columns.
        model: text vectorization method: 'bag_of_words', 'Tfidf' or 'idf'
            (cosine distance), 'avg' or 'weighted' (Euclidean distance on
            Word2Vec vectors). 'idf', 'avg' and 'weighted' read the arrays
            previously saved under 'Pickle/' by ``Vectorization``.
        cut_off: number of recommendations to show.

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    if model not in ('bag_of_words', 'Tfidf', 'idf', 'avg', 'weighted'):
        raise ValueError(
            "model must be 'bag_of_words', 'Tfidf', 'idf', 'avg' or 'weighted', "
            "got {!r}".format(model)
        )

    # 0 is the placeholder the plotting helpers receive for unused arguments.
    tfidf_title_vectorizer = tfidf_title_features = 0
    idf_title_vectorizer = idf_title_features = 0

    if model == 'bag_of_words':
        features, _ = Vectorization(data,model)
    elif model == 'Tfidf':
        tfidf_title_features, tfidf_title_vectorizer = Vectorization(data,model)
        features = tfidf_title_features
    elif model == 'idf':
        # Vectorizing is expensive, so reuse the saved matrix.
        idf_title_features, idf_title_vectorizer = _load_saved_idf(data)
        features = idf_title_features
    elif model == 'avg':
        features = _load_saved_word2vec("Pickle/Word2Vec_aveg.npz")
    else:
        # The idf matrix is only needed to weight the per-word heatmap; the
        # neighbours are ranked on the weighted Word2Vec vectors themselves.
        idf_title_features, idf_title_vectorizer = _load_saved_idf(data)
        features = _load_saved_word2vec("Pickle/Word2Vec_weighted.npz")

    uses_word2vec = model in ('avg', 'weighted')
    if uses_word2vec:
        # Dense vectors: the query must be reshaped into a single-row matrix.
        indices = _closest_rows(features, features[doc_id].reshape(1,-1), cut_off, 'euclidean')
    else:
        indices = _closest_rows(features, features[doc_id], cut_off, 'cosine')

    # The input product is the one at doc_id. Reading it from the nearest
    # neighbour instead would pick another product whenever several rows tie
    # for the smallest distance.
    input_title = data['title'].iloc[doc_id]

    for position in indices:
        title = data['title'].iloc[position]
        url = data['medium_image_url'].iloc[position]
        if uses_word2vec:
            results_Word2Vec(input_title, title, url, doc_id, position, model, idf_title_vectorizer=idf_title_vectorizer, idf_title_features=idf_title_features)
        else:
            results(position, input_title, title, url, model, tfidf_title_vectorizer=tfidf_title_vectorizer, tfidf_title_features=tfidf_title_features, idf_title_vectorizer=idf_title_vectorizer, idf_title_features=idf_title_features)
        print('The amazon ID of the apparel is {}'.format(data['asin'].iloc[position]))