In [25]:
import math
import warnings
from collections import Counter
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from matplotlib import gridspec
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
warnings.filterwarnings("ignore")


# Result visualization for IDF, TF-IDF, and Bag of Words vectorization

In [26]:
# Utility functions for visualizing results
def display_img(url, timeout=10):
    """Download the image at ``url`` and draw it on the current matplotlib axes.

    Parameters
    ----------
    url : str
        Address of the product image to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout ``requests.get`` may block indefinitely.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, cannot be completed, or the server
        answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Stop on 4xx/5xx here rather than handing an error page to PIL,
    # which would fail with an unrelated image-decoding error.
    response.raise_for_status()
    img = Image.open(BytesIO(response.content))
    plt.imshow(img)

def heatmap_image(keys,values,labels,url,text):
    """Draw a one-row word heatmap with the product image beside it.

    keys   : words used as the x tick labels of the heatmap
    values : one number per word; determines the colour of each cell
    labels : one number per word; written inside each cell as annotation
    url    : address of the product image drawn in the right-hand panel
    text   : title placed above the heatmap
    """
    # wide, short canvas; the grid below splits it 4:1 (heatmap : image)
    plt.figure(figsize=(25,3))
    layout = gridspec.GridSpec(1,2,width_ratios = [4,1])

    # left panel: single-row heatmap coloured by `values`, annotated with `labels`
    plt.subplot(layout[0])
    heat_ax = sns.heatmap(np.array([values]),annot=np.array([labels]))
    heat_ax.set_xticklabels(keys)
    heat_ax.set_title(text)

    # right panel: the product image, with axis ticks hidden
    image_ax = plt.subplot(layout[1])
    image_ax.set_xticks([])
    image_ax.set_yticks([])
    display_img(url)

    # render both panels as one figure
    plt.show()

def heatmap_image_plot(doc_id,vec1,vec2,url,text,model):
    """Plot the words of a recommended title as a heatmap beside its image.

    Parameters
    ----------
    doc_id : int
        Row of the recommended title in the fitted feature matrix; only
        used to look up annotation values for 'Tfidf' and 'idf'.
    vec1 : mapping of word -> count for the input title
    vec2 : mapping of word -> count for the recommended title
        (not modified by this function)
    url : str
        Address of the recommended product's image.
    text : str
        Title shown above the heatmap.
    model : {'bag_of_words', 'Tfidf', 'idf'}
        Chooses which numbers annotate the heatmap cells.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.

    Notes
    -----
    The 'Tfidf' branch reads ``tfidf_title_vectorizer`` /
    ``tfidf_title_features`` and the 'idf' branch reads
    ``idf_title_vectorizer`` / ``idf_title_features`` from module scope.
    NOTE(review): these must be defined elsewhere in the notebook before
    those models are used - confirm.
    """
    # Reject unknown models up front; otherwise `labels` would never be
    # assigned and the failure would surface as a NameError further down.
    if model not in ('bag_of_words', 'Tfidf', 'idf'):
        raise ValueError(
            "model must be 'bag_of_words', 'Tfidf' or 'idf', got {!r}".format(model)
        )

    common_words = set(vec1.keys()) & set(vec2.keys())

    # One heatmap cell per word of the recommended title.
    keys = list(vec2.keys())

    # Cell value: the word's count in title 2 if it also occurs in title 1,
    # otherwise 0. Built as a new list so the caller's vec2 is left intact.
    values = [vec2[word] if word in common_words else 0 for word in keys]

    if model == 'bag_of_words':
        labels = values
    else:
        if model == 'Tfidf':
            vectorizer, features = tfidf_title_vectorizer, tfidf_title_features
        else:
            vectorizer, features = idf_title_vectorizer, idf_title_features
        vocabulary = vectorizer.vocabulary_
        # Words missing from the fitted vocabulary are annotated with 0.
        labels = [
            features[doc_id, vocabulary[word]] if word in vocabulary else 0
            for word in keys
        ]

    heatmap_image(keys,values,labels,url,text)
                     
                     
def text_vector(sentence):
    """Count how often each whitespace-separated token occurs in ``sentence``.

    Returns a ``collections.Counter`` mapping token -> number of occurrences.
    """
    counts = Counter()
    for token in sentence.split():
        counts[token] += 1
    return counts


def results(doc_id,sentence1,sentence2,url,model):
    """Show the word heatmap and image for one recommended product.

    doc_id    : row of the recommended title in the feature matrix
    sentence1 : title of the input product
    sentence2 : title of the recommended product (also used as plot title)
    url       : address of the recommended product's image
    model     : vectorization name forwarded to heatmap_image_plot
    """
    # Turn both titles into word-count vectors and hand them to the plotter.
    heatmap_image_plot(
        doc_id,
        text_vector(sentence1),
        text_vector(sentence2),
        url,
        sentence2,
        model,
    )

# Result visualization for Avg Word2Vec and Weighted Word2Vec Vectorization:

In [27]:
# Utility functions to better visualize and understand results

def get_word_vec(sentence,doc_id,model_name,dim=300):
    """Build one vector per word of ``sentence``.

    Parameters
    ----------
    sentence : str
        Product title; split on whitespace into words.
    doc_id : int
        Row of this title in ``idf_title_features`` (used only for 'weighted').
    model_name : {'avg', 'weighted'}
        'avg' uses the raw word vector ``model[word]``; 'weighted' multiplies
        it by the word's entry in ``idf_title_features`` for this document.
    dim : int, optional
        Length of the zero vector used for words without a vector
        (default 300). NOTE(review): should equal the length of the vectors
        stored in ``model`` - confirm.

    Returns
    -------
    numpy.ndarray
        Array built from exactly one vector per word of ``sentence``, in
        order, so its rows line up with ``sentence.split()``.

    Raises
    ------
    ValueError
        If ``model_name`` is not 'avg' or 'weighted'.

    Notes
    -----
    Reads ``vocab``, ``model``, ``idf_title_vectorizer`` and
    ``idf_title_features`` from module scope.
    """
    if model_name not in ('avg', 'weighted'):
        raise ValueError(
            "model_name must be 'avg' or 'weighted', got {!r}".format(model_name)
        )

    vec = []
    for word in sentence.split():
        if word not in vocab:
            # No word vector available: zero placeholder.
            vec.append(np.zeros(shape=(dim,)))
        elif model_name == 'avg':
            vec.append(model[word])
        elif word in idf_title_vectorizer.vocabulary_:
            weight = idf_title_features[doc_id, idf_title_vectorizer.vocabulary_[word]]
            vec.append(weight * model[word])
        else:
            # 'weighted' but the word has no idf entry. Previously nothing was
            # appended, leaving fewer rows than words and misaligning the
            # heatmap labels; use a zero placeholder to keep rows aligned.
            vec.append(np.zeros(shape=(dim,)))
    return np.array(vec)
def get_distance(vec1,vec2):
    """Return the matrix of Euclidean distances between rows of two arrays.

    vec1 : array whose rows are the word vectors of title 1
    vec2 : array whose rows are the word vectors of title 2

    Entry [r, c] of the result is the L2 norm of (vec1[r] - vec2[c]).
    """
    return np.array([
        np.array([np.linalg.norm(row_a - row_b) for row_b in vec2])
        for row_a in vec1
    ])
def results_Word2Vec(sentence1,sentence2,url,doc_id1,doc_id2,model_name):
    """Plot word-to-word distances between two titles next to the product image.

    sentence1  : title of the input product (heatmap rows)
    sentence2  : title of the recommended product (heatmap columns, plot title)
    url        : address of the recommended product's image
    doc_id1    : document id passed to get_word_vec for sentence1
    doc_id2    : document id passed to get_word_vec for sentence2
    model_name : forwarded to get_word_vec (e.g. 'avg')
    """
    # Matrix with one row per word of title 1 and one column per word of
    # title 2, holding the Euclidean distance between their word vectors.
    distances = get_distance(
        get_word_vec(sentence1,doc_id1,model_name),
        get_word_vec(sentence2,doc_id2,model_name),
    )

    # Canvas split 4:1 - heatmap on the left, product image on the right.
    plt.figure(figsize=(35,25))
    layout = gridspec.GridSpec(1,2,width_ratios=[4,1])

    plt.subplot(layout[0])
    # np.round with no decimals argument rounds the distances to whole numbers.
    heat_ax = sns.heatmap(np.round(distances), annot = True)
    # columns are labelled with the recommended title's words,
    # rows with the input title's words
    heat_ax.set_xticklabels(sentence2.split())
    heat_ax.set_yticklabels(sentence1.split())
    heat_ax.set_title(sentence2)

    # re-apply the tick labels with the desired font size and rotation
    heat_ax.set_xticklabels(heat_ax.get_xmajorticklabels(), fontsize = 12,rotation=90)
    heat_ax.set_yticklabels(heat_ax.get_ymajorticklabels(), fontsize = 12,rotation=45)

    image_ax = plt.subplot(layout[1])
    image_ax.set_xticks([])
    image_ax.set_yticks([])
    display_img(url)

    # render both panels as one figure
    plt.show()

# Vectorization

In [28]:
#define functions for IDF
def containing(word):
    """Return how many titles in ``data['title']`` contain ``word``.

    A title counts when ``word`` equals one of its whitespace-separated
    tokens. ``data`` is read from module scope.
    """
    doc_count = 0
    for title in data['title']:
        if word in title.split():
            doc_count += 1
    return doc_count
def idf(word):
    """Return the inverse document frequency of ``word``.

    Computed as ``log(N / n)`` where ``N`` is ``data.shape[0]`` and ``n`` is
    the number of titles containing ``word`` (see ``containing``). ``data``
    is read from module scope.

    Raises
    ------
    ZeroDivisionError
        If no title contains ``word``. The exception type is the one the
        bare division raised before; only the message is new.
    """
    doc_count = containing(word)
    if doc_count == 0:
        raise ZeroDivisionError(
            "cannot compute idf for {!r}: no title contains it as a "
            "whitespace-separated token".format(word)
        )
    return math.log(data.shape[0] / doc_count)

In [38]:
def Vectorization(data,model):
    """Vectorize a collection of titles with the requested model.

    Parameters
    ----------
    data : iterable of str
        The titles to vectorize (e.g. ``data['title']``).
    model : {'bag_of_words', 'Tfidf', 'idf'}
        'bag_of_words' -> ``CountVectorizer`` counts;
        'Tfidf'        -> ``TfidfVectorizer`` weights;
        'idf'          -> count matrix whose non-zero entries are replaced
                          by ``idf(word)`` for the word of that column.

    Returns
    -------
    Sparse matrix with one row per title and one column per vocabulary word.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.

    Notes
    -----
    The fitted vectorizers are local to this function and are not returned.
    ``idf()`` reads the module-level ``data`` frame, not this function's
    ``data`` argument.
    """
    if model == 'bag_of_words':
        title_vectorizer = CountVectorizer()
        return title_vectorizer.fit_transform(data)

    if model == 'Tfidf':
        tfidf_title_vectorizer = TfidfVectorizer()
        return tfidf_title_vectorizer.fit_transform(data)

    # The original compared the undefined name `mode`, so this branch
    # raised NameError whenever it was reached.
    if model == 'idf':
        idf_title_vectorizer = CountVectorizer()
        # shape: number of titles x number of words in the corpus
        idf_title_features = idf_title_vectorizer.fit_transform(data)

        # astype returns a new matrix, so the result has to be kept;
        # otherwise the matrix keeps its count dtype and cannot hold
        # fractional idf values. np.float64 replaces the removed np.float alias.
        idf_title_features = idf_title_features.astype(np.float64)

        for word, column in idf_title_vectorizer.vocabulary_.items():
            idf_value = idf(word)

            # rows (titles) in which this word occurs
            for row in idf_title_features[:, column].nonzero()[0]:
                idf_title_features[row, column] = idf_value

        return idf_title_features

    raise ValueError(
        "model must be 'bag_of_words', 'Tfidf' or 'idf', got {!r}".format(model)
    )

In [40]:
def distance_similarity(doc_id,data,model,cut_off):
    """Show the ``cut_off`` products whose titles are closest to one product.

    Parameters
    ----------
    doc_id : int
        Positional row of the query product in ``data`` (and therefore in
        the feature matrix built from ``data['title']``).
    data : pandas.DataFrame
        Must provide the columns 'title', 'medium_image_url' and 'asin'.
    model : {'bag_of_words', 'Tfidf', 'idf'}
        Vectorization used to compute the distances.
    cut_off : int
        Number of nearest products to display.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    # The original test `model == 'bag_of_words' or 'Tfidf' or 'idf'` was
    # always truthy, so unsupported models were never rejected.
    if model not in ('bag_of_words', 'Tfidf', 'idf'):
        raise ValueError(
            "model must be 'bag_of_words', 'Tfidf' or 'idf', got {!r}".format(model)
        )

    title_features = Vectorization(data['title'],model)

    # distance from every product to the query product
    pairwise_dist = pairwise_distances(title_features,title_features[doc_id])

    # positions of the cut_off smallest distances
    indices = np.argsort(pairwise_dist.flatten())[:cut_off]

    # matching labels in the original dataframe index
    data_indices = list(data.index[indices])

    # Take the query title from doc_id itself rather than from the nearest
    # hit, which is only the query when no other row ties at distance 0.
    query_title = data['title'].iloc[doc_id]

    for position, label in zip(indices, data_indices):
        # NOTE(review): the heatmap annotation model is fixed to
        # 'bag_of_words' regardless of `model`. Passing `model` through would
        # make heatmap_image_plot read module-level tfidf/idf vectorizers that
        # Vectorization does not expose - confirm intent before changing.
        results(position, query_title, data['title'].loc[label], data['medium_image_url'].loc[label], 'bag_of_words')
        print('The amazon ID of the apparel is {}'.format(data['asin'].loc[label]))
