# Importing Libraries and Loading Google's Word2Vec Model

In [1]:
import requests
from PIL import Image
from io import BytesIO
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import gridspec
from collections import Counter
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings("ignore")

###############################################################################################################################

# Load Google's pretrained Word2Vec vectors; they are shared by the Word2Vec-based code below.
# The file "GoogleNews-vectors-negative300.bin" must be downloaded beforehand; the path
# given here is relative, so it is resolved against the current working directory.
modl = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
# vocab = the model's index_to_key attribute, i.e. the words known to the Word2Vec model
vocab = modl.index_to_key



# Result Visualization
#### TF-IDF, Bag of words 

In [None]:
def heatmap_image(keys,values,labels,text):
    """Show a one-row heatmap for the words of a recommended title.

    keys   : words used as the x-axis tick labels
    values : numbers that determine the colour of each cell
    labels : numbers written inside each cell as annotations
    text   : title displayed above the heatmap
    """
    plt.figure(figsize=(30, 10))

    # wrap both sequences in an outer list so the heatmap has exactly one row
    cell_values = np.array([values])
    cell_annotations = np.array([labels])
    axes = sns.heatmap(cell_values, annot=cell_annotations)

    axes.set_title(text, fontsize=18)

    # first put the words on the x axis, then restyle those same tick labels
    axes.set_xticklabels(keys)
    axes.set_xticklabels(axes.get_xmajorticklabels(), fontsize=18, rotation=45)

    # render the figure
    plt.show()


def heatmap_image_plot(doc_id,vec1,vec2,text,model,tfidf_title_vectorizer,tfidf_title_features):
    """Build the heatmap data for one recommended title and plot it.

    doc_id : row index into tfidf_title_features for the recommended title
    vec1   : mapping word -> count for the input title
    vec2   : mapping word -> count for the recommended title (left unmodified)
    text   : title shown above the heatmap
    model  : 'bag_of_words' or 'Tfidf'; selects which numbers annotate the cells
    tfidf_title_vectorizer : fitted vectorizer exposing ``vocabulary_``
                             (only read when model == 'Tfidf')
    tfidf_title_features   : matrix indexed as [doc_id, vocabulary index]
                             (only read when model == 'Tfidf')

    Raises ValueError if ``model`` is not one of the supported names.
    """
    # Fail early with a clear message; previously an unknown model fell through
    # both branches and crashed later because `labels` was never assigned.
    if model not in ('bag_of_words', 'Tfidf'):
        raise ValueError(
            "model must be 'bag_of_words' or 'Tfidf', got %r" % (model,)
        )

    # x-axis words: every word of the recommended title
    keys = list(vec2)

    # A word keeps its count in title2 only if it also occurs in title1,
    # otherwise its value is 0. Computed without mutating the caller's vec2.
    values = [vec2[word] if word in vec1 else 0 for word in keys]

    if model == 'bag_of_words':
        # annotate each cell with the (shared-word) count itself
        labels = values
    else:
        # annotate each cell with the TF-IDF weight of the word in document
        # doc_id; words missing from the vectorizer vocabulary get 0
        vocabulary = tfidf_title_vectorizer.vocabulary_
        labels = [
            tfidf_title_features[doc_id, vocabulary[word]] if word in vocabulary else 0
            for word in keys
        ]

    heatmap_image(keys,values,labels,text)


def text_vector(sentence):
    """Return a Counter mapping each whitespace-separated word of `sentence` to its count."""
    word_counts = Counter()
    word_counts.update(sentence.split())
    return word_counts


def visualization(doc_id,sentence1,sentence2,model,tfidf_title_vectorizer,tfidf_title_features):
    """Plot the word heatmap comparing an input title with a recommended title.

    sentence1 is the input title and sentence2 the recommended one; sentence2
    is also used as the heatmap title. The remaining arguments are passed
    through unchanged to heatmap_image_plot.
    """
    # turn both titles into word -> count mappings
    input_counts = text_vector(sentence1)
    recommended_counts = text_vector(sentence2)

    heatmap_image_plot(
        doc_id,
        input_counts,
        recommended_counts,
        sentence2,
        model,
        tfidf_title_vectorizer,
        tfidf_title_features,
    )

#### Avg Word2Vec 

In [None]:
def get_word_vec(sentence,doc_id):
    """Return one vector per whitespace-separated word of `sentence`.

    sentence : title of a product
    doc_id   : index id in the vectorized array (accepted for interface
               compatibility; not used by this function)

    Words known to the module-level Word2Vec model `modl` are mapped to their
    vector; unknown words are mapped to a zero vector of length 300
    (presumably matching the model's vector size - confirm if another model
    is loaded). The result is a numpy array with one row per word.
    """
    # key_to_index is a dict, so each membership test is O(1); testing against
    # the index_to_key list instead scans the whole vocabulary for every word.
    known_words = modl.key_to_index

    word_vectors = [
        modl[word] if word in known_words else np.zeros(shape=(300,))
        for word in sentence.split()
    ]
    return np.array(word_vectors)

def get_distance(vec1,vec2):
    """Return the matrix of Euclidean distances between the rows of vec1 and vec2.

    vec1 : array with one row per word of title1
    vec2 : array with one row per word of title2

    Entry [i][j] of the result is the norm of (vec1[i] - vec2[j]), so the
    result has one row per row of vec1 and one column per row of vec2.
    """
    distance_rows = [
        np.array([np.linalg.norm(row_a - row_b) for row_b in vec2])
        for row_a in vec1
    ]
    return np.array(distance_rows)

def results_Word2Vec(sentence1,sentence2,doc_id1,doc_id2):
    """Plot a heatmap of word-to-word Euclidean distances between two titles.

    sentence1 : title1, the input product (its words label the y axis)
    sentence2 : title2, the recommended product (its words label the x axis
                and the full title is used as the plot title)
    doc_id1, doc_id2 : ids forwarded to get_word_vec for each title
    """
    # distance matrix: rows follow the words of title1, columns those of title2
    pairwise_dist = get_distance(
        get_word_vec(sentence1, doc_id1),
        get_word_vec(sentence2, doc_id2),
    )

    plt.figure(figsize=(15, 15))

    # annotate every cell with its distance rounded to 3 decimals
    rounded_dist = np.round(pairwise_dist, 3)
    axes = sns.heatmap(rounded_dist, annot=True)

    # recommended title's words along x, input title's words along y
    axes.set_xticklabels(sentence2.split())
    axes.set_yticklabels(sentence1.split())

    # the recommended title doubles as the plot title
    axes.set_title(sentence2)

    # restyle the tick labels that were just assigned
    axes.set_xticklabels(axes.get_xmajorticklabels(), fontsize=12, rotation=90)
    axes.set_yticklabels(axes.get_ymajorticklabels(), fontsize=12, rotation=45)

    # show the finished figure
    plt.show()