# Importing libraries

# In [1]:
# Importing the necessary packages
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
import scipy.sparse
from numpy import savez_compressed
from numpy import load
import warnings
warnings.filterwarnings("ignore")
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import math
from ipynb.fs.full.Visualization import *

################################################################################################################################

# Load Google's pretrained word2vec embeddings; every word2vec-based model below reads them via `modl`.
# The file "GoogleNews-vectors-negative300.bin" must be downloaded beforehand; it is opened by relative path.
modl = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
# vocab = the keys (words) of the loaded word2vec model, taken from its index_to_key attribute
vocab = modl.index_to_key

# Functions for Vectorization :

# In [95]:
#define additional functions needed for IDF vectorization
def containing(word,df):
    """Count the titles in ``df['title']`` that contain ``word``.

    A title matches when ``word`` is exactly equal to one of its
    whitespace-separated tokens (the comparison is case-sensitive).
    """
    matching_titles = 0
    for title in df['title']:
        tokens = title.split()
        if word in tokens:
            matching_titles += 1
    return matching_titles
def idf(word,df):
    """Return the inverse document frequency of ``word`` over ``df``.

    Computed as ``log(number of rows in df / number of titles containing word)``
    using the natural logarithm. Raises ``ZeroDivisionError`` when no title
    contains ``word``.
    """
    total_documents = df.shape[0]
    documents_with_word = containing(word, df)
    ratio = total_documents / documents_with_word
    return math.log(ratio)

################################################################################################################################
#define additional functions needed for avg and weighted Word2Vec vectorization
#Function for Word2Vec vectorization
#perform Word2Vec vectorization in advance to use the vectorized array directly in distance based similarity recommendation
#as performing Word2Vec vectorization each time is computationally intensive compared to Bag of words and idf based vectorization.

def avg_word_vec(sentence,no_features,id_=0,model_name='avg',idf_title_vectorizer=0,idf_title_features=0):
    """Return the averaged (optionally idf-weighted) word2vec vector of a title.

    Parameters
    ----------
    sentence : str
        Title of the apparel; it is split on whitespace.
    no_features : int
        Length of the word2vec vectors (300 for the Google News model).
    id_ : int
        Row of ``idf_title_features`` belonging to this title; only read
        when ``model_name == 'weighted'``.
    model_name : str
        'avg' adds ``modl[word]`` for every known word; 'weighted' adds
        ``idf_title_features[id_, column_of(word)] * modl[word]`` for every
        known word that is also in ``idf_title_vectorizer.vocabulary_``.
    idf_title_vectorizer : object
        Fitted vectorizer exposing ``vocabulary_`` (pass 0 for 'avg').
    idf_title_features : matrix
        idf-weighted title matrix indexable as ``[row, column]``
        (pass 0 for 'avg').

    Returns
    -------
    numpy.ndarray
        Vector of length ``no_features``: the accumulated sum divided by the
        total number of words in ``sentence`` (words missing from the model
        still count towards the divisor).  All zeros for an empty sentence.
    """
    featureVec = np.zeros(shape=(no_features,), dtype="float32")

    words = sentence.split()
    for word in words:
        # Membership is tested on the embedding container itself: this is a
        # hash lookup, unlike scanning the multi-million-entry `vocab` list.
        if word not in modl:
            continue
        if model_name == 'avg':
            featureVec = np.add(featureVec, modl[word])
        elif model_name == 'weighted' and word in idf_title_vectorizer.vocabulary_:
            weight = idf_title_features[id_, idf_title_vectorizer.vocabulary_[word]]
            featureVec = np.add(featureVec, modl[word] * weight)

    # Divide exactly once, after the whole sum is accumulated; dividing inside
    # the loop would repeatedly shrink the contribution of earlier words.
    if words:
        featureVec = np.divide(featureVec, len(words))

    return featureVec

# Class Results 

# In [99]:
class results():
    """Title-based apparel recommender built on several text vectorizations.

    Supported ``model`` values are 'bag_of_words', 'Tfidf', 'idf', 'avg' and
    'weighted'.  ``data`` is indexed with the columns 'title',
    'medium_image_url' and 'asin'.
    """

    def __init__(self,doc_id,model,data,cut_off):
        """Store the query and the recommendation settings.

        doc_id  : positional row of the query product in the vectorized titles
        model   : name of the vectorization method to use
        data    : data set holding the text to be modelled
        cut_off : number of recommendations required
        """
        self.doc_id = doc_id
        self.model = model
        self.data = data
        self.cut_off = cut_off

    def _fitted_count_vectorizer(self):
        """Return a CountVectorizer fitted on the titles (maps words to columns)."""
        vectorizer = CountVectorizer()
        vectorizer.fit(self.data['title'])
        return vectorizer

    def _nearest(self, features, metric):
        """Return (row positions, data index labels) of the ``cut_off`` closest titles.

        Distances are measured from row ``doc_id`` of ``features`` to every row.
        """
        query = features[self.doc_id]
        if not scipy.sparse.issparse(query):
            # a dense row is 1-D; pairwise_distances expects a 2-D query
            query = np.asarray(query).reshape(1, -1)
        pairwise_dist = pairwise_distances(features, query, metric=metric)

        # np.argsort returns the positions of the smallest distances first
        indices = np.argsort(pairwise_dist.flatten())[:self.cut_off]

        # translate positions into labels of the original dataframe index
        data_indices = list(self.data.index[indices])
        return indices, data_indices

    def _print_asin(self, label):
        """Print the Amazon ID stored in the 'asin' column for ``label``."""
        print('The amazon ID of the apparel is {}'.format(self.data['asin'].loc[label]))

    def Vectorization(self):
        """Vectorize ``self.data['title']`` with the method named by ``self.model``.

        Returns
        -------
        'bag_of_words', 'Tfidf', 'idf' : tuple ``(features, fitted vectorizer)``
        'avg', 'weighted'              : array with one 300-long row per title
        any other model name           : None

        Side effects: 'idf' writes 'Pickle/idf_title_features.npz', 'avg'
        writes 'Pickle/Word2Vec_aveg.npz' and 'weighted' writes
        'Pickle/Word2Vec_weighted.npz'; 'weighted' first reads
        'Pickle/idf_title_features.npz'.
        """
        if self.model == 'bag_of_words':
            #Vectorization using Bag of words
            title_vectorizer = CountVectorizer()
            title_features = title_vectorizer.fit_transform(self.data['title'])
            return title_features,title_vectorizer

        elif self.model == 'Tfidf':
            #Vectorization using tfidfVectorizer
            tfidf_title_vectorizer = TfidfVectorizer()
            tfidf_title_features = tfidf_title_vectorizer.fit_transform(self.data['title'])
            return tfidf_title_features,tfidf_title_vectorizer

        elif self.model == 'idf':
            #Vectorization using the idf function
            idf_title_vectorizer = CountVectorizer()
            idf_title_features = idf_title_vectorizer.fit_transform(self.data['title'])

            #converting all the values into float (np.float was removed from NumPy)
            idf_title_features = idf_title_features.astype(np.float64)

            #replace every nonzero count of a word by the idf value of that word
            for word, column in idf_title_vectorizer.vocabulary_.items():
                idf_value = idf(word, self.data)
                for row in idf_title_features[:, column].nonzero()[0]:
                    idf_title_features[row, column] = idf_value

            scipy.sparse.save_npz('Pickle/idf_title_features.npz', idf_title_features)

            return idf_title_features,idf_title_vectorizer

        elif self.model == 'avg':
            #building one averaged word2vec vector per title;
            #all arguments are passed explicitly, the idf inputs are unused for 'avg'
            Word2Vec_features = np.array([
                avg_word_vec(title, 300, 0, 'avg', 0, 0)
                for title in self.data['title']
            ])

            #saving the array in a npz file
            savez_compressed("Pickle/Word2Vec_aveg.npz",Word2Vec_features)

            return Word2Vec_features

        elif self.model == 'weighted':
            #Load the saved idf vectorized sparse array .npz
            idf_title_features = scipy.sparse.load_npz('Pickle/idf_title_features.npz')

            #the fitted count vectorizer maps each word to its column
            idf_title_vectorizer = self._fitted_count_vectorizer()

            #building one idf-weighted word2vec vector per title; `row` is the
            #position of the title, i.e. its row in idf_title_features
            w2vec_title_weight = np.array([
                avg_word_vec(title, 300, row, 'weighted', idf_title_vectorizer = idf_title_vectorizer, idf_title_features = idf_title_features)
                for row, title in enumerate(self.data['title'])
            ])

            #saving the array in a npz file
            savez_compressed("Pickle/Word2Vec_weighted.npz",w2vec_title_weight)

            return w2vec_title_weight

    def distance_similarity(self):
        """Display the ``cut_off`` titles closest to title ``doc_id``.

        'bag_of_words', 'Tfidf' and 'idf' rank by cosine distance; 'avg' and
        'weighted' rank by euclidean distance.  'idf', 'avg' and 'weighted'
        read their features from the .npz files under 'Pickle/' instead of
        re-vectorizing.  Each match is handed to ``visualization`` or
        ``results_Word2Vec`` and its 'asin' value is printed.  Returns None;
        an unrecognised model name does nothing.
        """
        if self.model == 'bag_of_words':
            title_features, _ = self.Vectorization()
            indices, data_indices = self._nearest(title_features, 'cosine')

            for position, label in zip(indices, data_indices):
                visualization(position, self.data['title'].loc[data_indices[0]], self.data['title'].loc[label], self.data['medium_image_url'].loc[label], 'bag_of_words',tfidf_title_vectorizer = 0,tfidf_title_features = 0, idf_title_vectorizer = 0,idf_title_features = 0)
                self._print_asin(label)

        elif self.model == 'Tfidf':
            tfidf_title_features,tfidf_title_vectorizer = self.Vectorization()
            indices, data_indices = self._nearest(tfidf_title_features, 'cosine')

            for position, label in zip(indices, data_indices):
                visualization(position, self.data['title'].loc[data_indices[0]], self.data['title'].loc[label], self.data['medium_image_url'].loc[label], 'Tfidf',tfidf_title_vectorizer,tfidf_title_features ,idf_title_vectorizer=0,idf_title_features=0)
                self._print_asin(label)

        elif self.model == 'idf':
            #load the saved idf matrix rather than recomputing it every time
            idf_title_features = scipy.sparse.load_npz('Pickle/idf_title_features.npz')
            idf_title_features = idf_title_features.toarray()

            #the fitted count vectorizer maps each word to its column
            idf_title_vectorizer = self._fitted_count_vectorizer()

            indices, data_indices = self._nearest(idf_title_features, 'cosine')

            for position, label in zip(indices, data_indices):
                visualization(position, self.data['title'].loc[data_indices[0]], self.data['title'].loc[label], self.data['medium_image_url'].loc[label], 'idf', tfidf_title_vectorizer=0, tfidf_title_features=0, idf_title_vectorizer = idf_title_vectorizer, idf_title_features = idf_title_features)
                self._print_asin(label)

        elif self.model == 'avg':
            #load the stored averaged word2vec array; the archive is closed on exit
            with load("Pickle/Word2Vec_aveg.npz") as archive:
                Word2Vec_features = archive['arr_0']

            indices, data_indices = self._nearest(Word2Vec_features, 'euclidean')

            for position, label in zip(indices, data_indices):
                results_Word2Vec(self.data['title'].loc[data_indices[0]], self.data['title'].loc[label], self.data['medium_image_url'].loc[label], indices[0], position,'avg',idf_title_vectorizer = 0,idf_title_features = 0)
                self._print_asin(label)

        elif self.model == 'weighted':
            #load the saved idf matrix; it is forwarded to results_Word2Vec
            idf_title_features = scipy.sparse.load_npz('Pickle/idf_title_features.npz')

            #the fitted count vectorizer maps each word to its column
            idf_title_vectorizer = self._fitted_count_vectorizer()

            #load the stored idf-weighted word2vec array; the archive is closed on exit
            with load("Pickle/Word2Vec_weighted.npz") as archive:
                Word2Vec_feature = archive['arr_0']

            indices, data_indices = self._nearest(Word2Vec_feature, 'euclidean')

            for position, label in zip(indices, data_indices):
                results_Word2Vec(self.data['title'].loc[data_indices[0]], self.data['title'].loc[label], self.data['medium_image_url'].loc[label], indices[0], position,'weighted',idf_title_vectorizer,idf_title_features)
                self._print_asin(label)
    
    