# Importing Libraries and Storing Google's Word2Vec Library

In [2]:
#Importing the necessary packages
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
import scipy.sparse
from numpy import savez_compressed
from numpy import load
import warnings
warnings.filterwarnings("ignore")
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import math
from ipynb.fs.full.Visualization import *

################################################################################################################################

#Pretrained Google News Word2Vec embeddings, shared by every word2vec based model below.
#The binary file "GoogleNews-vectors-negative300.bin" has to be downloaded beforehand;
#the relative path means it is looked up in the current working directory.
modl = KeyedVectors.load_word2vec_format(
    fname='GoogleNews-vectors-negative300.bin',
    binary=True,
)
#vocab = all the words (keys) known to the loaded model, as exposed by index_to_key
vocab = modl.index_to_key

# Result Class:

In [3]:
class results():
    """Distance-based movie recommender bound to one query document.

    An instance stores the query row (``doc_id``), the text representation
    to use (``model``) and how many nearest rows to report (``cut_off``).

    Supported ``model`` values:
        'bag_of_words' -- CountVectorizer term counts, cosine distance.
        'Tfidf'        -- TfidfVectorizer weights, cosine distance.
        'avg'          -- averaged Word2Vec vectors, compared with the
                          default metric of ``pairwise_distances``
                          (Euclidean).

    Any other value makes ``Vectorization`` and ``distance_similarity``
    return None without doing anything.

    The class relies on ``visualization``, ``results_Word2Vec`` and
    ``avg_word_vec`` being available at module level; they are not defined
    in the class itself (presumably supplied by the Visualization notebook
    import -- TODO confirm).
    """

    def __init__(self, doc_id, model, cut_off):
        """Store the query settings and load both pickled data frames.

        doc_id  : positional row number of the query movie; it is used to
                  pick the query row out of the vectorized feature matrix.
        model   : name of the vectorization method (see class docstring).
        cut_off : number of nearest rows to report. The closest row is
                  normally the query movie itself, because its distance to
                  itself is zero.
        """
        #the movie we want recommendations for
        self.doc_id = doc_id

        #the vectorization method to be used
        self.model = model

        #preprocessed data: supplies the 'text' and 'title' columns
        self.data = pd.read_pickle('Pickle/preprocessed_data')

        #original data: supplies the 'description' and 'listed_in' columns shown in the results
        self.df = pd.read_pickle('Pickle/original_data')

        #the number of recommendations we require
        self.cut_off = cut_off

    def Vectorization(self):
        """Vectorize ``self.data['text']`` with the configured model.

        Returns
            'bag_of_words' : (feature matrix, fitted CountVectorizer)
            'Tfidf'        : (feature matrix, fitted TfidfVectorizer)
            'avg'          : numpy array with one averaged Word2Vec vector
                             per row; the array is also written to
                             "Pickle/Word2Vec_avg.npz" so it can be reused
                             without recomputing it.
            anything else  : None
        """
        if self.model == 'bag_of_words':
            #Vectorization using Bag of words
            title_vectorizer = CountVectorizer()
            title_features = title_vectorizer.fit_transform(self.data['text'])
            return title_features, title_vectorizer

        if self.model == 'Tfidf':
            #Vectorization using TfidfVectorizer
            tfidf_title_vectorizer = TfidfVectorizer()
            tfidf_title_features = tfidf_title_vectorizer.fit_transform(self.data['text'])
            return tfidf_title_features, tfidf_title_vectorizer

        if self.model == 'avg':
            #one vector per text, stacked into a (number of rows x 300) array
            Word2Vec_features = np.array(
                [avg_word_vec(text, 300) for text in self.data['text']]
            )

            #saving the array in a npz file
            savez_compressed("Pickle/Word2Vec_avg.npz", Word2Vec_features)

            return Word2Vec_features

        #unknown model name: nothing to vectorize
        return None

    def _nearest(self, features, query, **distance_kwargs):
        """Return (row positions, data index labels) of the cut_off rows closest to query."""
        #distance between the query row and every row of the feature matrix
        pairwise_dist = pairwise_distances(features, query, **distance_kwargs)

        #np.argsort returns the row positions ordered from smallest to largest distance
        indices = np.argsort(pairwise_dist.flatten())[:self.cut_off]

        #translate row positions into the index labels of the preprocessed data frame
        return indices, list(self.data.index[indices])

    def _print_movie(self, label, description_format, genre_format):
        """Print title, description and genres of the movie stored under index label."""
        print('The Netflix movie title is {}\n\n'.format(self.data['title'].loc[label]))
        print(description_format.format(self.df['description'].loc[label]))
        print(genre_format.format(self.df['listed_in'].loc[label]))

    def distance_similarity(self):
        """Print and visualize the ``cut_off`` movies closest to the query.

        For 'bag_of_words' and 'Tfidf' the texts are vectorized on the fly
        through ``Vectorization``. For 'avg' the vectors are read from
        "Pickle/Word2Vec_avg.npz", so ``Vectorization`` must have been run
        with model 'avg' at least once before.

        Returns None. An unknown model name is a silent no-op.
        """
        if self.model == 'bag_of_words':
            title_features, _ = self.Vectorization()

            indices, data_indices = self._nearest(
                title_features, title_features[self.doc_id], metric='cosine'
            )
            if not data_indices:
                #cut_off of 0 (or empty data): nothing to show
                return

            #text of the closest row, used as the reference side of every comparison
            #NOTE(review): this is the nearest neighbour, not necessarily row doc_id
            #when several rows are at distance 0 -- confirm this is intended
            query_text = self.data['text'].loc[data_indices[0]]

            for position, label in zip(indices, data_indices):
                self._print_movie(label,
                                  'The movie description is: \n{}\n\n',
                                  'The movie is listed under:\n{}\n\n')
                visualization(position, query_text, self.data['text'].loc[label],
                              'bag_of_words', tfidf_title_vectorizer=0, tfidf_title_features=0)

        elif self.model == 'Tfidf':
            #feature matrix and fitted vectorizer are both needed by the visualization
            tfidf_title_features, tfidf_title_vectorizer = self.Vectorization()

            indices, data_indices = self._nearest(
                tfidf_title_features, tfidf_title_features[self.doc_id], metric='cosine'
            )
            if not data_indices:
                return

            query_text = self.data['text'].loc[data_indices[0]]

            for position, label in zip(indices, data_indices):
                #this branch draws the visualization before printing the details
                visualization(position, query_text, self.data['text'].loc[label],
                              'Tfidf', tfidf_title_vectorizer, tfidf_title_features)
                self._print_movie(label,
                                  'The movie description is: \n{}\n\n',
                                  'The movie is listed under: \n{}\n\n')

        elif self.model == 'avg':
            #vectorizing every time is computationally expensive, so the stored array is loaded;
            #the with-block closes the .npz archive once the array has been read
            with load("Pickle/Word2Vec_avg.npz") as archive:
                Word2Vec_features = archive['arr_0']

            #no metric is passed here, so pairwise_distances uses its default
            indices, data_indices = self._nearest(
                Word2Vec_features, Word2Vec_features[self.doc_id].reshape(1, -1)
            )
            if not data_indices:
                return

            query_text = self.data['text'].loc[data_indices[0]]

            for position, label in zip(indices, data_indices):
                self._print_movie(label,
                                  'The movie description : \n{}\n\n',
                                  'The movie is listed under: \n{}\n\n')
                results_Word2Vec(query_text, self.data['text'].loc[label],
                                 indices[0], position)


# Additional function needed for Word2Vec Vectorization

In [None]:
#Function for AVG Word2Vec vectorization, called by results.Vectorization() when model == 'avg'.
#Word2Vec vectorization is computationally intensive compared to bag of words and tfidf
#vectorization, so do not vectorize every time: run it once in advance and reuse the stored
#array "Pickle/Word2Vec_avg.npz" for the distance based similarity recommendation.
#Defining the function is cheap; only calling it over the whole corpus is expensive.

def avg_word_vec(sentence, no_features, model=None):
    """Return the average word vector of a whitespace-separated sentence.

    sentence    : text to vectorize; it is split on whitespace.
    no_features : length of the word vectors and of the returned vector
                  (300 for the model loaded in this notebook).
    model       : mapping from word to vector that supports ``word in model``
                  and ``model[word]``. Defaults to the module-level ``modl``.

    Words missing from the model add nothing to the sum but still count
    towards the divisor, i.e. they are treated as zero vectors. An empty
    sentence yields the all-zero vector.
    """
    if model is None:
        #fall back to the pretrained embeddings loaded at the top of the notebook
        model = modl

    #start from a zero vector and add the vector of every known word to it
    feature_vec = np.zeros(shape=(no_features,), dtype="float32")

    words = sentence.split()
    for word in words:
        #membership is tested on the model itself instead of scanning the vocab list
        if word in model:
            feature_vec = np.add(feature_vec, model[word])

    #divide once, after the whole sentence has been summed, to obtain the average
    if words:
        feature_vec = np.divide(feature_vec, len(words))

    #return avg vec
    return feature_vec