In [1]:
import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import re

In [2]:
from PIL import Image
from io import BytesIO
from nltk.tokenize import RegexpTokenizer
import nltk
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity



In [3]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/09.%20Word%20Embedding/dataset/data.csv", filename="DataSet/Book_Data.csv")

('DataSet/Book_Data.csv', <http.client.HTTPMessage at 0x4c5cf640>)

In [4]:
df = pd.read_csv("DataSet/Book_Data.csv")

In [5]:
print('Count of lists(documents): ',len(df))

Count of lists(documents):  2382


In [6]:
df[:5]

Unnamed: 0.2,Unnamed: 0,Desc,Unnamed: 0.1,author,genre,image_link,rating,title
0,0,We know that power is shifting: From West to E...,0.0,Moisés Naím,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.63,The End of Power: From Boardrooms to Battlefie...
1,1,Following the success of The Accidental Billio...,1.0,Blake J. Harris,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.94,"Console Wars: Sega, Nintendo, and the Battle t..."
2,2,How to tap the power of social software and ne...,2.0,Chris Brogan,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.78,Trust Agents: Using the Web to Build Influence...
3,3,William J. Bernstein is an American financial ...,3.0,William J. Bernstein,Business,https://i.gr-assets.com/images/S/compressed.ph...,4.2,The Four Pillars of Investing
4,4,Amazing book. And I joined Steve Jobs and many...,4.0,Akio Morita,Business,https://i.gr-assets.com/images/S/compressed.ph...,4.05,Made in Japan: Akio Morita and Sony


In [7]:
def _removeNonAscii(s) :
    return "".join(i for i in s if ord(i)<128)

In [8]:
def romove_stop_words(text) :
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    
    return text

In [9]:
def remove_html(text) :
    html_pattern = re.compile("<.*?>")
    
    return html_pattern.sub('', text)

In [10]:
def remove_punctuation(text) :
    tokenizer = RegexpTokenizer(r"[a-zA-Z]+")
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    
    return text

In [11]:
df["cleaned"] = df["Desc"].apply(_removeNonAscii)
df["cleaned"] = df.cleaned.apply(lambda x: x.lower())
df["cleaned"] = df.cleaned.apply(romove_stop_words)
df["cleaned"] = df.cleaned.apply(remove_punctuation)
df["cleaned"] = df.cleaned.apply(remove_html)

In [12]:
df["cleaned"][:5]

0    know power shifting west east north south pres...
1    following success accidental billionaires mone...
2    tap power social software networks build busin...
3    william j bernstein american financial theoris...
4    amazing book joined steve jobs many akio morit...
Name: cleaned, dtype: object

In [13]:
df['cleaned'].replace('', np.nan, inplace=True)
df = df[df["cleaned"].notna()]
print('Count of lists(documents): ',len(df))

Count of lists(documents):  2381


In [14]:
corpus = []
for words in df["cleaned"] :
    corpus.append(words.split())

In [15]:
print(df)
print(corpus)

      Unnamed: 0                                               Desc  \
0              0  We know that power is shifting: From West to E...   
1              1  Following the success of The Accidental Billio...   
2              2  How to tap the power of social software and ne...   
3              3  William J. Bernstein is an American financial ...   
4              4  Amazing book. And I joined Steve Jobs and many...   
...          ...                                                ...   
2377        2446  Ralph Roberts, a sus setenta años y tras la mu...   
2378        2447  Murder at the Vicarage marks the debut of Agat...   
2379        2448  In 1951 John Wyndham published his novel The D...   
2380        2449  This now classic book revealed Flannery O'Conn...   
2381        2450  Imbued on every page with Frank McCourt's asto...   

      Unnamed: 0.1                author        genre  \
0              0.0           Moisés Naím     Business   
1              1.0       Blake J.

In [16]:
word2vec_model = Word2Vec(size = 300, window = 5, min_count = 2, workers = -1) #size = vector_size
word2vec_model.build_vocab(corpus)

In [17]:
word2vec_model.intersect_word2vec_format("F:/Study/DataSet/GoogleNews-vectors-negative300.bin", lockf=1.0, binary=True)
word2vec_model.train(corpus, total_examples = word2vec_model.corpus_count, epochs = 15)

(0, 0)

In [18]:
print(word2vec_model.wv.vocab)



In [19]:
#Average of word vectors
def get_documents_vectors(documents_list) :
    
    for line in documents_list :
        doc2vec = None
        count = 0
        for word in line.split() :
            if word in word2vec_model.wv.vocab :
                count += 1
                
                if doc2vec is None :
                    doc2vec = word2vec_model[word]
                else :
                    doc2vec = doc2vec + word2vec_model[word]
                    
        if doc2vec is not None :
            doc2vec /= count
        document_embedding_list = []
        document_embedding_list.append(doc2vec)
        
    return document_embedding_list

In [20]:
document_embedding_list = get_documents_vectors(df['cleaned'])
print(document_embedding_list[0])

  doc2vec = word2vec_model[word]
  doc2vec = doc2vec + word2vec_model[word]


[ 5.33738509e-02  4.52673137e-02  8.91612098e-03  9.59747285e-02
  1.56769007e-02  2.64864508e-02  7.58657232e-02 -4.48932350e-02
  1.04721569e-01  7.52510801e-02 -1.68741513e-02 -1.31406143e-01
 -5.09279408e-03  5.99772222e-02 -7.06116185e-02  1.09244548e-01
  4.69241738e-02  1.34196103e-01 -3.30014410e-03 -2.86728330e-02
  1.62470771e-03  3.04221604e-02  7.72684589e-02 -3.33557650e-02
  1.45250477e-03 -4.51104194e-02 -7.01279044e-02  9.03293192e-02
  5.02147116e-02 -8.63790698e-03 -6.24744631e-02 -3.61977965e-02
 -5.13292477e-02  3.85703482e-02 -2.57432908e-02  9.69721843e-03
  3.62556130e-02 -6.32482907e-03 -1.62065756e-02  7.01200515e-02
  8.38074833e-02 -3.53125706e-02  7.22916052e-02  2.16780659e-02
  3.24960761e-02 -1.77222788e-02 -3.45019810e-02  2.35615242e-02
 -1.55555671e-02  1.26214074e-02 -5.07415179e-03  3.64729278e-02
  1.70805387e-03  1.39397755e-02  3.16393077e-02 -1.47359101e-02
 -9.01161581e-02 -9.04032886e-02  3.99341388e-03 -7.83768371e-02
 -3.36917420e-03  7.65903

In [21]:
print("Length of document vector: ",len(document_embedding_list))

Length of document vector:  1
