In [None]:
'''
Word Embedding
    - Word embedding is a language modeling technique used for mapping words to vectors of real numbers.
    - It represents words or phrases in a vector space with several dimensions.
    - Word embeddings can be generated using various methods such as neural networks, co-occurrence matrices,
      probabilistic models, etc.
'''

In [None]:
'''
What is Word2Vec?
    - Word2Vec consists of models for generating word embeddings.
    - Word2vec is a technique/model that produces word embeddings for better word representation.
    - It captures a large number of precise syntactic and semantic word relationships.

What does word2vec do?
    - Word2vec represents words in a vector space.
    - Words are represented as vectors and are placed in such a way that words with similar meanings appear
      close together and dissimilar words are located far apart. This is also termed a semantic relationship.
    - Neural networks do not understand text; they understand only numbers.

Word embedding provides a way to convert text to a numeric vector.
'''

In [None]:
'''
Why Word2Vec?
    - Word2vec represents words in a vector space. Words are represented as vectors and are placed in such a way
      that words with similar meanings appear close together and dissimilar words are located far apart. This is also
      termed a semantic relationship.
    - Word2vec reconstructs the linguistic context of words.
'''

In [None]:
'''
Word2vec learns the representation of a word by predicting its surrounding context.
'''


In [None]:
'''
Word2vec is not a single algorithm but a combination of two techniques:
    - CBOW (Continuous Bag of Words)
    - Skip-gram model

Both of these are shallow neural networks (one hidden layer) which map word(s) to a target variable that is also a word(s).
Both of these techniques learn weights which act as word vector representations.
Let us discuss both of these methods separately and gain intuition into how they work.
'''

In [None]:
'''
What is FastText?
'''

In [None]:
'''
Gensim: "Generate Similar"
    - Gensim is a free, open-source Python library for representing documents as semantic vectors.
    - Gensim is billed as a Natural Language Processing package that does ‘Topic Modeling for Humans’.
    - The training algorithms in the Gensim package were ported from the original Word2Vec implementation by Google
      and extended with additional functionality.
    - The Gensim toolkit allows users to import Word2vec for topic modeling, to discover hidden structure in a body of text.
    - Gensim provides implementations not only of Word2vec but also of Doc2vec and FastText.
'''
    

In [None]:
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by
# the libraries used below are hidden as well.
warnings.filterwarnings("ignore")

### Create Vector Representation of Words

In [None]:
from gensim.models import Word2Vec

# Define the training data: a single, already tokenized sentence
sentences = [['this','is','first','sentence','of','the','document']]

# Train the model; min_count=1 keeps words that occur only once
model = Word2Vec(sentences, min_count=1)

print(model)
print(list(model.wv.vocab))

# Stack the vectors of all vocabulary words into one matrix.
# Vectors are read through `model.wv`; indexing the model object itself
# (`model[...]`) is a deprecated alias for the same lookup.
X = model.wv[model.wv.vocab]

X.shape  # There are seven words, each with a dimension of 100

### Get Vocabulary List

In [None]:
# Training data: every sentence is written as plain text and split on
# whitespace into its list of tokens.
sentences = [
    text.split()
    for text in (
        'this is the first sentence for word2vec',
        'this is the second sentence',
        'yet another sentence',
        'one more sentence',
        'and the final sentence',
    )
]

# Fit the model, keeping words that appear only once (min_count=1)
model = Word2Vec(sentences, min_count=1)

# Collect the learned vocabulary and show it
words = [word for word in model.wv.vocab]

print(words)

### Get the vector (Word Embedding) for the word 'and' 

In [None]:
# Look the vector up through `model.wv`; `model['and']` is a deprecated
# alias for the same lookup.
print(model.wv['and'])

### Generate BOW and TFIDF

In [3]:
import gensim
import os
from gensim import corpora
from gensim import models

documents = ["The Saudis are preparing a report that will acknowledge that", 
             "Saudi journalist Jamal Khashoggi's death was the result of an", 
             "interrogation that went wrong, one that was intended to lead", 
             "to his abduction from Turkey, according to two sources."]

# Preprocess the dataset: split every document on whitespace into tokens
tokenized  = [doc.split() for doc in documents]

print(tokenized)

# Create a dictionary that maps every distinct token to an integer id
my_dictionary = corpora.Dictionary(tokenized)

print("\nmy_dictionary:",my_dictionary)

# Convert every document to a bag of words: a list of (token id, count) pairs
BOW_corpus = [my_dictionary.doc2bow(doc, allow_update = True) for doc in tokenized]
print("\nBOW_corpus: \n",BOW_corpus)

print("\n")
# Pair every token with its raw count, document by document.
# The corpus is read under the same name it was defined with (`BOW_corpus`);
# the previous spelling `BoW_corpus` only resolved through leftover kernel
# state. The loop variable is `token_id` so the builtin `id` is not shadowed.
import numpy as np
word_weight =[]
for doc in BOW_corpus:
    for token_id, freq in doc:
        word_weight.append([my_dictionary[token_id], freq])

print("BOW:",word_weight)

print("\n")
# Create the TF-IDF model from the bag-of-words corpus
# (smartirs='ntc' selects the weighting scheme; see the gensim TfidfModel docs)
tfIdf = models.TfidfModel(BOW_corpus, smartirs ='ntc')

# Pair every token with its TF-IDF weight, rounded to three decimals
weight_tfidf =[]
for doc in tfIdf[BOW_corpus]:
    for token_id, weight in doc:
        weight_tfidf.append([my_dictionary[token_id], np.around(weight, decimals = 3)])
        
print("weight_tfidf:",weight_tfidf) 

[['The', 'Saudis', 'are', 'preparing', 'a', 'report', 'that', 'will', 'acknowledge', 'that'], ['Saudi', 'journalist', 'Jamal', "Khashoggi's", 'death', 'was', 'the', 'result', 'of', 'an'], ['interrogation', 'that', 'went', 'wrong,', 'one', 'that', 'was', 'intended', 'to', 'lead'], ['to', 'his', 'abduction', 'from', 'Turkey,', 'according', 'to', 'two', 'sources.']]

my_dictionary: Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)

BOW_corpus: 
 [[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1)], [(9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(7, 2), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)], [(23, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)]]


BOW: [['Saudis', 1], ['The', 1], ['a', 1], ['acknowledge', 1], ['are', 1], ['preparing', 1], ['report', 1], ['that', 2], ['will', 1], ['Jamal', 1], ["Khashoggi's", 1], ['Saudi', 1], ['an', 1], ['death'

### Find Similarity between the words

In [None]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import gensim

# Show the small example corpus the model is trained on
print(common_texts)

# Train on the example corpus, then score how similar two of its words are
model = Word2Vec(
    common_texts,
    size=100,
    window=2,
    min_count=1,
    workers=4,
)
model.wv.similarity('minors', 'human')


### Find Most Similar words Top N

In [None]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import gensim

# Train on the example corpus and list the six words closest to 'minors'
model = Word2Vec(
    common_texts,
    size=100,
    window=2,
    min_count=1,
    workers=4,
)
model.wv.most_similar('minors', topn=6)

In [None]:
import nltk
#nltk.download('brown')
#nltk.download('movie_reviews')
#nltk.download('treebank')

import gensim
from nltk.corpus import brown,movie_reviews,treebank

# Train one Word2Vec model per NLTK corpus, using the default settings
b = Word2Vec(brown.sents())
mr = Word2Vec(movie_reviews.sents())
t = Word2Vec(treebank.sents())

corpus_models = (('brown', b), ('movie_reviews', mr), ('treebank', t))

# A notebook cell only displays its last expression, so every result is
# printed explicitly; otherwise all but the final query would be discarded.
# Queries go through `.wv`, like the other cells of this notebook.
# Every query word is looked up in all three corpora so the neighbours can be
# compared side by side (this adds the 'money' lookup for movie_reviews).
for query in ('money', 'great', 'company'):
    for corpus_name, corpus_model in corpus_models:
        print(query, "|", corpus_name, "=>", corpus_model.wv.most_similar(query, topn=5))
    print()

### Find Similarity between the words

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Train a small model whose vocabulary contains both words being compared.
# When the notebook runs top to bottom, the `model` left by the preceding
# cells was trained on `common_texts`, which contains neither 'sentence' nor
# 'word2vec', so looking the words up there fails. This cell therefore trains
# its own model instead of depending on leftover state.
sentence_model = Word2Vec(
    [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
     ['this', 'is', 'the', 'second', 'sentence'],
     ['yet', 'another', 'sentence'],
     ['one', 'more', 'sentence'],
     ['and', 'the', 'final', 'sentence']],
    min_count=1,
)

# Get the vectors of the words to compare (through `.wv`, not the deprecated
# `model[...]` indexing)
w1 = sentence_model.wv['sentence']
w2 = sentence_model.wv['word2vec']

# cosine_similarity expects 2-D inputs, hence the reshape to a single row
cosine_similarity(w1.reshape(1,-1),w2.reshape(1,-1))

### Semantic Relationships: Find the Most Similar Words in Corpus Data

In [None]:
#import nltk
#nltk.download('abc')

import gensim
from nltk.corpus import abc

# Report how many sentences the abc corpus provides
print("Total Sentence:",len(abc.sents()))

# Fit a Word2Vec model on the sentences of the abc corpus
model = gensim.models.Word2Vec(abc.sents())

# Gather the learned vocabulary and report its size
X = [word for word in model.wv.vocab]

print("Total Vocabulary List:",len(X))

# Ask the model for the words most similar to 'science'
data = model.wv.most_similar('science')

print("\n")
print(data)

### Representing the Vector of Words in 2D using PCA

In [None]:
# Visualize the words

# Retrieve the vectors of all vocabulary words from the trained model.
# The word list is built once and used for both the vector lookup and the
# annotations, so row i of `X` always belongs to `words[i]`. Vectors are read
# through `model.wv`; `model[...]` is a deprecated alias for the same lookup.
words = list(model.wv.vocab)
X = model.wv[words]

# Project the vectors onto their first two principal components
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
result = pca.fit_transform(X)

import matplotlib.pyplot as plt

plt.scatter(result[:, 0], result[:, 1])

# Label every point with the word it represents
for i, word in enumerate(words):
    plt.annotate(word, xy=(result[i, 0], result[i, 1]))
plt.show()

### Save and Load Model

In [None]:
from gensim.models import Word2Vec
# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
            ['this', 'is', 'the', 'second', 'sentence'],
            ['yet', 'another', 'sentence'],
            ['one', 'more', 'sentence'],
            ['and', 'the', 'final', 'sentence']]

# train model
model = Word2Vec(sentences, min_count=1)

# summarize the trained model
print(model)

# summarize vocabulary
words = list(model.wv.vocab)
print(words)

# access the vector for one word (through `.wv`; `model['sentence']` is a
# deprecated alias for the same lookup)
print(model.wv['sentence'])

# save model
model.save('model.bin')

# load the model back from disk and summarize it
new_model = Word2Vec.load('model.bin')
print(new_model)

# If you save the model you can continue training it later:
# model.train([["hello", "world"]], total_examples=1, epochs=1)

### Parameter Settings while Creating a Word2Vec Model

In [None]:

model = Word2Vec(
    common_texts,
    size=150,      # size: length of the dense vector that represents each token or word
    window=10,     # window: maximum distance between the target word and a neighbouring word (default 5)
    min_count=2,   # min_count: ignore every word whose frequency is below this value (default 5);
                   #            pass 1 instead to keep all words of the corpus
    workers=10,    # workers: how many threads to use behind the scenes
    iter=10,       # iter: number of iterations (epochs) over the corpus
)

# sg (not passed above): the training algorithm, either CBOW (0) or skip-gram (1).
# The default training algorithm is CBOW.


### Visualize using TSNE

In [None]:
from gensim.models import Word2Vec
sentences = [["cat", "say", "meow"], 
             ["dog", "say", "woof"]]

model = Word2Vec(sentences, min_count=1)

# Create a t-SNE model and plot the projected word vectors

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# One label per vocabulary word and the matching matrix of vectors
# (row i of `tokens` belongs to `labels[i]`). Vectors are read through
# `model.wv`; `model[word]` is a deprecated alias for the same lookup.
labels = list(model.wv.vocab)
tokens = model.wv[labels]

# t-SNE requires the perplexity to be smaller than the number of samples.
# This toy vocabulary has only a handful of words, so the usual value of 40
# is capped at n_samples - 1.
perplexity = min(40, len(labels) - 1)

# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version.
tsne_model = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=2500, random_state=23)
new_values = tsne_model.fit_transform(tokens)

# Split the 2-D embedding into x and y coordinates
x = [point[0] for point in new_values]
y = [point[1] for point in new_values]

plt.figure(figsize=(5, 3)) 

# Plot every word as its own scatter call (so each gets its own colour) and
# label it with the word
for i in range(len(x)):
    plt.scatter(x[i],y[i])
    
    plt.annotate(labels[i], xy=(x[i], y[i]), xytext=(5, 2),textcoords='offset points',ha='right', va='bottom')

plt.show()

### Topic Modeling

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim import models,corpora

# File Name
input_file = "data_topic_modeling.txt"

# Load input file
def load_data(input_file):
    """Read a text file and return its lines as a list of strings.

    Args:
        input_file: Path of the text file to read.

    Returns:
        One string per line of the file, without the trailing newline.
        Empty lines are kept as empty strings.
    """
    # `with` closes the file even if reading fails (the handle used to be
    # left open).
    with open(input_file, 'r') as f:
        # Strip only the newline: slicing off the last character would cut a
        # real character from a final line that has no trailing newline.
        return [line.rstrip('\n') for line in f]

class Preprocessor(object):
    """Turn raw text into a list of lower-cased, stop-word-free, stemmed tokens."""

    def __init__(self):
        # Tokenizer that keeps runs of word characters (regex \w+)
        self.tokenizer = RegexpTokenizer(r'\w+')
        # English stop words from NLTK
        self.stop_words_english = stopwords.words('english')
        # Snowball stemmer for English
        self.stemmer = SnowballStemmer('english')

    def process(self,input_text):
        """Tokenize, filter and stem one document.

        Args:
            input_text: The raw text of a single document.

        Returns:
            The stemmed tokens of the lower-cased text, in their original
            order, with stop words removed.
        """
        tokens = self.tokenizer.tokenize(input_text.lower())

        # Membership tests against a set are constant-time; the stop words
        # are stored as a list, which would be scanned once per token.
        stop_words = set(self.stop_words_english)
        tokens_stopwords = [x for x in tokens if x not in stop_words]

        tokens_stemmed = [self.stemmer.stem(x) for x in tokens_stopwords]

        return tokens_stemmed

    # The `if __name__=='__main__':` block that used to sit here ran inside
    # the class body: it read the input file while the class was being defined
    # and stored the result as class attributes. The notebook loads the data
    # explicitly after the class definition, so that block was removed.
        
# Read the raw documents, one per line of the input file
data = load_data(input_file)

print("Raw Text: \n",data,"\n")

# Run every document through the preprocessor
preprocessor = Preprocessor()
processed_tokens = list(map(preprocessor.process, data))

# Build a dictionary from the processed documents, then encode each document
# as a bag of words
dict_tokens = corpora.Dictionary(processed_tokens)
corpus = list(map(dict_tokens.doc2bow, processed_tokens))

print(corpus)

# Number of topics to learn and number of words to show per topic
num_topics, num_words = 2, 4

# Fit the LDA model on the bag-of-words corpus
ldamodel = models.ldamodel.LdaModel(
    corpus,
    id2word=dict_tokens,
    num_topics=num_topics,
    passes=25,
)

print("\nMost contributing words to the topics")

for topic_id, topic_terms in ldamodel.print_topics(num_topics=num_topics, num_words=num_words):
    print("Topic", topic_id, "==>", topic_terms)

In [None]:
### Distance and Similarity between Words (Movie Review Embeddings)

In [None]:
import pandas as pd
import numpy as np
import string
import nltk

# Work on a reproducible random sample of 1000 reviews
df = pd.read_csv("movie_data.csv",encoding='utf-8').sample(n=1000, random_state=1)

lines = df['review'].values.tolist()


print("Raw data Sample:\n",lines[0])

# Preprocess the data

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Both helpers are the same for every review, so they are built once here
# rather than once per loop iteration.
table = str.maketrans('','',string.punctuation)   # deletes all punctuation characters
stop_words = set(stopwords.words('english'))

review_lines  = list()

for line in lines:
    tokens = word_tokenize(line)

    # Lower-case every token
    tokens = [w.lower() for w in tokens]

    # Remove punctuation characters from every token
    stripped = [w.translate(table) for w in tokens]

    # Keep purely alphabetic tokens only (this also drops tokens that became empty)
    words = [word for word in stripped if word.isalpha()]

    # Drop stop words
    words = [w for w in words if not w in stop_words]

    review_lines.append(words)

print("\n Preprocessed Data Sample:\n",review_lines[0])

import gensim

# Build Model
model = gensim.models.Word2Vec(sentences = review_lines,size =100,window=5,workers =4,min_count =1)

print("Vocabulary:")

# First ten words of the vocabulary
list(model.wv.vocab)[:10]

In [None]:
# Look the vector up through `model.wv`; `model['powerful']` is a deprecated
# alias for the same lookup.
model.wv['powerful']

In [None]:
# Ask the trained model for the words most similar to 'powerful'
model.wv.most_similar('powerful')

In [None]:
# Analogy query: which words relate to 'king' the way 'woman' relates to 'man'?
# The positive word is the singular 'woman' so that it pairs with the singular
# 'man' (the plural 'women' looked like a typo).
model.wv.most_similar_cosmul(positive=['woman','king'],negative=['man'])

In [None]:
# Odd word out: ask the model which of the four words does not belong with
# the others. The query string is split on whitespace into a list of words.
model.wv.doesnt_match('woman king queen movie'.split())

In [None]:
# Save Model
# Only the word vectors (`model.wv`) are written, in word2vec format;
# binary=False is passed, i.e. the non-binary variant of that format.
filename = 'imdb_embedding_word2vec.text'
model.wv.save_word2vec_format(filename,binary=False)