# Chapter 6: Exploring Sentence-, Document- and Character-Level                                                    Embeddings

## Building a Doc2Vec Model

####  1) Importing the common_texts corpus along with the Doc2Vec and TaggedDocument modules

In [1]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#### 2) Checking the training corpus

In [2]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

#### 3) Converting the tokenized documents into TaggedDocument format

In [3]:

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]

documents

[TaggedDocument(words=['human', 'interface', 'computer'], tags=[0]),
 TaggedDocument(words=['survey', 'user', 'computer', 'system', 'response', 'time'], tags=[1]),
 TaggedDocument(words=['eps', 'user', 'interface', 'system'], tags=[2]),
 TaggedDocument(words=['system', 'human', 'system', 'eps'], tags=[3]),
 TaggedDocument(words=['user', 'response', 'time'], tags=[4]),
 TaggedDocument(words=['trees'], tags=[5]),
 TaggedDocument(words=['graph', 'trees'], tags=[6]),
 TaggedDocument(words=['graph', 'minors', 'trees'], tags=[7]),
 TaggedDocument(words=['graph', 'minors', 'survey'], tags=[8])]

#### 4) Building and training a basic Doc2Vec model

In [4]:
#vector_size denotes by how many floating point values will each document will be represented by
#the min_count acts as a parameter to set threshold so that only the terms occuring atleast min_count times 
#will be taken in the vocabulary
model = Doc2Vec(documents, vector_size=5, min_count=1, workers=4, epochs = 40)
model.train(documents, total_examples=model.corpus_count,epochs=model.epochs)

#### 5) Validating the vector size for the document embeddings

In [5]:
model.vector_size

5

#### 6) Checking if the no. of document vectors being built is equal to the no. of documents  used in the training process

In [6]:
len(model.dv)

9

#### 7) Checking the vocabulary and the vocabulary size of the model

In [7]:
#vocab size
len(model.wv.index_to_key)

12

In [8]:
#vocabulary
model.wv.index_to_key

['system',
 'graph',
 'trees',
 'user',
 'minors',
 'eps',
 'time',
 'response',
 'survey',
 'computer',
 'interface',
 'human']

#### 8) Building a document vector for a new sentence:

In [9]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.05171022 -0.05739109  0.0481263   0.01538622  0.04605532]


### Changing vector size and min_count

#### 1) Building our Doc2Vec model

In [10]:
model = Doc2Vec(documents, vector_size=50, min_count=3, epochs=40)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

#### 2) Checking the vocabulary size

In [11]:
len(model.wv.index_to_key)

4

#### 3) Checking the vocabulary 

In [12]:
model.wv.key_to_index

{'system': 0, 'graph': 1, 'trees': 2, 'user': 3}

#### 4) Building a new paragraph vector using the Doc2Vec model

In [13]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.00493748 -0.00547006  0.00542764  0.00204919  0.00450158 -0.00186988
  0.00702156  0.00560433 -0.00585779  0.00422477 -0.00916069 -0.00756148
  0.00802777 -0.00669308 -0.00808496 -0.00552981  0.00827915  0.00071997
 -0.00040661 -0.00897476  0.0060303  -0.00971152  0.00171542 -0.00404753
  0.00748146  0.00549066  0.00172136  0.00109212  0.0071558  -0.0052976
 -0.00870732 -0.00830556  0.0057922   0.00044906  0.00784768 -0.00902211
 -0.00028359  0.00934035 -0.00372939 -0.00687985  0.00696723  0.00586227
 -0.0095457  -0.00774136 -0.00686943 -0.0038848  -0.00944106 -0.00584005
  0.00690779  0.00403548]


### The dm parameter for switching between modeling approaches

#### Building a PV-DM model (dm = 1)

In [14]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, dm=1)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

#### Building a PV-BOW model (dm = 0)

In [15]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, dm=0)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

### The dm_concat parameter (PV - DM)

#### When set to 1, indicates to the algorithm that the context vectors should be concatenated while trying to predict the target word. This leads to building a larger model since multiple word embeddings get concatenated.

In [16]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, alpha=0.3, 
                min_alpha=0.05, dm_concat=1)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

### The dm_mean parameter (PV - DM)

#### When set to 1, the mean of the context word vectors is taken.                                                                                                        The sum of the context word vectors is taken into account when set to 0.

In [17]:
#taking mean of context vectors
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1,
dm_concat=0, dm_mean=1, alpha=0.3, min_alpha=0.05)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [18]:
#taking sum of context vectors
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1,
dm_concat=0, dm_mean=0, alpha=0.3, min_alpha=0.05)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

### Window Size

#### This parameter controls the distance between the word under concentration and the word to be predicted

In [19]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=0)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

### Learning Rate

#### With the min_alpha parameter, we can specify what value the learning rate should drop to over the course of training

In [20]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, alpha=0.3, min_alpha=0.05)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

## Building a fastText Model

#### 1) Importing the necessary libraries and dataset

In [21]:
from gensim.models import FastText
from gensim.test.utils import common_texts

#### 2) Instantiating and training a basic FastText model

In [22]:
model = FastText(vector_size=5, window=3, min_count=1)
model.build_vocab(corpus_iterable=common_texts)
model.train(corpus_iterable=common_texts, total_examples=len(common_texts), epochs=10)

(36, 290)

#### 3) Validating our vocabulary

In [23]:
model.wv.key_to_index

{'system': 0,
 'graph': 1,
 'trees': 2,
 'user': 3,
 'minors': 4,
 'eps': 5,
 'time': 6,
 'response': 7,
 'survey': 8,
 'computer': 9,
 'interface': 10,
 'human': 11}

#### Visualizing the vector of the word "human"

In [24]:
model.wv['human']

array([-0.03166138,  0.0232673 ,  0.01241681,  0.00036033,  0.02841444],
      dtype=float32)

#### 4) Checking for the closest vector

In [25]:
model.wv.most_similar(positive=['computer', 'interface'], negative=['human'])

[('user', 0.7968782186508179),
 ('system', 0.17462214827537537),
 ('response', 0.10433417558670044),
 ('survey', 0.009605271741747856),
 ('trees', -0.0764053612947464),
 ('time', -0.13300469517707825),
 ('minors', -0.1392730176448822),
 ('eps', -0.2409365326166153),
 ('graph', -0.29175299406051636)]

#### 5) min_n and max_n parameters

In [26]:
#sets the minimum (min_n) and maximum (max_n) lengths of the character n-grams to build representations.
#useing a range of 1-gram to 5-grams to build the fastText model
model = FastText(vector_size=5, window=3, min_count=1, min_n=1, max_n=5)
model.build_vocab(corpus_iterable=common_texts)
model.train(corpus_iterable=common_texts, total_examples=len(common_texts),epochs=10)

(36, 290)

#### 6) Building a representation of a word that does not occur in the vocabulary

In [27]:
model.wv['rubber']

array([ 0.01833103, -0.02146882,  0.00600104, -0.03445043, -0.01658661],
      dtype=float32)

#### 7) Using an out-of-vocabulary term in the most_similar function

In [28]:
model.wv.most_similar(positive=['computer', 'human'], negative=['rubber'])

[('trees', 0.795038104057312),
 ('eps', 0.7793108820915222),
 ('minors', 0.24405993521213531),
 ('time', 0.16231966018676758),
 ('user', -0.04820769280195236),
 ('graph', -0.15672095119953156),
 ('survey', -0.20417729020118713),
 ('interface', -0.392148494720459),
 ('response', -0.6897363662719727),
 ('system', -0.8435081243515015)]

#### 8) Extending our model so that it incorporates new sentences and vocabulary

In [29]:
sentences_to_be_added = [["I", "am", "learning", "Natural", "Language","Processing"], 
                         ["Natural", "Language", "Processing", "is", "cool"]]
model.build_vocab(sentences_to_be_added, update=True)
model.train(corpus_iterable=common_texts, total_examples=len(sentences_to_be_added), epochs=10)

(43, 290)

## Building a spelling corrector/word suggestion module using fastText

#### 1) Importing the necessary libraries

In [30]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import FastText
import io
import collections

#### 2) Reading the data into basic data structures

In [46]:
words = []
data = []
with io.open('comment_text.txt', 'r', encoding='utf-8') as file:
    for entry in file:
        entry = entry.strip()
        data.append(entry)
        words.extend(entry.split())

#### 3) Fetching basic information about the data in terms of the most common words in the corpus

In [47]:
unique_words = []
unique_words = collections.Counter(words)
unique_words.most_common(10)

[('the', 445892),
 ('to', 288753),
 ('of', 219279),
 ('and', 207335),
 ('a', 201765),
 ('I', 182618),
 ('is', 164602),
 ('you', 157025),
 ('that', 140495),
 ('in', 130244)]

#### 4) Preprocessing the data using the preprocessing pipeline

In [48]:
#The different process present in the pipeline

#tokenizing
def tokenizer(corpus, keep_list = []):
    cleaned_rows = []
    for row in corpus:
        qs = []
        for word in row.split():
            if word not in keep_list:
                p1 = re.sub('[^a-zA-Z0-9]', ' ', word).lower()
                qs.append(p1)
            else : qs.append(word)
        cleaned_rows.append(' '.join(qs))
    return pd.Series(cleaned_rows)

#removing stopwords
def remove_stops(corpus):
    stop = set(stopwords.words('english'))
    corpus = [[word for word in sentence.split() if word not in stop] for sentence in corpus]
    return corpus

#stemming
def stemmer(corpus, stem_type):
    if stem_type == 'Porter':
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(word) for word in sentence] for sentence in corpus]          

    if stem_type == 'Snowball':
        stemmer = SnowballStemmer(language='english')
        corpus = [' '.join([stemmer.stem(word) for word in sentence]) for sentence in corpus] 

        return corpus
#lemmatization
def lemmatizer(corpus):
    lemmatizer = WordNetLemmatizer()
    corpus = [' '.join([lemmatizer.lemmatize(x, pos = 'v') for x in x]) for x in corpus]
    return corpus

#function to preprocess
def preprocess(corpus, keep_list = [], stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    
    corpus = tokenizer(corpus, keep_list)
    
    if remove_stopwords:
        corpus = remove_stops(corpus)
    
    if stemming:
        corpus = stemmer(corpus, stem_type)
        
    if lemmatization:
        corpus = lemmatizer(corpus)
    
    corpus = [' '.join(x) for x in corpus]     
    
    return corpus  

In [49]:
import pandas as pd
data = preprocess(data)

In [50]:
data

['comment text',
 'explanation',
 'edits made username hardcore metallica fan reverted vandalisms closure gas voted new york dolls fac please remove template talk page since retired 89 205 38 27',
 'aww matches background colour seemingly stuck thanks talk 21 51 january 11 2016 utc',
 'hey man really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info',
 '',
 '',
 'make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etc later one else first preferences formatting style references want please let know',
 '',
 'appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipedia good article nominations transport',
 'sir hero chance remember page',
 '',
 '',
 'congratulations well use tools well talk',
 'cocksucker piss around work',
 'vandalism matt shirvington article reverted plea

#### 5) Modifying  data so that it meets fastText requirements.

In [51]:
preprocessed_data = []
for line in data:
    if line != "":
        preprocessed_data.append(line.split())

#### 6) Initializing our fastText model:

In [53]:
model = FastText(vector_size=300, window=3, min_count=1, min_n=1, max_n=5)

#### 7) Building the vocabulary and checking the size of the vocabulary

In [55]:
#building vocabulary
model.build_vocab(corpus_iterable=preprocessed_data)

In [57]:
#checking size of vocabulary
len(model.wv.key_to_index)

182220

#### 8) Training our model

In [60]:
model.train(corpus_iterable = preprocessed_data, total_examples=len(preprocessed_data), epochs=10)

(55532376, 57208880)

#### 9) checking if the model can predict the correct spelling for the incorrect words as part of the top 5 similar suggestions

In [61]:
#for the word 'explain'
model.wv.most_similar('eplain', topn=5)

[('xplain', 0.9154737591743469),
 ('eexplain', 0.9123345017433167),
 ('plain', 0.9036255478858948),
 ('exlain', 0.8988020420074463),
 ('elain', 0.8983729481697083)]

In [62]:
#for the word 'reminder'
model.wv.most_similar('reminder', topn=5)

[('rejoinder', 0.9143153429031372),
 ('remainder', 0.9123693704605103),
 ('reindeer', 0.9083667993545532),
 ('reminde', 0.9032237529754639),
 ('reminders', 0.9031974673271179)]

In [63]:
#for the word 'relevnt'
model.wv.most_similar('relevnt', topn=5)

[('releveant', 0.9456570744514465),
 ('relent', 0.9408718347549438),
 ('relevanmt', 0.9396296143531799),
 ('relevent', 0.9396237730979919),
 ('relevant', 0.9327782392501831)]

In [64]:
#for the word 'purse'
model.wv.most_similar('purse', topn=5)

[('purpse', 0.9215287566184998),
 ('cpurse', 0.9101640582084656),
 ('pure', 0.89461350440979),
 ('pursue', 0.8822980523109436),
 ('pulse', 0.8715043663978577)]

## fastText and Document Distances

#### 1) Initializing the sentences that we want to find  distances between

In [65]:
sentence_1 = "Obama speaks to the media in Illinois"
sentence_2 = "President greets the press in Chicago"
sentence_3 = "Apple is my favorite company"

#### 2) Computing the distance between the document pairs using WMD

##### WMD between sentence_1 and sentence_2 using fastText-based vectors

In [67]:
word_mover_distance = model.wv.wmdistance(sentence_1, sentence_2)
word_mover_distance

0.4865180245149761

##### WMD between sentence_2 and sentence_3 using fastText-based vectors

In [69]:
word_mover_distance = model.wv.wmdistance(sentence_2, sentence_3)
word_mover_distance

0.6030960327153282