In [2]:
# Widen the notebook container to 80% of the browser window.
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container {width:80% !important;}</style>"))

In [3]:
raw_txt = """Welcome to the world of Deep Learning for NLP! We're in this together, and we'll learn together. 
NLP is amazing, and Deep Learning makes it even more fun. Let's learn!"""

### Tokenization

In [4]:
from nltk import tokenize

In [5]:
tokenize.sent_tokenize(raw_txt)

['Welcome to the world of Deep Learning for NLP!',
 "We're in this together, and we'll learn together.",
 'NLP is amazing, and Deep Learning makes it even more fun.',
 "Let's learn!"]

In [6]:
txt_sents = tokenize.sent_tokenize(raw_txt)

In [7]:
txt_sents

['Welcome to the world of Deep Learning for NLP!',
 "We're in this together, and we'll learn together.",
 'NLP is amazing, and Deep Learning makes it even more fun.',
 "Let's learn!"]

In [8]:
type(txt_sents), len(txt_sents)

(list, 4)

In [9]:
txt_words = [tokenize.word_tokenize(sent) for sent in txt_sents]
type(txt_words), type(txt_words[0])

(list, list)

In [10]:
print(txt_words[:2])

[['Welcome', 'to', 'the', 'world', 'of', 'Deep', 'Learning', 'for', 'NLP', '!'], ['We', "'re", 'in', 'this', 'together', ',', 'and', 'we', "'ll", 'learn', 'together', '.']]


### Normalizing case

In [11]:
txt_sents = [sent.lower() for sent in txt_sents]
txt_sents

['welcome to the world of deep learning for nlp!',
 "we're in this together, and we'll learn together.",
 'nlp is amazing, and deep learning makes it even more fun.',
 "let's learn!"]

In [12]:
txt_words = [tokenize.word_tokenize(sent) for sent in txt_sents]

In [13]:
print(txt_words[:2])

[['welcome', 'to', 'the', 'world', 'of', 'deep', 'learning', 'for', 'nlp', '!'], ['we', "'re", 'in', 'this', 'together', ',', 'and', 'we', "'ll", 'learn', 'together', '.']]


### Removing punctuation

In [14]:
from string import punctuation

In [15]:
list_punct = list(punctuation)
print(list_punct)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


Defining function to remove a list of terms from a given list of tokens

In [16]:
def drop_punct(input_tokens, terms_to_drop=None):
    """Return the tokens of ``input_tokens`` that are not in a drop list.

    Parameters
    ----------
    input_tokens : iterable
        Tokens of one sentence, in order.
    terms_to_drop : container, optional
        Terms to remove. When omitted, the notebook-level ``list_punct``
        is used, which keeps the original one-argument behaviour.

    Returns
    -------
    list
        The surviving tokens, in their original order.
    """
    if terms_to_drop is None:
        # Fall back to the punctuation list defined earlier in the notebook.
        terms_to_drop = list_punct
    return [token for token in input_tokens if token not in terms_to_drop]

Let's test it out on some dummy tokens 

In [17]:
drop_punct(["let",".","us",".","go","!"])

['let', 'us', 'go']

Applying this to txt_words using list comprehension

In [18]:
txt_words_nopunct = [drop_punct(sent) for sent in txt_words]
print(txt_words_nopunct)

[['welcome', 'to', 'the', 'world', 'of', 'deep', 'learning', 'for', 'nlp'], ['we', "'re", 'in', 'this', 'together', 'and', 'we', "'ll", 'learn', 'together'], ['nlp', 'is', 'amazing', 'and', 'deep', 'learning', 'makes', 'it', 'even', 'more', 'fun'], ['let', "'s", 'learn']]


### Removing stop words

In [19]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahim.baig\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
from nltk.corpus import stopwords
list_stop = stopwords.words("english")
len(list_stop)

179

In [21]:
print(list_stop[:50])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be']


### Exercise 4.01

In [22]:
from nltk import tokenize

In [23]:
raw_txt = """Welcome to the world of Deep Learning for NLP! We're in this together, and we'll learn together. NLP is amazing, and Deep Learning makes it even more fun. Let's learn!"""

In [24]:
raw_txt

"Welcome to the world of Deep Learning for NLP! We're in this together, and we'll learn together. NLP is amazing, and Deep Learning makes it even more fun. Let's learn!"

In [25]:
from string import punctuation
from nltk.corpus import stopwords

# Lower-case the raw text, split it into sentences, then tokenize each sentence.
txt_sents = tokenize.sent_tokenize(raw_txt.lower())
txt_words = list(map(tokenize.word_tokenize, txt_sents))

# Terms to remove: every punctuation character plus NLTK's English stop words.
stop_punct = list(punctuation)
stop_nltk = stopwords.words("english")
stop_final = stop_punct + stop_nltk

In [26]:
def drop_stop(input_tokens):
    """Return the tokens of ``input_tokens`` that are absent from ``stop_final``.

    ``stop_final`` is read from the notebook namespace at call time; the
    order of the surviving tokens is preserved.
    """
    kept = []
    for tok in input_tokens:
        if tok in stop_final:
            continue
        kept.append(tok)
    return kept

In [27]:
txt_words_nostop = [drop_stop(sent) for sent in txt_words]
print(txt_words_nostop)

[['welcome', 'world', 'deep', 'learning', 'nlp'], ["'re", 'together', "'ll", 'learn', 'together'], ['nlp', 'amazing', 'deep', 'learning', 'makes', 'even', 'fun'], ['let', "'s", 'learn']]


In [28]:
print(txt_words_nostop[0])

['welcome', 'world', 'deep', 'learning', 'nlp']


### Stemming

In [29]:
from nltk.stem import PorterStemmer

In [30]:
stemmer_p = PorterStemmer()

In [31]:
print(stemmer_p.stem("driving"))

drive


In [32]:
txt = "I mustered all my drive, drove to the driving school!"

In [33]:
tokens = tokenize.word_tokenize(txt)
print([stemmer_p.stem(word) for word in tokens])

['I', 'muster', 'all', 'my', 'drive', ',', 'drove', 'to', 'the', 'drive', 'school', '!']


### Lemmatization

In [34]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rahim.baig\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [35]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

In [36]:
lemmatizer.lemmatize("ponies")

'pony'

### Exercise 4.02

In [37]:
txt_words_nostop

[['welcome', 'world', 'deep', 'learning', 'nlp'],
 ["'re", 'together', "'ll", 'learn', 'together'],
 ['nlp', 'amazing', 'deep', 'learning', 'makes', 'even', 'fun'],
 ['let', "'s", 'learn']]

In [38]:
from nltk.stem import PorterStemmer

In [39]:
stemmer_p = PorterStemmer()

In [40]:
print([stemmer_p.stem(token) for token in txt_words_nostop[0]])

['welcom', 'world', 'deep', 'learn', 'nlp']


Applying stemmer to all the sentences

In [41]:
txt_words_stem = [[stemmer_p.stem(token) for token in sent] for sent in txt_words_nostop]

In [42]:
txt_words_stem

[['welcom', 'world', 'deep', 'learn', 'nlp'],
 ["'re", 'togeth', "'ll", 'learn', 'togeth'],
 ['nlp', 'amaz', 'deep', 'learn', 'make', 'even', 'fun'],
 ['let', "'s", 'learn']]

## Representation
### 1. One hot encoding

Defining function to return an indicator for each term in a list

In [43]:
txt_words_nostop

[['welcome', 'world', 'deep', 'learning', 'nlp'],
 ["'re", 'together', "'ll", 'learn', 'together'],
 ['nlp', 'amazing', 'deep', 'learning', 'makes', 'even', 'fun'],
 ['let', "'s", 'learn']]

In [44]:
target_terms = ["nlp","deep","learn"]

In [45]:
target_terms

['nlp', 'deep', 'learn']

In [46]:
def get_onehot(sent, terms=None):
    """Return a 0/1 indicator for each vocabulary term's presence in ``sent``.

    Parameters
    ----------
    sent : container
        The tokenized sentence; membership is tested with ``term in sent``.
    terms : iterable, optional
        Vocabulary terms, one output position per term. When omitted, the
        notebook-level ``target_terms`` is used, which keeps the original
        one-argument behaviour.

    Returns
    -------
    list of int
        ``1`` where the term occurs in ``sent``, otherwise ``0``, in the
        order of ``terms``.
    """
    if terms is None:
        # Fall back to the vocabulary defined earlier in the notebook.
        terms = target_terms
    return [1 if term in sent else 0 for term in terms]

In [47]:
one_hot_mat = [get_onehot(sent) for sent in txt_words_nostop]
one_hot_mat

[[1, 1, 0], [0, 0, 1], [1, 1, 0], [0, 0, 1]]

In [48]:
import numpy as np

In [49]:
np.array(one_hot_mat)

array([[1, 1, 0],
       [0, 0, 1],
       [1, 1, 0],
       [0, 0, 1]])

### DTM - frequencies

In [50]:
from sklearn.feature_extraction.text import CountVectorizer

In [51]:
?CountVectorizer

In [52]:
vectorizer = CountVectorizer(max_features = 5)

In [53]:
vectorizer.fit(txt_sents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=5, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [54]:
vectorizer.vocabulary_

{'deep': 1, 'we': 4, 'together': 3, 'and': 0, 'learn': 2}

In [55]:
# The vectorizer was already fitted on txt_sents above, so only transform here
# instead of refitting the same vocabulary a second time.
txt_dtm = vectorizer.transform(txt_sents)

In [56]:
txt_dtm.toarray()

array([[0, 1, 0, 0, 0],
       [1, 0, 1, 2, 2],
       [1, 1, 0, 0, 0],
       [0, 0, 1, 0, 0]], dtype=int64)

In [57]:
txt_sents

['welcome to the world of deep learning for nlp!',
 "we're in this together, and we'll learn together.",
 'nlp is amazing, and deep learning makes it even more fun.',
 "let's learn!"]

In [58]:
vectorizer.vocabulary_

{'deep': 1, 'we': 4, 'together': 3, 'and': 0, 'learn': 2}

#### Using tokenized data

In [59]:
def do_nothing(doc):
    """Return ``doc`` unchanged (identity function).

    Passed below as both the ``preprocessor`` and the ``tokenizer`` of
    ``CountVectorizer`` so that already-tokenized input is used as is.
    """
    return doc

In [60]:
# The input is already tokenized, so the preprocessing and tokenizing steps
# are replaced by the identity function. `token_pattern` is only used by the
# built-in tokenizer; setting it to None makes that explicit and avoids the
# "token_pattern will not be used" warning raised when a tokenizer is supplied.
vectorizer = CountVectorizer(max_features=5, 
                             preprocessor=do_nothing, 
                             tokenizer=do_nothing,
                             token_pattern=None)

In [61]:
txt_dtm = vectorizer.fit_transform(txt_words_stem)

In [62]:
txt_dtm.toarray()

array([[0, 1, 1, 1, 0],
       [1, 0, 1, 0, 2],
       [0, 1, 1, 1, 0],
       [0, 0, 1, 0, 0]], dtype=int64)

In [63]:
vectorizer.vocabulary_

{'deep': 1, 'learn': 2, 'nlp': 3, 'togeth': 4, "'ll": 0}

In [64]:
txt_words_stem

[['welcom', 'world', 'deep', 'learn', 'nlp'],
 ["'re", 'togeth', "'ll", 'learn', 'togeth'],
 ['nlp', 'amaz', 'deep', 'learn', 'make', 'even', 'fun'],
 ['let', "'s", 'learn']]

### DTM - TfIdf

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [66]:
vectorizer_tfidf = TfidfVectorizer(max_features=5)

In [67]:
vectorizer_tfidf.fit(txt_sents)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=5,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [68]:
txt_tfidf = vectorizer_tfidf.transform(txt_sents)

In [69]:
vectorizer_tfidf.vocabulary_

{'deep': 1, 'we': 4, 'together': 3, 'and': 0, 'learn': 2}

In [70]:
txt_tfidf.toarray()

array([[0.        , 1.        , 0.        , 0.        , 0.        ],
       [0.25932364, 0.        , 0.25932364, 0.65783832, 0.65783832],
       [0.70710678, 0.70710678, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        , 0.        , 0.        ]])

In [71]:
vectorizer_tfidf.idf_

array([1.51082562, 1.51082562, 1.51082562, 1.91629073, 1.91629073])

In [72]:
vectorizer_tfidf.vocabulary_

{'deep': 1, 'we': 4, 'together': 3, 'and': 0, 'learn': 2}

In [73]:
txt_sents

['welcome to the world of deep learning for nlp!',
 "we're in this together, and we'll learn together.",
 'nlp is amazing, and deep learning makes it even more fun.',
 "let's learn!"]

## Word vectors

In [74]:
import gensim.downloader as api
from gensim.models import word2vec



In [73]:
dataset = api.load("text8")



ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

To help make the results reproducible, set the random seed to 1

In [80]:
# Seed NumPy's global random number generator with 1.
# NOTE(review): this only seeds NumPy. gensim's Word2Vec takes its own `seed`
# argument (and is typically run with a single worker for deterministic
# training) — confirm whether this call affects the trained vectors at all.
np.random.seed(1)

In [76]:
dataset = word2vec.Text8Corpus("text8")

In [81]:
model = word2vec.Word2Vec(dataset)

In [86]:
print(model.wv["animal"])

[-0.3615665  -2.5716257  -1.0519856  -1.5762082  -0.9570891  -3.1906178
 -0.7861069  -0.48413685  1.7349727   2.2464645  -0.4026107  -0.6483433
 -1.9722131  -0.16166513 -1.0183017  -1.1679416  -0.7861819   0.45336705
  0.76064676 -0.8505751   1.5522306  -1.3687619  -1.6749185   0.3126203
  0.84065163 -0.9116391  -1.3681434   2.0405495   1.1260889  -1.5723826
  1.3440136  -1.466848   -1.0229982   0.01272812 -0.1691964   0.78042233
  0.5028901   1.23014     0.48865002 -0.56637776  0.8005535   0.04091305
 -1.3809719   0.271689   -0.9696653  -0.811498    2.194633   -2.5096622
  0.78150535  1.9948106   0.46824625 -2.0970435  -1.5637726   0.01466019
  0.83421725 -0.5807966   1.5796514   1.5275073   0.11196185  2.2932727
 -0.82510984  0.6072437   1.0526761  -0.59575313  0.31435594 -0.94245505
 -0.06109677  0.5846898  -0.34168693 -0.8463029   1.2432699  -0.6183097
  0.11606545 -0.3883502  -1.5406483   1.3583179  -1.0493107   0.21840787
  1.2335353   1.0343055  -1.1972727   1.2301157  -1.139904

In [87]:
len(model.wv["animal"])

100

In [88]:
model.wv.most_similar("animal")

[('insect', 0.7598186135292053),
 ('animals', 0.729228138923645),
 ('aquatic', 0.6679497957229614),
 ('insects', 0.6522265672683716),
 ('organism', 0.6486647725105286),
 ('mammal', 0.6478426456451416),
 ('eating', 0.6435647010803223),
 ('ants', 0.6415578722953796),
 ('humans', 0.6414449214935303),
 ('feces', 0.6313734650611877)]

In [89]:
model.wv.most_similar("happiness")

[('humanity', 0.7819231748580933),
 ('perfection', 0.7699881792068481),
 ('pleasure', 0.7422512769699097),
 ('righteousness', 0.7402842044830322),
 ('desires', 0.7374188899993896),
 ('dignity', 0.7189303040504456),
 ('goodness', 0.7103697657585144),
 ('fear', 0.7047020196914673),
 ('mankind', 0.7046756744384766),
 ('salvation', 0.6990150213241577)]

#### Semantic regularities in word embeddings

In [129]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('queen', 0.6803990602493286),
 ('empress', 0.6331825852394104),
 ('princess', 0.6145625114440918),
 ('throne', 0.6131302714347839),
 ('emperor', 0.6064509153366089)]

In [133]:
model.wv.most_similar(positive=['uncle', 'woman'], negative=['man'], topn=5)

[('aunt', 0.8145735263824463),
 ('grandmother', 0.8067640066146851),
 ('niece', 0.7993890643119812),
 ('wife', 0.7965766787528992),
 ('widow', 0.7914236187934875)]

In [144]:
model.wv.most_similar(positive=['paris', 'germany'], negative=['france'], topn=5)

[('berlin', 0.7987214922904968),
 ('vienna', 0.7287827730178833),
 ('munich', 0.7265500426292419),
 ('leipzig', 0.6589639186859131),
 ('frankfurt', 0.6196205615997314)]

### Phrase vectors from term vectors

In [108]:
v1 = model.wv['get']
v2 = model.wv['happy']
res1 = (v1+v2)/2

In [115]:
v1 = model.wv['make']
v2 = model.wv['merry']
res2 = (v1+v2)/2

In [116]:
model.wv.cosine_similarities(res1, [res2])

array([0.5798107], dtype=float32)

### Effect of parameters - Vector dimension

In [180]:
%%time
model = word2vec.Word2Vec(dataset, size=30)

Wall time: 54.5 s


In [181]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('emperor', 0.8314059972763062),
 ('empress', 0.8250986933708191),
 ('son', 0.8157491683959961),
 ('prince', 0.8060941696166992),
 ('archbishop', 0.8003251552581787)]

In [191]:
%%time
model = word2vec.Word2Vec(dataset, size=100)

Wall time: 1min 4s


### Using skipgram method

In [117]:
%%time
model_sg = word2vec.Word2Vec(dataset, sg=1)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Wall time: 6min 31s


#### Rare terms - oeuvre

In [126]:
model_sg.wv.most_similar("oeuvre", topn=5)

[('masterful', 0.8347533345222473),
 ('orchestration', 0.8149941563606262),
 ('mussorgsky', 0.8116796016693115),
 ('showcasing', 0.8080146312713623),
 ('lithographs', 0.805435299873352)]

In [122]:
model.wv.most_similar("oeuvre", topn=5)

[('baglione', 0.7203884124755859),
 ('chateaubriand', 0.7119786143302917),
 ('kurosawa', 0.6956337690353394),
 ('swinburne', 0.6926312446594238),
 ('poetess', 0.6910216808319092)]

### Effect of training data

In [196]:
from nltk.corpus import brown, movie_reviews

In [200]:
%%time
model_brown = word2vec.Word2Vec(brown.sents(), sg=1)
model_movie = word2vec.Word2Vec(movie_reviews.sents(), sg=1)

Wall time: 40 s


In [201]:
model_brown.wv.most_similar('money', topn=5)

[('job', 0.8477444648742676),
 ('care', 0.8424298763275146),
 ('friendship', 0.8394286632537842),
 ('risk', 0.8268661499023438),
 ('permission', 0.8243911862373352)]

In [202]:
model_movie.wv.most_similar('money', topn=5)

[('cash', 0.7299771904945374),
 ('ransom', 0.7130625247955322),
 ('record', 0.7028014063835144),
 ('risk', 0.6977001428604126),
 ('paid', 0.6940697431564331)]

### Using pre-trained GloVe embeddings

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.w2vformat.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

In [203]:
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("glove.6B.100d.w2vformat.txt", binary=False)

In [207]:
glove_model.most_similar("money", topn=5)

[('funds', 0.8508071899414062),
 ('cash', 0.848483681678772),
 ('fund', 0.7594833374023438),
 ('paying', 0.7415367364883423),
 ('pay', 0.740767240524292)]

In [206]:
glove_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('queen', 0.7698541283607483),
 ('monarch', 0.6843380928039551),
 ('throne', 0.6755737066268921),
 ('daughter', 0.6594556570053101),
 ('princess', 0.6520533561706543)]

### Bias in word embeddings

In [82]:
model.wv.most_similar(positive=['woman', 'doctor'], negative=['man'], topn=5)

[('nurse', 0.6464251279830933),
 ('child', 0.5847542881965637),
 ('teacher', 0.569127082824707),
 ('detective', 0.5451491475105286),
 ('bachelor', 0.521796464920044)]

In [83]:
model.wv.most_similar(positive=['woman', 'smart'], negative=['man'], topn=5)

[('pet', 0.5676326751708984),
 ('lingerie', 0.5665134191513062),
 ('dominatrix', 0.5625378489494324),
 ('daisy', 0.5616745352745056),
 ('boomer', 0.5598588585853577)]

### Activity 4.01

In [58]:
import nltk

In [10]:
# Download only the Gutenberg corpus, which the next cell reads.
# Calling nltk.download() with no argument opens the interactive downloader
# and blocks an unattended "Restart & Run All".
nltk.download("gutenberg")

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [59]:
alice_raw = nltk.corpus.gutenberg.raw('carroll-alice.txt')

In [60]:
alice_raw[:800]

"[Alice's Adventures in Wonderland by Lewis Carroll 1865]\n\nCHAPTER I. Down the Rabbit-Hole\n\nAlice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into the\nbook her sister was reading, but it had no pictures or conversations in\nit, 'and what is the use of a book,' thought Alice 'without pictures or\nconversation?'\n\nSo she was considering in her own mind (as well as she could, for the\nhot day made her feel very sleepy and stupid), whether the pleasure\nof making a daisy-chain would be worth the trouble of getting up and\npicking the daisies, when suddenly a White Rabbit with pink eyes ran\nclose by her.\n\nThere was nothing so VERY remarkable in that; nor did Alice think it so\nVERY much out of the way to hear the Rabbit"

#### Solution

1. Lower case and separate into sentences

In [61]:
txt_sents = tokenize.sent_tokenize(alice_raw.lower())

2. Tokenize the sentences

In [62]:
txt_words = [tokenize.word_tokenize(sent) for sent in txt_sents]

3. Import punctuation from the string module and stop words from NLTK

In [63]:
from string import punctuation
stop_punct = list(punctuation)

from nltk.corpus import stopwords
stop_nltk = stopwords.words("english")

4. Create a variable holding the contextual stop words - "--" and "said"

In [64]:
stop_context = ["--", "said"]

5. Create a master list for stop words to remove, containing terms from punctuation, NLTK stop words and contextual stop words.

In [65]:
stop_final = stop_punct + stop_nltk + stop_context

6. Define a function to drop these tokens from any input sentence (tokenized)

In [66]:
def drop_stop(input_tokens):
    """Filter one tokenized sentence against the master stop list.

    Keeps, in order, every token that is not a member of ``stop_final``;
    ``stop_final`` is looked up in the notebook namespace when called.
    """
    def is_kept(token):
        return token not in stop_final

    return list(filter(is_kept, input_tokens))

7. Remove from the tokenized text

In [67]:
alice_words_nostop = [drop_stop(sent) for sent in txt_words]
print(alice_words_nostop[:2])

[['alice', "'s", 'adventures', 'wonderland', 'lewis', 'carroll', '1865', 'chapter', 'i.', 'rabbit-hole', 'alice', 'beginning', 'get', 'tired', 'sitting', 'sister', 'bank', 'nothing', 'twice', 'peeped', 'book', 'sister', 'reading', 'pictures', 'conversations', "'and", 'use', 'book', 'thought', 'alice', "'without", 'pictures', 'conversation'], ['considering', 'mind', 'well', 'could', 'hot', 'day', 'made', 'feel', 'sleepy', 'stupid', 'whether', 'pleasure', 'making', 'daisy-chain', 'would', 'worth', 'trouble', 'getting', 'picking', 'daisies', 'suddenly', 'white', 'rabbit', 'pink', 'eyes', 'ran', 'close']]


8. Using the Porter stemmer from NLTK, perform stemming on the result.  
Print out the first 5 sentences of the result.

In [68]:
from nltk.stem import PorterStemmer
stemmer_p = PorterStemmer()

In [69]:
alice_words_stem = [[stemmer_p.stem(token) for token in sent] for sent in alice_words_nostop]

In [70]:
print(alice_words_stem[:5])

[['alic', "'s", 'adventur', 'wonderland', 'lewi', 'carrol', '1865', 'chapter', 'i.', 'rabbit-hol', 'alic', 'begin', 'get', 'tire', 'sit', 'sister', 'bank', 'noth', 'twice', 'peep', 'book', 'sister', 'read', 'pictur', 'convers', "'and", 'use', 'book', 'thought', 'alic', "'without", 'pictur', 'convers'], ['consid', 'mind', 'well', 'could', 'hot', 'day', 'made', 'feel', 'sleepi', 'stupid', 'whether', 'pleasur', 'make', 'daisy-chain', 'would', 'worth', 'troubl', 'get', 'pick', 'daisi', 'suddenli', 'white', 'rabbit', 'pink', 'eye', 'ran', 'close'], ['noth', 'remark', 'alic', 'think', 'much', 'way', 'hear', 'rabbit', 'say', "'oh", 'dear'], ['oh', 'dear'], ['shall', 'late']]


### Activity 4.02
#### Solution

1. From activity 4.01, print the first 3 sentences from the result after stop word removal. This is the data you will work with.

In [71]:
print(alice_words_nostop[:3])

[['alice', "'s", 'adventures', 'wonderland', 'lewis', 'carroll', '1865', 'chapter', 'i.', 'rabbit-hole', 'alice', 'beginning', 'get', 'tired', 'sitting', 'sister', 'bank', 'nothing', 'twice', 'peeped', 'book', 'sister', 'reading', 'pictures', 'conversations', "'and", 'use', 'book', 'thought', 'alice', "'without", 'pictures', 'conversation'], ['considering', 'mind', 'well', 'could', 'hot', 'day', 'made', 'feel', 'sleepy', 'stupid', 'whether', 'pleasure', 'making', 'daisy-chain', 'would', 'worth', 'trouble', 'getting', 'picking', 'daisies', 'suddenly', 'white', 'rabbit', 'pink', 'eyes', 'ran', 'close'], ['nothing', 'remarkable', 'alice', 'think', 'much', 'way', 'hear', 'rabbit', 'say', "'oh", 'dear']]


2. Import word2vec from Gensim and train your word embeddings with default parameters

In [72]:
from gensim.models import word2vec

In [73]:
%%time
model = word2vec.Word2Vec(alice_words_nostop)



Wall time: 575 ms


3. Find the terms most similar to rabbit

In [74]:
model.wv.most_similar("rabbit", topn=5)

[('little', 0.9947453737258911),
 ('alice', 0.9946815967559814),
 ('could', 0.9945167303085327),
 ("'s", 0.9943983554840088),
 ("n't", 0.9943609833717346)]

4. Using a window size of 2, retrain the word vectors.

In [75]:
%%time
model = word2vec.Word2Vec(alice_words_nostop, window=2)



Wall time: 103 ms


5. Find the terms most similar to rabbit

In [76]:
model.wv.most_similar("rabbit", topn=5)

[('alice', 0.9301280975341797),
 ('little', 0.9263476729393005),
 ('could', 0.9247009754180908),
 ("'s", 0.9223048686981201),
 ("n't", 0.9217658638954163)]

6.	Retrain word vectors using skipgram method, window size of 5.

In [77]:
%%time
model = word2vec.Word2Vec(alice_words_nostop, window=5, sg=1)



Wall time: 316 ms


7. Find the terms most similar to rabbit

In [78]:
model.wv.most_similar("rabbit", topn=5)

[('hands', 0.9995781183242798),
 ('gloves', 0.9995264410972595),
 ('mind', 0.9995194673538208),
 ('word', 0.9995113611221313),
 ('find', 0.9994960427284241)]

8.	Find the representation for the phrase “white rabbit” by averaging the vectors for “white” and “rabbit”

In [79]:
v1 = model.wv['white']
v2 = model.wv['rabbit']
res1 = (v1+v2)/2

9.	Find the representation for “mad hatter” by averaging the vectors for “mad” and “hatter”

In [80]:
v1 = model.wv['mad']
v2 = model.wv['hatter']
res2 = (v1+v2)/2

10.	Find the cosine similarity between these two phrases

In [81]:
model.wv.cosine_similarities(res1, [res2])

array([0.999521], dtype=float32)

11.	Load pre-trained GloVe embeddings of size 100D using the formatted keyed vectors

In [58]:
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("glove.6B.100d.w2vformat.txt", binary=False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


12.	Find representations for “white rabbit” and “mad hatter”. 

In [60]:
def phrase_vector(keyed_vectors, words):
    """Return the element-wise mean of the vectors looked up for ``words``.

    ``keyed_vectors`` is indexed with each word (``keyed_vectors[word]``);
    for two words the result equals ``(v1 + v2) / 2``.
    """
    vectors = [keyed_vectors[word] for word in words]
    return sum(vectors) / len(vectors)

# Phrase representations: the average of the constituent word vectors.
res1 = phrase_vector(glove_model, ['white', 'rabbit'])
res2 = phrase_vector(glove_model, ['mad', 'hatter'])

13.	Find the cosine similarity between the two phrases. Has the cosine similarity changed?

In [62]:
glove_model.cosine_similarities(res1, [res2])

array([0.4514557], dtype=float32)

We see that the cosine similarity between the two phrases “mad hatter” and “white rabbit” is far lower with the GloVe model. This is probably because the GloVe model has not seen these terms occurring together as they do in the book.