In [49]:
# `display` and `HTML` are part of the public IPython.display API; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Widen the notebook's main container to 80% of the browser window.
display(HTML("<style>.container {width:80% !important;}</style>"))

In [50]:
raw_txt = """Welcome to the world of Deep Learning for NLP! We're in this together, and we'll learn together. 
NLP is amazing, and Deep Learning makes it even more fun. Let's learn!"""

### Tokenization

In [51]:
import nltk
nltk.download('punkt')
from nltk import tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahim.baig\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [52]:
tokenize.sent_tokenize(raw_txt)

['Welcome to the world of Deep Learning for NLP!',
 "We're in this together, and we'll learn together.",
 'NLP is amazing, and Deep Learning makes it even more fun.',
 "Let's learn!"]

In [53]:
txt_sents = tokenize.sent_tokenize(raw_txt)

In [55]:
type(txt_sents), len(txt_sents)

(list, 4)

In [56]:
txt_words = [tokenize.word_tokenize(sent) for sent in txt_sents]
type(txt_words), type(txt_words[0])

(list, list)

In [57]:
print(txt_words[:2])

[['Welcome', 'to', 'the', 'world', 'of', 'Deep', 'Learning', 'for', 'NLP', '!'], ['We', "'re", 'in', 'this', 'together', ',', 'and', 'we', "'ll", 'learn', 'together', '.']]


### Normalizing case

In [58]:
# Optional cell - you needn't run this. It lower-cases the entire raw text in place;
# the next cell lower-cases the already-tokenized sentences instead.
raw_txt = raw_txt.lower()

In [59]:
txt_sents = [sent.lower() for sent in txt_sents]
txt_sents

['welcome to the world of deep learning for nlp!',
 "we're in this together, and we'll learn together.",
 'nlp is amazing, and deep learning makes it even more fun.',
 "let's learn!"]

In [60]:
txt_words = [tokenize.word_tokenize(sent) for sent in txt_sents]

In [61]:
print(txt_words[:2])

[['welcome', 'to', 'the', 'world', 'of', 'deep', 'learning', 'for', 'nlp', '!'], ['we', "'re", 'in', 'this', 'together', ',', 'and', 'we', "'ll", 'learn', 'together', '.']]


### Removing punctuation

In [62]:
from string import punctuation

In [63]:
list_punct = list(punctuation)
print(list_punct)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [64]:
def drop_punct(input_tokens, punct=None):
    """Return the tokens of ``input_tokens`` that are not punctuation marks.

    Parameters
    ----------
    input_tokens : iterable of str
        Tokens of a single sentence.
    punct : iterable of str, optional
        Tokens to treat as punctuation. When omitted, the notebook-level
        ``list_punct`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    if punct is None:
        punct = list_punct
    # A set gives constant-time membership tests instead of scanning the list per token.
    punct_set = frozenset(punct)
    return [token for token in input_tokens if token not in punct_set]

In [65]:
drop_punct(["let",".","us",".","go","!"])

['let', 'us', 'go']

In [66]:
txt_words_nopunct = [drop_punct(sent) for sent in txt_words]
print(txt_words_nopunct)

[['welcome', 'to', 'the', 'world', 'of', 'deep', 'learning', 'for', 'nlp'], ['we', "'re", 'in', 'this', 'together', 'and', 'we', "'ll", 'learn', 'together'], ['nlp', 'is', 'amazing', 'and', 'deep', 'learning', 'makes', 'it', 'even', 'more', 'fun'], ['let', "'s", 'learn']]


### Removing stop words

In [67]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahim.baig\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [68]:
from nltk.corpus import stopwords
list_stop = stopwords.words("english")
len(list_stop)

179

In [69]:
print(list_stop[:50])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be']


### Exercise 4.01: Tokenizing, Case Normalization, Punctuation and Stop Word Removal

In [70]:
import nltk
from nltk import tokenize

In [71]:
raw_txt = """Welcome to the world of Deep Learning for NLP! We're in this together, and we'll learn together. NLP is amazing, and Deep Learning makes it even more fun. Let's learn!"""

In [72]:
# Lower-case the raw text, split it into sentences, then split each sentence into word tokens.
txt_sents = tokenize.sent_tokenize(raw_txt.lower())
txt_words = [tokenize.word_tokenize(sent) for sent in txt_sents]

# Punctuation characters from the standard library, as a list of single-character strings.
from string import punctuation
stop_punct = list(punctuation)

# NLTK's English stop word list.
from nltk.corpus import stopwords
stop_nltk = stopwords.words("english")

# Combined list of tokens to drop: punctuation marks followed by stop words.
stop_final = stop_punct + stop_nltk

In [73]:
def drop_stop(input_tokens, stop_list=None):
    """Return the tokens of ``input_tokens`` that are not in the stop list.

    Parameters
    ----------
    input_tokens : iterable of str
        Tokens of a single sentence.
    stop_list : iterable of str, optional
        Tokens to remove. When omitted, the notebook-level ``stop_final`` is
        used, which keeps existing one-argument calls working. Passing a list
        explicitly lets the same function serve other corpora without being
        re-defined.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    if stop_list is None:
        stop_list = stop_final
    # A set gives constant-time membership tests instead of scanning the list per token.
    stop_set = frozenset(stop_list)
    return [token for token in input_tokens if token not in stop_set]

In [74]:
txt_words_nostop = [drop_stop(sent) for sent in txt_words]

In [75]:
print(txt_words_nostop[0])

['welcome', 'world', 'deep', 'learning', 'nlp']


### Stemming

In [76]:
from nltk.stem import PorterStemmer

In [77]:
stemmer_p = PorterStemmer()

In [78]:
print(stemmer_p.stem("driving"))

drive


In [79]:
txt = "I mustered all my drive, drove to the driving school!"

In [80]:
tokens = tokenize.word_tokenize(txt)
print([stemmer_p.stem(word) for word in tokens])

['I', 'muster', 'all', 'my', 'drive', ',', 'drove', 'to', 'the', 'drive', 'school', '!']


### Lemmatization

In [81]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rahim.baig\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [82]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

In [83]:
lemmatizer.lemmatize("ponies")

'pony'

### Exercise 4.02: Stemming Our Data

In [84]:
from nltk.stem import PorterStemmer

In [85]:
stemmer_p = PorterStemmer()

In [86]:
print([stemmer_p.stem(token) for token in txt_words_nostop[0]])

['welcom', 'world', 'deep', 'learn', 'nlp']


Applying the stemmer to all the sentences

In [87]:
txt_words_stem = [[stemmer_p.stem(token) for token in sent] for sent in txt_words_nostop]

In [88]:
txt_words_stem

[['welcom', 'world', 'deep', 'learn', 'nlp'],
 ["'re", 'togeth', "'ll", 'learn', 'togeth'],
 ['nlp', 'amaz', 'deep', 'learn', 'make', 'even', 'fun'],
 ['let', "'s", 'learn']]

### Downloading Text Corpora using NLTK

In [89]:
import nltk

In [44]:
# Fetch only the corpus used below. A bare nltk.download() opens the interactive
# downloader, which blocks an unattended "Restart & Run All".
nltk.download('gutenberg')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [90]:
alice_raw = nltk.corpus.gutenberg.raw('carroll-alice.txt')

In [91]:
alice_raw[:800]

"[Alice's Adventures in Wonderland by Lewis Carroll 1865]\n\nCHAPTER I. Down the Rabbit-Hole\n\nAlice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into the\nbook her sister was reading, but it had no pictures or conversations in\nit, 'and what is the use of a book,' thought Alice 'without pictures or\nconversation?'\n\nSo she was considering in her own mind (as well as she could, for the\nhot day made her feel very sleepy and stupid), whether the pleasure\nof making a daisy-chain would be worth the trouble of getting up and\npicking the daisies, when suddenly a White Rabbit with pink eyes ran\nclose by her.\n\nThere was nothing so VERY remarkable in that; nor did Alice think it so\nVERY much out of the way to hear the Rabbit"

## Representation
### 1. One hot encoding

In [92]:
txt_words_nostop

[['welcome', 'world', 'deep', 'learning', 'nlp'],
 ["'re", 'together', "'ll", 'learn', 'together'],
 ['nlp', 'amazing', 'deep', 'learning', 'makes', 'even', 'fun'],
 ['let', "'s", 'learn']]

### Exercise 4.03: Creating One-Hot Encoding for Our Data

In [93]:
print(txt_words_nostop)

[['welcome', 'world', 'deep', 'learning', 'nlp'], ["'re", 'together', "'ll", 'learn', 'together'], ['nlp', 'amazing', 'deep', 'learning', 'makes', 'even', 'fun'], ['let', "'s", 'learn']]


In [94]:
target_terms = ["nlp","deep","learn"]

In [95]:
def get_onehot(sent, terms=None):
    """Build a presence/absence vector for a sentence over a list of terms.

    Parameters
    ----------
    sent : list of str
        Tokens of a single sentence.
    terms : sequence of str, optional
        Terms to look for; one output position per term, in this order. When
        omitted, the notebook-level ``target_terms`` is used, which keeps
        existing one-argument calls working.

    Returns
    -------
    list of int
        ``1`` where the term occurs in ``sent`` as an exact token, ``0``
        otherwise. Matching is exact, so e.g. ``'learn'`` does not match the
        token ``'learning'``.
    """
    if terms is None:
        terms = target_terms
    return [1 if term in sent else 0 for term in terms]

In [96]:
one_hot_mat = [get_onehot(sent) for sent in txt_words_nostop]

In [97]:
import numpy as np

In [98]:
np.array(one_hot_mat)

array([[1, 1, 0],
       [0, 0, 1],
       [1, 1, 0],
       [0, 0, 1]])

### Term Frequencies

In [99]:
from sklearn.feature_extraction.text import CountVectorizer

In [100]:
vectorizer = CountVectorizer(max_features = 5)

In [101]:
vectorizer.fit(txt_sents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=5, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [102]:
vectorizer.vocabulary_

{'deep': 1, 'we': 4, 'together': 3, 'and': 0, 'learn': 2}

In [103]:
txt_dtm = vectorizer.fit_transform(txt_sents)

In [104]:
txt_dtm.toarray()

array([[0, 1, 0, 0, 0],
       [1, 0, 1, 2, 2],
       [1, 1, 0, 0, 0],
       [0, 0, 1, 0, 0]], dtype=int64)

In [105]:
txt_sents

['welcome to the world of deep learning for nlp!',
 "we're in this together, and we'll learn together.",
 'nlp is amazing, and deep learning makes it even more fun.',
 "let's learn!"]

In [106]:
def do_nothing(doc):
    """Identity function: hand back ``doc`` exactly as it was received.

    Serves as a no-op stand-in wherever a callable is required but the input
    should pass through untouched.
    """
    return doc

In [107]:
# Both the preprocessing and the tokenizing step are replaced by the identity
# function, so documents given to this vectorizer are used as-is (i.e. they are
# expected to be lists of tokens already).
# token_pattern=None states explicitly that the default regex is unused when a
# custom tokenizer is supplied; newer scikit-learn versions warn otherwise.
vectorizer = CountVectorizer(max_features=5,
                             preprocessor=do_nothing,
                             tokenizer=do_nothing,
                             token_pattern=None)

In [108]:
txt_dtm = vectorizer.fit_transform(txt_words_stem)

In [109]:
txt_dtm.toarray()

array([[0, 1, 1, 1, 0],
       [1, 0, 1, 0, 2],
       [0, 1, 1, 1, 0],
       [0, 0, 1, 0, 0]], dtype=int64)

In [110]:
vectorizer.vocabulary_

{'deep': 1, 'learn': 2, 'nlp': 3, 'togeth': 4, "'ll": 0}

In [111]:
txt_words_stem

[['welcom', 'world', 'deep', 'learn', 'nlp'],
 ["'re", 'togeth', "'ll", 'learn', 'togeth'],
 ['nlp', 'amaz', 'deep', 'learn', 'make', 'even', 'fun'],
 ['let', "'s", 'learn']]

### Exercise 4.04: Document Term Matrix with TF-IDF

In [112]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [113]:
vectorizer_tfidf = TfidfVectorizer(max_features=5)

In [114]:
vectorizer_tfidf.fit(txt_sents)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=5,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [115]:
vectorizer_tfidf.vocabulary_

{'deep': 1, 'we': 4, 'together': 3, 'and': 0, 'learn': 2}

In [116]:
txt_tfidf = vectorizer_tfidf.transform(txt_sents)

In [117]:
txt_tfidf.toarray()

array([[0.        , 1.        , 0.        , 0.        , 0.        ],
       [0.25932364, 0.        , 0.25932364, 0.65783832, 0.65783832],
       [0.70710678, 0.70710678, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        , 0.        , 0.        ]])

In [118]:
vectorizer_tfidf.idf_

array([1.51082562, 1.51082562, 1.51082562, 1.91629073, 1.91629073])

## Training Our Own Embeddings

In [119]:
import gensim.downloader as api
from gensim.models import word2vec



In [77]:
# Alternative way of loading the data, through the gensim downloader API.
# If this doesn't work, use a local copy of the text8 corpus file instead
# (a later cell reads it with word2vec.Text8Corpus).
dataset = api.load("text8")



To ensure reproducible results, set the random seed to 1.

In [120]:
# Seed NumPy's global random number generator.
# NOTE(review): gensim's Word2Vec accepts its own `seed` argument and its docs tie
# deterministic runs to workers=1 - confirm whether this call alone is enough to
# make the embeddings trained below reproducible.
np.random.seed(1)

In [121]:
dataset = word2vec.Text8Corpus("text8")

In [122]:
model = word2vec.Word2Vec(dataset)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [123]:
print(model.wv["animal"])

[-1.354541    0.21255197 -1.2803084   1.5077848   0.778622   -0.3017604
 -0.76313347  1.2620598   0.33203518 -0.12264135 -0.8749983  -0.49952352
 -3.4714737   0.23336382 -1.5944424  -0.77313447  1.6004859   0.39750227
 -0.21382928  0.79679984  2.230377   -1.2451155  -0.23759073 -0.6826417
  0.06757552  1.1354308  -0.9048583  -1.0423294  -0.6095223  -0.8687781
 -1.5961674   0.94031745  0.82677937 -1.435147   -0.967001   -0.9331538
  1.2132032  -1.4953172  -1.0877444   1.2012008   0.16687249 -1.3708639
 -0.3737326  -2.015771    2.219466    0.43797886 -1.4105217   0.13743232
  0.4868426  -0.28345928 -0.560394    0.2742348   0.7965476  -0.35514706
 -0.51794916 -0.04570778  0.9425421  -2.2243745  -1.145821    0.05696366
 -0.3091545   1.5641677   1.1459892   0.5507111  -0.40103084 -1.0796953
 -1.1559533   0.62421596  2.1508386  -0.44318482  0.74150276 -3.4402597
 -0.7350121  -0.56858236  0.25781694  1.237234   -1.4612886   0.37760228
 -0.1176926  -0.5814898  -0.8529314  -1.0561575   0.55432 

In [124]:
len(model.wv["animal"])

100

In [125]:
model.wv.most_similar("animal")

[('insect', 0.7510448098182678),
 ('animals', 0.7423559427261353),
 ('aquatic', 0.6713050007820129),
 ('ants', 0.665005087852478),
 ('feces', 0.6597224473953247),
 ('humans', 0.6595240235328674),
 ('insects', 0.6531916856765747),
 ('domesticated', 0.639849066734314),
 ('mammal', 0.638308048248291),
 ('herd', 0.6336805820465088)]

In [126]:
model.wv.most_similar("happiness")

[('humanity', 0.7809507846832275),
 ('goodness', 0.7767032384872437),
 ('pleasure', 0.7601181268692017),
 ('mankind', 0.733366072177887),
 ('satisfaction', 0.7316511869430542),
 ('compassion', 0.7286831736564636),
 ('desires', 0.7183192372322083),
 ('feelings', 0.7128568887710571),
 ('perfection', 0.7075753808021545),
 ('salvation', 0.7067484855651855)]

### Semantic Regularities in Word Embeddings

In [127]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('queen', 0.6851475238800049),
 ('prince', 0.6383140087127686),
 ('empress', 0.6180450916290283),
 ('princess', 0.6116641759872437),
 ('son', 0.6045335531234741)]

In [128]:
model.wv.most_similar(positive=['uncle', 'woman'], negative=['man'], topn=5)

[('aunt', 0.8399454951286316),
 ('grandmother', 0.8323032259941101),
 ('wife', 0.8157851099967957),
 ('niece', 0.8152226805686951),
 ('widow', 0.7859092950820923)]

### Exercise 4.05: Vectors for Phrases

In [129]:
v1 = model.wv['get']
v2 = model.wv['happy']
res1 = (v1+v2)/2

In [130]:
v1 = model.wv['make']
v2 = model.wv['merry']
res2 = (v1+v2)/2

In [131]:
model.wv.cosine_similarities(res1, [res2])

array([0.57630885], dtype=float32)

### Effect of Parameters - 'size' of the Vector

In [135]:
model = word2vec.Word2Vec(dataset, size=30)

In [136]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('son', 0.8133434653282166),
 ('empress', 0.8022229671478271),
 ('emperor', 0.7999867796897888),
 ('archbishop', 0.7950774431228638),
 ('constantine', 0.7858606576919556)]

### Effect of parameters - skipgram vs. CBOW

#### Rare terms - oeuvre

In [137]:
model = word2vec.Word2Vec(dataset)

In [138]:
model.wv.most_similar("oeuvre", topn=5)

[('seminal', 0.7173739671707153),
 ('baglione', 0.6992780566215515),
 ('wace', 0.6952950954437256),
 ('mockery', 0.6938953399658203),
 ('foxe', 0.687375545501709)]

In [139]:
model_sg = word2vec.Word2Vec(dataset, sg=1)

In [140]:
model_sg.wv.most_similar("oeuvre", topn=5)

[('masterful', 0.8323545455932617),
 ('satiric', 0.8200669288635254),
 ('masterwork', 0.815832257270813),
 ('mussorgsky', 0.815514862537384),
 ('librettos', 0.8108195662498474)]

### Exercise 4.06: Training Word Vectors on Different Datasets

In [144]:
nltk.download('brown')
nltk.download('movie_reviews')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\rahim.baig\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\rahim.baig\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [145]:
from nltk.corpus import brown, movie_reviews

In [146]:
model_brown = word2vec.Word2Vec(brown.sents(), sg=1)
model_movie = word2vec.Word2Vec(movie_reviews.sents(), sg=1)

In [147]:
model_brown.wv.most_similar('money', topn=5)

[('job', 0.8501682281494141),
 ('care', 0.8444021344184875),
 ('friendship', 0.8370720744132996),
 ('permission', 0.8268598318099976),
 ('risk', 0.8242795467376709)]

In [148]:
model_movie.wv.most_similar('money', topn=5)

[('cash', 0.7291117310523987),
 ('ransom', 0.7190667986869812),
 ('paid', 0.6980520486831665),
 ('risk', 0.6944338083267212),
 ('record', 0.6919084787368774)]

### Using Pre-Trained Word Vectors

In [103]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.w2vformat.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 100)

In [104]:
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("glove.6B.100d.w2vformat.txt", binary=False)

In [105]:
glove_model.most_similar("money", topn=5)

[('funds', 0.8508071899414062),
 ('cash', 0.848483681678772),
 ('fund', 0.7594833374023438),
 ('paying', 0.7415367364883423),
 ('pay', 0.740767240524292)]

In [106]:
glove_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('queen', 0.7698541283607483),
 ('monarch', 0.6843380928039551),
 ('throne', 0.6755737066268921),
 ('daughter', 0.6594556570053101),
 ('princess', 0.6520533561706543)]

### Bias in Embeddings – A Word of Caution

In [149]:
model.wv.most_similar(positive=['woman', 'doctor'], negative=['man'], topn=5)

[('child', 0.6149958372116089),
 ('nurse', 0.6090491414070129),
 ('teacher', 0.5878923535346985),
 ('dominatrix', 0.5384681224822998),
 ('detective', 0.5246642231941223)]

In [150]:
model.wv.most_similar(positive=['woman', 'smart'], negative=['man'], topn=5)

[('pet', 0.6097452640533447),
 ('odie', 0.567996621131897),
 ('lingerie', 0.5643869042396545),
 ('scam', 0.5464061498641968),
 ('thug', 0.5415985584259033)]

### Activity 4.01: Text Preprocessing of the 'Alice in Wonderland' Text

In [112]:
alice_raw[:800]

"[Alice's Adventures in Wonderland by Lewis Carroll 1865]\n\nCHAPTER I. Down the Rabbit-Hole\n\nAlice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into the\nbook her sister was reading, but it had no pictures or conversations in\nit, 'and what is the use of a book,' thought Alice 'without pictures or\nconversation?'\n\nSo she was considering in her own mind (as well as she could, for the\nhot day made her feel very sleepy and stupid), whether the pleasure\nof making a daisy-chain would be worth the trouble of getting up and\npicking the daisies, when suddenly a White Rabbit with pink eyes ran\nclose by her.\n\nThere was nothing so VERY remarkable in that; nor did Alice think it so\nVERY much out of the way to hear the Rabbit"

#### Solution

In [152]:
txt_sents = tokenize.sent_tokenize(alice_raw.lower())

In [153]:
txt_words = [tokenize.word_tokenize(sent) for sent in txt_sents]

In [154]:
from string import punctuation
stop_punct = list(punctuation)

from nltk.corpus import stopwords
stop_nltk = stopwords.words("english")

In [155]:
stop_context = ["--", "said"]

In [156]:
stop_final = stop_punct + stop_nltk + stop_context

In [157]:
def drop_stop(input_tokens, stop_list=None):
    """Return the tokens of ``input_tokens`` that are not in the stop list.

    Parameters
    ----------
    input_tokens : iterable of str
        Tokens of a single sentence.
    stop_list : iterable of str, optional
        Tokens to remove. When omitted, the notebook-level ``stop_final`` is
        used, which keeps existing one-argument calls working.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    if stop_list is None:
        stop_list = stop_final
    # A set gives constant-time membership tests instead of scanning the list per token.
    stop_set = frozenset(stop_list)
    return [token for token in input_tokens if token not in stop_set]

In [158]:
alice_words_nostop = [drop_stop(sent) for sent in txt_words]
print(alice_words_nostop[:2])

[['alice', "'s", 'adventures', 'wonderland', 'lewis', 'carroll', '1865', 'chapter', 'i.', 'rabbit-hole', 'alice', 'beginning', 'get', 'tired', 'sitting', 'sister', 'bank', 'nothing', 'twice', 'peeped', 'book', 'sister', 'reading', 'pictures', 'conversations', "'and", 'use', 'book', 'thought', 'alice', "'without", 'pictures', 'conversation'], ['considering', 'mind', 'well', 'could', 'hot', 'day', 'made', 'feel', 'sleepy', 'stupid', 'whether', 'pleasure', 'making', 'daisy-chain', 'would', 'worth', 'trouble', 'getting', 'picking', 'daisies', 'suddenly', 'white', 'rabbit', 'pink', 'eyes', 'ran', 'close']]


In [159]:
from nltk.stem import PorterStemmer
stemmer_p = PorterStemmer()

In [160]:
alice_words_stem = [[stemmer_p.stem(token) for token in sent] for sent in alice_words_nostop]

In [161]:
print(alice_words_stem[:5])

[['alic', "'s", 'adventur', 'wonderland', 'lewi', 'carrol', '1865', 'chapter', 'i.', 'rabbit-hol', 'alic', 'begin', 'get', 'tire', 'sit', 'sister', 'bank', 'noth', 'twice', 'peep', 'book', 'sister', 'read', 'pictur', 'convers', "'and", 'use', 'book', 'thought', 'alic', "'without", 'pictur', 'convers'], ['consid', 'mind', 'well', 'could', 'hot', 'day', 'made', 'feel', 'sleepi', 'stupid', 'whether', 'pleasur', 'make', 'daisy-chain', 'would', 'worth', 'troubl', 'get', 'pick', 'daisi', 'suddenli', 'white', 'rabbit', 'pink', 'eye', 'ran', 'close'], ['noth', 'remark', 'alic', 'think', 'much', 'way', 'hear', 'rabbit', 'say', "'oh", 'dear'], ['oh', 'dear'], ['shall', 'late']]


### Activity 4.02: Text Representation for Alice in Wonderland
#### Solution

In [162]:
#From activity 4.01, print the first 3 sentences from the result after stop word removal. This is the data you will work with.
print(alice_words_nostop[:3])

[['alice', "'s", 'adventures', 'wonderland', 'lewis', 'carroll', '1865', 'chapter', 'i.', 'rabbit-hole', 'alice', 'beginning', 'get', 'tired', 'sitting', 'sister', 'bank', 'nothing', 'twice', 'peeped', 'book', 'sister', 'reading', 'pictures', 'conversations', "'and", 'use', 'book', 'thought', 'alice', "'without", 'pictures', 'conversation'], ['considering', 'mind', 'well', 'could', 'hot', 'day', 'made', 'feel', 'sleepy', 'stupid', 'whether', 'pleasure', 'making', 'daisy-chain', 'would', 'worth', 'trouble', 'getting', 'picking', 'daisies', 'suddenly', 'white', 'rabbit', 'pink', 'eyes', 'ran', 'close'], ['nothing', 'remarkable', 'alice', 'think', 'much', 'way', 'hear', 'rabbit', 'say', "'oh", 'dear']]


In [163]:
from gensim.models import word2vec

In [164]:
model = word2vec.Word2Vec(alice_words_nostop)

In [165]:
model.wv.most_similar("rabbit", topn=5)

[('alice', 0.9963310360908508),
 ('little', 0.9956872463226318),
 ('went', 0.9955698251724243),
 ("'s", 0.9955658912658691),
 ('would', 0.9954401254653931)]

In [166]:
model = word2vec.Word2Vec(alice_words_nostop, window=2)

In [167]:
model.wv.most_similar("rabbit", topn=5)

[('alice', 0.9491485357284546),
 ("'s", 0.9364748001098633),
 ('little', 0.9345826506614685),
 ('large', 0.9341927170753479),
 ('duchess', 0.9341296553611755)]

In [168]:
model = word2vec.Word2Vec(alice_words_nostop, window=5, sg=1)

In [169]:
model.wv.most_similar("rabbit", topn=5)

[('gardeners', 0.9995723366737366),
 ('end', 0.9995588064193726),
 ('came', 0.9995309114456177),
 ('sort', 0.9995298385620117),
 ('upon', 0.9995272159576416)]

In [170]:
v1 = model.wv['white']
v2 = model.wv['rabbit']
res1 = (v1+v2)/2

In [171]:
v1 = model.wv['mad']
v2 = model.wv['hatter']
res2 = (v1+v2)/2

In [172]:
model.wv.cosine_similarities(res1, [res2])

array([0.999662], dtype=float32)

In [134]:
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("glove.6B.100d.w2vformat.txt", binary=False)

In [135]:
# Phrase vector for "white rabbit": element-wise mean of the two GloVe word vectors.
v1 = glove_model['white']
v2 = glove_model['rabbit']
res1 = (v1+v2)/2

# Phrase vector for "mad hatter", built the same way (v1 and v2 are reused as temporaries).
v1 = glove_model['mad']
v2 = glove_model['hatter']
res2 = (v1+v2)/2

In [136]:
glove_model.cosine_similarities(res1, [res2])

array([0.4514557], dtype=float32)

We see that the cosine similarity between the two phrases “mad hatter” and “white rabbit” is far lower with the GloVe model. This is probably because the GloVe model hasn’t seen these terms together the way they appear in the book.