In [1]:
file = open('hamlet.txt')

text = file.read()

In [2]:
text



In [3]:
# Tokenize data: Handling punctuations and lowercasing the text 
from gensim.utils import simple_preprocess 

# preprocess the file to get a list of tokens 
token_list =[] 

for sentence in text.split('.'):   
    # the simple_preprocess function returns a list of each sentence 
    token_list.append(simple_preprocess(sentence, deacc = True))

print (token_list[:2])

[['the', 'tragedy', 'of', 'hamlet', 'prince', 'of', 'denmark', 'by', 'william', 'shakespeare', 'dramatis', 'personae', 'claudius', 'king', 'of', 'denmark'], ['marcellus', 'officer']]


In [4]:
len(token_list)

3134

In [8]:
# Import gensim corpora
from gensim import corpora 

# storing the extracted tokens into the dictionary 
my_dictionary = corpora.Dictionary(token_list) 

# print the dictionary 
print(my_dictionary)

Dictionary<4593 unique tokens: ['by', 'claudius', 'denmark', 'dramatis', 'hamlet']...>


In [11]:
# save your dictionary to disk 
my_dictionary.save('dictionary.dict')

In [12]:
# load back 
load_dict = corpora.Dictionary.load('dictionary.dict')

print(load_dict)

Dictionary<4593 unique tokens: ['by', 'claudius', 'denmark', 'dramatis', 'hamlet']...>


In [16]:
# Converting to a bag of word corpus 
BoW_corpus =[my_dictionary.doc2bow(sent, allow_update = True) for sent in token_list] 

print(BoW_corpus[:4])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 3), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(13, 1), (14, 1)], [(4, 1), (5, 1), (10, 2), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2)], [(21, 1), (22, 1), (23, 1)]]


In [15]:
my_dictionary[1]

'claudius'

## BoW

In [17]:
# Word weight in Bag of Words corpus 
word_weight =[] 

for doc in BoW_corpus: 
    for id, freq in doc: 
        word_weight.append([my_dictionary[id], freq]) 
        
print(word_weight[:10])

[['by', 1], ['claudius', 1], ['denmark', 2], ['dramatis', 1], ['hamlet', 1], ['king', 1], ['of', 3], ['personae', 1], ['prince', 1], ['shakespeare', 1]]


## TF-IDF

In [18]:
from gensim.models import TfidfModel
import numpy as np 

# create TF-IDF model 
tfIdf = TfidfModel(BoW_corpus, smartirs ='ntc')

# TF-IDF Word Weight 
weight_tfidf =[] 
for doc in tfIdf[BoW_corpus]: 
    for id, tf_idf in doc: 
        weight_tfidf.append([my_dictionary[id], np.around(tf_idf, decimals = 3)]) 
print(weight_tfidf[:10])

[['by', 0.146], ['claudius', 0.31], ['denmark', 0.407], ['dramatis', 0.339], ['hamlet', 0.142], ['king', 0.117], ['of', 0.241], ['personae', 0.339], ['prince', 0.272], ['shakespeare', 0.339]]


In [21]:
for i in tfIdf[BoW_corpus]:
    print(i)

[(0, 0.14611556136953135), (1, 0.3102099580349746), (2, 0.407430679013141), (3, 0.33943575976316737), (4, 0.1420187569286905), (5, 0.11667518452572431), (6, 0.24057671763559949), (7, 0.33943575976316737), (8, 0.27157554963486896), (9, 0.33943575976316737), (10, 0.0650005436550299), (11, 0.29311395997047024), (12, 0.33943575976316737)]
[(13, 0.6249730104443809), (14, 0.78064635797273)]
[(4, 0.23710025958032205), (5, 0.1947891752884552), (10, 0.2170367648156227), (15, 0.11240012424128112), (16, 0.48935364242421997), (17, 0.45339527472717883), (18, 0.4691029399148117), (19, 0.34597213239749713), (20, 0.24761533021023527)]
[(21, 0.8389805462478708), (22, 0.2768771322460359), (23, 0.4684556506830026)]
[(4, 0.42656053895893686), (20, 0.2227389562877964), (24, 0.6946835626204423), (25, 0.5346477455108687)]
[(19, 0.6056604762417843), (20, 0.21673829302469957), (23, 0.5539220105213258), (26, 0.5285549225154625)]
[(27, 0.6702560604184771), (28, 0.7421299168422621)]
[(27, 0.6596050510286842), (29

[(10, 0.10590699073959381), (22, 0.1825157846961741), (75, 0.20063361643820005), (107, 0.40210456025207764), (135, 0.13776696928820412), (143, 0.15528824280602924), (206, 0.2213521167661434), (212, 0.24546378024487062), (225, 0.27901298770871064), (232, 0.3217221691164204), (713, 0.3217221691164204), (917, 0.3625777830269171), (2513, 0.42995949202481315)]
[(5, 1.0)]
[(94, 0.3921229298011144), (143, 0.3065594212741128), (168, 0.34297337075793866), (225, 0.5508083451416486), (1931, 0.5755348495317443)]
[(20, 0.1336312688470546), (22, 0.20185507468331437), (101, 0.21420293781234737), (135, 0.15236469503641376), (283, 0.40099636090786334), (309, 0.2846378501026632), (3305, 0.506324233055169), (3587, 0.6116521052024748)]
[(5, 1.0)]
[(67, 0.5562445602627026), (75, 0.32756448441401953), (179, 0.44003549442963535), (1407, 0.6242293339557156)]
[(1931, 1.0)]
[(22, 0.26618129299549154), (31, 0.46920009194425627), (135, 0.20091955377037063), (141, 0.5576069357608056), (234, 0.21628484984934682), (

## Word2Vec

In [25]:
# import Word2Vec model
from gensim.models import Word2Vec

# Create Word2vec object
model = Word2Vec(sentences=token_list,  # tokenized sentences
                 vector_size=100,  
                 window=5, 
                 min_count=1, 
                 workers=4,  
                 sg=1) # 1 for skip-gram and 0 for CBOW 

#Save model
model.save("word2vec.model")

# Load trained Word2Vec model
model = Word2Vec.load("word2vec.model")

# Generate vector
vector = model.wv['think']  # returns numpy array

print(vector)

[-0.17036754  0.20261322  0.01893564  0.10914765  0.08863426 -0.35416636
  0.2125166   0.5065321  -0.18059383 -0.22485453 -0.06889503 -0.18622866
 -0.06979592  0.0826834   0.21010059 -0.01791033  0.10220035 -0.16928415
 -0.21484405 -0.4564494   0.1611439   0.18910089  0.21993147 -0.22601527
 -0.07328767  0.04445507 -0.2668096  -0.12664106 -0.2964416   0.15099078
  0.19547893  0.0276814   0.05970272 -0.19697517 -0.13034236  0.12704147
 -0.01238197 -0.12515661 -0.10860863 -0.26182273  0.03424751 -0.17636137
 -0.1152153  -0.01289264  0.3180365   0.01280101 -0.0776719  -0.09403727
  0.09255537  0.10201543  0.06624265 -0.13308497 -0.04122142 -0.10627339
 -0.16403113  0.09055936  0.08565716  0.12634307 -0.3113052   0.0458459
  0.1031545   0.11851606  0.13172777  0.08975002 -0.14918964  0.16623536
  0.073085    0.14033175 -0.44008252  0.21118966 -0.17772616  0.16034119
  0.2394136   0.04946292  0.29083428  0.02950815 -0.064146    0.02663359
 -0.20979424 -0.03922927 -0.19185573 -0.0293501  -0.

In [26]:
# Finding most similar words
model.wv.most_similar('present')

[('second', 0.9975017309188843),
 ('given', 0.9974398016929626),
 ('exit', 0.997320294380188),
 ('twere', 0.9972872138023376),
 ('bear', 0.9972538948059082),
 ('soft', 0.9972319006919861),
 ('power', 0.9972299933433533),
 ('faith', 0.9971872568130493),
 ('joy', 0.9971283078193665),
 ('without', 0.9971087574958801)]

## Doc2Vec

In [27]:
documents=text.split(".")

documents[:5]

['THE TRAGEDY OF HAMLET, PRINCE OF DENMARK\n\n\nby William Shakespeare\n\n\n\nDramatis Personae\n\n  Claudius, King of Denmark',
 '\n  Marcellus, Officer',
 '\n  Hamlet, son to the former, and nephew to the present king',
 '\n  Polonius, Lord Chamberlain',
 '\n  Horatio, friend to Hamlet']

In [28]:
from collections import namedtuple

# Transform data (you can add more data preprocessing steps) 
docs = []
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')

for i, text in enumerate(documents):
    words = text.lower().split() # tokenization
    tags = [i]
    docs.append(analyzedDocument(words, tags))

print(docs[:2])

[AnalyzedDocument(words=['the', 'tragedy', 'of', 'hamlet,', 'prince', 'of', 'denmark', 'by', 'william', 'shakespeare', 'dramatis', 'personae', 'claudius,', 'king', 'of', 'denmark'], tags=[0]), AnalyzedDocument(words=['marcellus,', 'officer'], tags=[1])]


In [29]:
from gensim.models import doc2vec

model = doc2vec.Doc2Vec(docs,
                        vector_size=100,
                        window=5, 
                        min_count=1, 
                        workers=4,
                        dm=0) # 0 for PV-DBOW and 1 for PV-DM 

vector=model.infer_vector(['the', 'tragedy', 'of', 'hamlet,', 'prince', 'of', 'denmark', 'by', 'william', 
                    'shakespeare', 'dramatis', 'personae', 'claudius,', 'king', 'of', 'denmark'])

print(vector)

[ 4.03867103e-03 -3.12294289e-02  1.26583591e-01  2.15409249e-02
 -4.79127318e-02 -2.49561202e-02  3.17672752e-02 -9.53452894e-04
 -8.78845006e-02 -1.14928320e-01 -1.52083769e-01  1.51180550e-01
  8.62696767e-02  5.91080114e-02  8.35783686e-03  5.39016984e-02
  2.22874492e-01  1.25273326e-02  7.22566769e-02  3.04128486e-03
 -7.70450979e-02  4.79807705e-02  4.27284278e-02  2.84597147e-02
  9.58364755e-02  3.63597572e-02 -2.03884847e-05  2.61863749e-02
 -2.30043288e-02 -4.96206209e-02 -4.98691536e-02 -4.34736982e-02
 -5.03856316e-03 -6.89185336e-02 -6.50856793e-02 -5.13014905e-02
 -2.59253420e-02  5.44160865e-02 -1.13228038e-02 -6.00370392e-03
 -2.31487975e-02  5.53954728e-02  9.68850497e-03  9.42682847e-02
  1.00261196e-02 -4.37629148e-02  1.76524356e-01 -9.55196545e-02
 -1.80669613e-02  1.96759418e-01  4.32726517e-02 -2.07235795e-02
  3.08653042e-02 -3.78412455e-02 -7.24920779e-02 -1.11723945e-01
 -1.01734810e-01  4.17804010e-02  4.75205816e-02 -8.58025327e-02
  3.22401449e-02 -7.53556

## Pre-trained Models

### Google's Word2Vec

In [None]:
# from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

print(model.wv['reforms'].shape)

## Stanford's Glove

In [8]:
import gensim.downloader as api

# Download the model and return as object ready for use
model_glove_twitter = api.load("glove-twitter-25")

# Print shape of the vector
print(model_glove_twitter['reforms'].shape)

(25,)


In [9]:
# Print vector for word 'reform' 
print(model_glove_twitter['reforms'])

[ 0.37207    0.91542   -1.6257    -0.15803    0.38455   -1.3252
 -0.74057   -2.095      1.0401    -0.0027519  0.33633   -0.085222
 -2.1703     0.91529    0.77599   -0.87018   -0.97346    0.68114
  0.71777   -0.99392    0.028837   0.24823   -0.50573   -0.44954
 -0.52987  ]


In [13]:
# Finding most similar words
model_glove_twitter.most_similar('policy')

[('policies', 0.9484813213348389),
 ('laws', 0.9322695732116699),
 ('government', 0.9285519123077393),
 ('immigration', 0.9231586456298828),
 ('funding', 0.9161454439163208),
 ('administration', 0.9158841967582703),
 ('commission', 0.9154731631278992),
 ('education', 0.9152790904045105),
 ('economy', 0.9147925972938538),
 ('financial', 0.9144569635391235)]

## FastText

In [6]:
# Import FastText 
from gensim.models import FastText

# Create FastText Model object
model = FastText(vector_size=50, window=3, min_count=1)  # instantiate

# Build Vocab
model.build_vocab(token_list)

# Train FastText model
model.train(token_list, total_examples=len(token_list), epochs=10)  # train

model.wv['policy']

array([-0.22674942, -0.09294754,  0.14550294, -0.06987163, -0.18477647,
        0.130833  , -0.15174048,  0.07807715,  0.02299331,  0.43650657,
        0.22083089,  0.00236979,  0.4044554 ,  0.06726567, -0.22947505,
        0.21555075,  0.03571462, -0.04566545, -0.01676383, -0.22957449,
       -0.263474  , -0.1483099 ,  0.22570193,  0.04479947,  0.04447418,
       -0.02249806, -0.33629802,  0.05331704, -0.28085294,  0.05511108,
        0.14065175,  0.15651222, -0.13810122,  0.47811136,  0.07956782,
        0.09449955, -0.22937675,  0.05994929, -0.06151745, -0.01543031,
        0.01794525,  0.25961375, -0.00359993,  0.15751313,  0.13239035,
       -0.29706815,  0.2514477 ,  0.01208868,  0.0757787 ,  0.23368363],
      dtype=float32)

In [14]:
# Finding most similar words
model.wv.most_similar('policy')

[('pious', 0.9999200105667114),
 ('heroes', 0.9999193549156189),
 ('found', 0.9999184012413025),
 ('rouse', 0.9999173879623413),
 ('clothes', 0.9999135732650757),
 ('surrender', 0.9999127984046936),
 ('judgment', 0.9999127388000488),
 ('her', 0.9999126195907593),
 ('clothe', 0.9999125599861145),
 ('judgments', 0.9999123811721802)]