In [6]:
import nltk

from gensim.models import Word2Vec
from nltk.corpus import stopwords

import re

In [7]:
paragraph = """The Natural Language Toolkit, or more commonly NLTK, is a suite of libraries and programs for symbolic and 
statistical natural language processing (NLP) for English written in the Python programming language. It supports 
classification, tokenization, stemming, tagging, parsing, and semantic reasoning functionalities.[4] It was developed 
by Steven Bird and Edward Loper in the Department of Computer and Information Science at the University of Pennsylvania.
[5] NLTK includes graphical demonstrations and sample data. It is accompanied by a book that explains the underlying 
concepts behind the language processing tasks supported by the toolkit,[6] plus a cookbook.[7]

NLTK is intended to support research and teaching in NLP or closely related areas, including empirical linguistics, 
cognitive science, artificial intelligence, information retrieval, and machine learning.[8] NLTK has been used 
successfully as a teaching tool, as an individual study tool, and as a platform for prototyping and building research 
systems. There are 32 universities in the US and 25 countries using NLTK in their courses.
"""

In [8]:
# Preprocessing the data
text = re.sub(r'\[[0-9]*\]',' ',paragraph)
text = re.sub(r'\s+',' ',text)
text = text.lower()
text = re.sub(r'\d',' ',text)
text = re.sub(r'\s+',' ',text)

In [9]:
text

'the natural language toolkit, or more commonly nltk, is a suite of libraries and programs for symbolic and statistical natural language processing (nlp) for english written in the python programming language. it supports classification, tokenization, stemming, tagging, parsing, and semantic reasoning functionalities. it was developed by steven bird and edward loper in the department of computer and information science at the university of pennsylvania. nltk includes graphical demonstrations and sample data. it is accompanied by a book that explains the underlying concepts behind the language processing tasks supported by the toolkit, plus a cookbook. nltk is intended to support research and teaching in nlp or closely related areas, including empirical linguistics, cognitive science, artificial intelligence, information retrieval, and machine learning. nltk has been used successfully as a teaching tool, as an individual study tool, and as a platform for prototyping and building researc

In [10]:
# Preparing the dataset
sentences = nltk.sent_tokenize(text)

In [11]:
sentences

['the natural language toolkit, or more commonly nltk, is a suite of libraries and programs for symbolic and statistical natural language processing (nlp) for english written in the python programming language.',
 'it supports classification, tokenization, stemming, tagging, parsing, and semantic reasoning functionalities.',
 'it was developed by steven bird and edward loper in the department of computer and information science at the university of pennsylvania.',
 'nltk includes graphical demonstrations and sample data.',
 'it is accompanied by a book that explains the underlying concepts behind the language processing tasks supported by the toolkit, plus a cookbook.',
 'nltk is intended to support research and teaching in nlp or closely related areas, including empirical linguistics, cognitive science, artificial intelligence, information retrieval, and machine learning.',
 'nltk has been used successfully as a teaching tool, as an individual study tool, and as a platform for prototy

In [12]:
sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

In [13]:
sentences

[['the',
  'natural',
  'language',
  'toolkit',
  ',',
  'or',
  'more',
  'commonly',
  'nltk',
  ',',
  'is',
  'a',
  'suite',
  'of',
  'libraries',
  'and',
  'programs',
  'for',
  'symbolic',
  'and',
  'statistical',
  'natural',
  'language',
  'processing',
  '(',
  'nlp',
  ')',
  'for',
  'english',
  'written',
  'in',
  'the',
  'python',
  'programming',
  'language',
  '.'],
 ['it',
  'supports',
  'classification',
  ',',
  'tokenization',
  ',',
  'stemming',
  ',',
  'tagging',
  ',',
  'parsing',
  ',',
  'and',
  'semantic',
  'reasoning',
  'functionalities',
  '.'],
 ['it',
  'was',
  'developed',
  'by',
  'steven',
  'bird',
  'and',
  'edward',
  'loper',
  'in',
  'the',
  'department',
  'of',
  'computer',
  'and',
  'information',
  'science',
  'at',
  'the',
  'university',
  'of',
  'pennsylvania',
  '.'],
 ['nltk',
  'includes',
  'graphical',
  'demonstrations',
  'and',
  'sample',
  'data',
  '.'],
 ['it',
  'is',
  'accompanied',
  'by',
  'a',
  

In [14]:
for i in range(len(sentences)):
    sentences[i] = [word for word in sentences[i] if word not in stopwords.words('english')]

In [15]:
sentences

[['natural',
  'language',
  'toolkit',
  ',',
  'commonly',
  'nltk',
  ',',
  'suite',
  'libraries',
  'programs',
  'symbolic',
  'statistical',
  'natural',
  'language',
  'processing',
  '(',
  'nlp',
  ')',
  'english',
  'written',
  'python',
  'programming',
  'language',
  '.'],
 ['supports',
  'classification',
  ',',
  'tokenization',
  ',',
  'stemming',
  ',',
  'tagging',
  ',',
  'parsing',
  ',',
  'semantic',
  'reasoning',
  'functionalities',
  '.'],
 ['developed',
  'steven',
  'bird',
  'edward',
  'loper',
  'department',
  'computer',
  'information',
  'science',
  'university',
  'pennsylvania',
  '.'],
 ['nltk', 'includes', 'graphical', 'demonstrations', 'sample', 'data', '.'],
 ['accompanied',
  'book',
  'explains',
  'underlying',
  'concepts',
  'behind',
  'language',
  'processing',
  'tasks',
  'supported',
  'toolkit',
  ',',
  'plus',
  'cookbook',
  '.'],
 ['nltk',
  'intended',
  'support',
  'research',
  'teaching',
  'nlp',
  'closely',
  'rel

In [16]:
# Training the Word2Vec model
model = Word2Vec(sentences, min_count = 1)

In [21]:
words = model.wv.index_to_key

In [22]:
words

[',',
 '.',
 'nltk',
 'language',
 'natural',
 'tool',
 'nlp',
 'teaching',
 'processing',
 'research',
 'information',
 'science',
 'toolkit',
 'functionalities',
 'tagging',
 'parsing',
 'semantic',
 'reasoning',
 'computer',
 'developed',
 'department',
 'bird',
 'university',
 'edward',
 'loper',
 'steven',
 'supports',
 'stemming',
 '(',
 'commonly',
 'suite',
 'libraries',
 'programs',
 'symbolic',
 'statistical',
 ')',
 'tokenization',
 'english',
 'written',
 'python',
 'programming',
 'includes',
 'classification',
 'pennsylvania',
 'courses',
 'graphical',
 'using',
 'cognitive',
 'artificial',
 'intelligence',
 'retrieval',
 'machine',
 'learning',
 'used',
 'successfully',
 'individual',
 'study',
 'platform',
 'prototyping',
 'building',
 'systems',
 'universities',
 'us',
 'countries',
 'linguistics',
 'empirical',
 'including',
 'behind',
 'sample',
 'data',
 'accompanied',
 'book',
 'explains',
 'underlying',
 'concepts',
 'tasks',
 'areas',
 'supported',
 'plus',
 'coo

In [23]:
# Finding word vectors of words
vector = model.wv['nlp']

In [24]:
vector

array([ 8.1726359e-03, -4.4368720e-03, -1.0604985e-03,  1.0047242e-03,
       -1.2601112e-04,  1.0813555e-03,  6.1618220e-03,  5.6514840e-05,
       -3.2469102e-03, -1.5419513e-03,  5.9045628e-03,  1.4599644e-03,
       -7.3814840e-04,  9.3515525e-03, -4.9035028e-03, -8.4453251e-04,
        9.1982149e-03,  6.7283651e-03,  1.5420523e-03, -8.9206202e-03,
        1.2204704e-03, -2.2575150e-03,  9.3878973e-03,  1.1855743e-03,
        1.4732252e-03,  2.4094405e-03, -1.8716689e-03, -4.9765739e-03,
        2.4388260e-04, -2.0583884e-03,  6.6065439e-03,  8.9358194e-03,
       -6.2980177e-04,  2.9127258e-03, -6.1295796e-03,  1.7528301e-03,
       -6.8577281e-03, -8.7177018e-03, -5.9367321e-03, -8.9980662e-03,
        7.2956597e-03, -5.7945363e-03,  8.3071953e-03, -7.2237873e-03,
        3.4168113e-03,  9.7098676e-03, -7.8436257e-03, -9.9321278e-03,
       -4.2758929e-03, -2.6890978e-03, -2.4801286e-04, -8.8512450e-03,
       -8.6515909e-03,  2.8155039e-03, -8.2360683e-03, -9.0807872e-03,
      

In [25]:
len(vector)

100

In [26]:
# Most similar words
similar = model.wv.most_similar('nlp')

In [27]:
similar

[('english', 0.3501066565513611),
 ('(', 0.3038422167301178),
 ('cognitive', 0.25153568387031555),
 ('underlying', 0.2241838127374649),
 ('supports', 0.1774691939353943),
 ('semantic', 0.16476021707057953),
 ('steven', 0.16339156031608582),
 ('classification', 0.15500327944755554),
 ('us', 0.15340973436832428),
 ('nltk', 0.14661931991577148)]

In [28]:
model.wv.most_similar('machine')

[('science', 0.31321951746940613),
 ('countries', 0.2339349240064621),
 ('programs', 0.2239091396331787),
 ('.', 0.18436744809150696),
 ('reasoning', 0.17269477248191833),
 ('steven', 0.16309760510921478),
 ('symbolic', 0.15654703974723816),
 ('computer', 0.15599776804447174),
 ('bird', 0.12011037766933441),
 ('commonly', 0.11530475318431854)]

In [32]:
model.wv.most_similar('stemming')

[('processing', 0.2852904498577118),
 ('using', 0.27052703499794006),
 ('programming', 0.25975358486175537),
 ('prototyping', 0.25381964445114136),
 ('related', 0.2050858438014984),
 ('.', 0.1885765641927719),
 ('support', 0.16728731989860535),
 ('suite', 0.1421465426683426),
 ('areas', 0.14012593030929565),
 ('tagging', 0.10794893652200699)]