In [1]:
import nltk

# Sample paragraph used as the raw input text for every NLP step in this notebook.
para = """Dr. A P J Abdul Kalam was born to a poor Tamil Muslim family. He lived with his family in the temple city of Tamilnadu, Rameswaram, where his father, Jainulabdeen, had a boat and was an imam of a local mosque. At the same time, his mother, Ashiamma, was a housewife. Kalam had four brothers and one sister in his family, from which he was the youngest. Kalam's ancestors were wealthy traders and landowners and had vast land and property tracts. But with time, their business of ferrying pilgrims and trading groceries suffered huge losses due to the Pamban Bridge's opening. As a result, Kalam's family had become inadequate and struggled hard to make a living. At a tender age, Kalam had to sell newspapers to supplement his family income."""

In [20]:
# Split the raw paragraph into a list of sentence strings.
sentences = nltk.sent_tokenize(para)

In [8]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [12]:
# Preview NLTK's English stopword list.
# The corpus file id is the lowercase name 'english'; a capitalized id only
# resolves on case-insensitive filesystems and fails elsewhere.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [13]:
# Porter stemmer instance reused by the stemming cell below.
stemmer = PorterStemmer()

In [14]:
# Stem every token of every sentence and drop English stopwords.
#
# - The stopword list is loaded once into a set instead of being re-read from
#   the corpus for each token.
# - The result is rebuilt from `para` rather than mutating `sentences` in
#   place, so re-running this cell does not stem already-stemmed text.
# - NOTE: matching is case-sensitive against NLTK's lowercase list, so
#   capitalized stopwords (e.g. sentence-initial 'He', 'At') are kept.
stop_words = set(stopwords.words('english'))
sentences = [
    ' '.join(
        stemmer.stem(word)
        for word in nltk.word_tokenize(sentence)
        if word not in stop_words
    )
    for sentence in nltk.sent_tokenize(para)
]

In [15]:
# Show the sentences after stemming and stopword removal.
sentences


['dr. a p j abdul kalam born poor tamil muslim famili .',
 'he live famili templ citi tamilnadu , rameswaram , father , jainulabdeen , boat imam local mosqu .',
 'at time , mother , ashiamma , housewif .',
 'kalam four brother one sister famili , youngest .',
 "kalam 's ancestor wealthi trader landown vast land properti tract .",
 "but time , busi ferri pilgrim trade groceri suffer huge loss due pamban bridg 's open .",
 "as result , kalam 's famili becom inadequ struggl hard make live .",
 'at tender age , kalam sell newspap supplement famili incom .']

In [21]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize every token of every sentence and drop English stopwords.
#
# The sentences are re-tokenized from the raw paragraph here: the stemming
# cell above overwrites `sentences`, so without this a top-to-bottom run would
# lemmatize already-stemmed text instead of the original words.
# NOTE: stopword matching is case-sensitive, so capitalized stopwords such as
# 'He' or 'At' are kept.
stop_words = set(stopwords.words('english'))  # loaded once, not per token
sentences = [
    ' '.join(
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(sentence)
        if word not in stop_words
    )
    for sentence in nltk.sent_tokenize(para)
]

In [22]:
# Show the sentences after lemmatization and stopword removal.
sentences

['Dr. A P J Abdul Kalam born poor Tamil Muslim family .',
 'He lived family temple city Tamilnadu , Rameswaram , father , Jainulabdeen , boat imam local mosque .',
 'At time , mother , Ashiamma , housewife .',
 'Kalam four brother one sister family , youngest .',
 "Kalam 's ancestor wealthy trader landowner vast land property tract .",
 "But time , business ferrying pilgrim trading grocery suffered huge loss due Pamban Bridge 's opening .",
 "As result , Kalam 's family become inadequate struggled hard make living .",
 'At tender age , Kalam sell newspaper supplement family income .']

In [26]:
import re

# Build a cleaned corpus: one lowercase, letters-only, lemmatized,
# stopword-free string per sentence of the raw paragraph.
stop_words = set(stopwords.words('english'))  # built once instead of per word

sent = nltk.sent_tokenize(para)
corpus = []
for sentence in sent:
    # Replace every non-letter with a space, lowercase, then split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    corpus.append(
        ' '.join(lemmatizer.lemmatize(token) for token in tokens if token not in stop_words)
    )
    

In [27]:
# Show the cleaned, lemmatized corpus (one string per sentence).
corpus

['dr p j abdul kalam born poor tamil muslim family',
 'lived family temple city tamilnadu rameswaram father jainulabdeen boat imam local mosque',
 'time mother ashiamma housewife',
 'kalam four brother one sister family youngest',
 'kalam ancestor wealthy trader landowner vast land property tract',
 'time business ferrying pilgrim trading grocery suffered huge loss due pamban bridge opening',
 'result kalam family become inadequate struggled hard make living',
 'tender age kalam sell newspaper supplement family income']

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: learn the vocabulary from the cleaned corpus and count term
# occurrences per sentence, then densify the sparse result for display.
cv = CountVectorizer()
bow_sparse = cv.fit_transform(corpus)
x = bow_sparse.toarray()

In [29]:
# Show the dense bag-of-words count matrix (one row per sentence).
x

array([[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: same corpus as the bag-of-words cell, but weighted by TF-IDF.
# NOTE: this rebinds `cv` and `x`, replacing the CountVectorizer results.
cv = TfidfVectorizer()
tfidf_sparse = cv.fit_transform(corpus)
x = tfidf_sparse.toarray()

In [31]:
# Show the dense TF-IDF matrix (one row per sentence).
x

array([[0.38836637, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.38836637, 0.        , 0.        , 0.        ,
        0.        , 0.38836637, 0.        , 0.21797864, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.21797864, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.38836637, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.38836637, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.38836637, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.29728449, 0.        , 0.        , 0.        , 0.        ,
        0.29728449, 0.     

In [32]:
from gensim.models import Word2Vec

In [33]:
# Preprocess the raw paragraph before Word2Vec training:
#   1. blank out bracketed numeric markers like "[12]" and collapse whitespace,
#   2. lowercase,
#   3. blank out remaining digits and collapse whitespace again.
text = para
for pattern in (r'\[[0-9]*\]', r'\s+'):
    text = re.sub(pattern, ' ', text)

text = text.lower()

for pattern in (r'\d', r'\s+'):
    text = re.sub(pattern, ' ', text)


In [35]:
# Turn the preprocessed text into a list of token lists (one per sentence)
# with English stopwords removed; this is the input for Word2Vec.
# The stopword list is loaded once into a set instead of once per token.
stop_words = set(stopwords.words('english'))

sentences = [
    [word for word in nltk.word_tokenize(sentence) if word not in stop_words]
    for sentence in nltk.sent_tokenize(text)
]



In [47]:
# Train a Word2Vec model on the tokenized sentences.
# - min_count=1 keeps tokens that occur only once (the corpus is tiny).
# - seed is pinned and workers=1 so repeated runs in the same environment
#   give the same vectors; multi-threaded training introduces ordering jitter.
#   (Reproducibility across interpreter launches also needs PYTHONHASHSEED set.)
model = Word2Vec(sentences, min_count=1, seed=1, workers=1)

# Vocabulary tokens in the model's index order.
words = model.wv.index_to_key

# Learned embedding for the token 'family'.
vector = model.wv['family']

# Tokens most similar to 'family', as (token, similarity) pairs.
similar = model.wv.most_similar('family')

In [48]:
# Show the embedding vector learned for 'family'.
vector

array([-8.2370406e-03,  9.3296450e-03, -1.9822012e-04, -1.9443157e-03,
        4.5740190e-03, -4.1388380e-03,  2.7949121e-03,  6.9759577e-03,
        6.0637575e-03, -7.5613828e-03,  9.3733668e-03,  4.6398924e-03,
        3.9736610e-03, -6.2306989e-03,  8.4812650e-03, -2.1616959e-03,
        8.8310391e-03, -5.3536729e-03, -8.1465244e-03,  6.7958734e-03,
        1.7084298e-03, -2.1846087e-03,  9.5213559e-03,  9.4996300e-03,
       -9.7850636e-03,  2.4978202e-03,  6.1178165e-03,  3.8467243e-03,
        2.0034800e-03,  4.2993523e-04,  7.0274691e-04, -3.7919104e-03,
       -7.1029416e-03, -2.1304097e-03,  3.9437804e-03,  8.8379364e-03,
        9.2822891e-03, -5.9485766e-03, -9.4395094e-03,  9.7605055e-03,
        3.4402939e-03,  5.1469058e-03,  6.2662507e-03, -2.8039396e-03,
        7.3420065e-03,  2.7846354e-03,  2.8590383e-03, -2.4119117e-03,
       -3.0838603e-03, -2.3656725e-03,  4.2855805e-03,  4.0207913e-05,
       -9.5755244e-03, -9.7091943e-03, -6.1553055e-03, -1.2290425e-04,
      

In [49]:
# Show the tokens most similar to 'family' with their similarity scores.
similar

[('mother', 0.17878393828868866),
 ('newspapers', 0.16436806321144104),
 ('become', 0.14979305863380432),
 ('pilgrims', 0.13559643924236298),
 ('time', 0.13246169686317444),
 ('business', 0.12260954082012177),
 ('pamban', 0.09738108515739441),
 ('dr.', 0.0775262638926506),
 ('hard', 0.07751474529504776),
 ('vast', 0.07702312618494034)]