# Tokenization

In [90]:
import nltk

In [92]:
# Sample text used throughout the notebook: two full sentences followed by an attribution line.
paragraph = "It is a curious thing, Harry, but perhaps those who are best suited to power are those who have never sought it. Those who, like you, have leadership thrust upon them, and take up the mantle because they must, and find to their own surprise that they wear it well. — Albus Dumbledore"


In [93]:
nltk.download('punkt') #Punkt tokenizer models, needed by sent_tokenize/word_tokenize below (tokenization, not stemming)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [94]:
# Split the paragraph into a list of sentence strings and display it.
sentences = nltk.sent_tokenize(paragraph) #paragraph -> list of sentences
sentences

['It is a curious thing, Harry, but perhaps those who are best suited to power are those who have never sought it.',
 'Those who, like you, have leadership thrust upon them, and take up the mantle because they must, and find to their own surprise that they wear it well.',
 '— Albus Dumbledore']

In [95]:
# Tokenize the whole paragraph at once; punctuation marks come back as separate tokens.
words=nltk.word_tokenize(paragraph) #paragraph -> flat list of word/punctuation tokens
words

['It',
 'is',
 'a',
 'curious',
 'thing',
 ',',
 'Harry',
 ',',
 'but',
 'perhaps',
 'those',
 'who',
 'are',
 'best',
 'suited',
 'to',
 'power',
 'are',
 'those',
 'who',
 'have',
 'never',
 'sought',
 'it',
 '.',
 'Those',
 'who',
 ',',
 'like',
 'you',
 ',',
 'have',
 'leadership',
 'thrust',
 'upon',
 'them',
 ',',
 'and',
 'take',
 'up',
 'the',
 'mantle',
 'because',
 'they',
 'must',
 ',',
 'and',
 'find',
 'to',
 'their',
 'own',
 'surprise',
 'that',
 'they',
 'wear',
 'it',
 'well',
 '.',
 '—',
 'Albus',
 'Dumbledore']

# Stemming and Lemmatization

### Stemming
##### Reducing words to their word stem (the stem itself need not be a real word)

###### history, historical → histori
###### final, finally, finalized → fina

In [96]:
from nltk.stem import PorterStemmer #for stemming
from nltk.corpus import stopwords #stop words are common words that add little value here, e.g. on, from, the

In [97]:
stemmer = PorterStemmer() #single stemmer instance, reused by the stemming, bag-of-words and TF-IDF cells

In [98]:
# Fetch the stop-word lists that stopwords.words('english') reads in the cells below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [99]:
# Stem every non-stop-word token of each sentence and overwrite the sentence
# with the space-joined stems.
# The stop-word set is built once up front; previously
# set(stopwords.words('english')) was rebuilt for every single token.
# NOTE(review): the membership test is case-sensitive, so capitalised stop
# words such as "It" and "Those" are not removed (visible in the recorded output).
# NOTE(review): `sentences` is modified in place, so running this cell twice
# stems the already-stemmed text again.
english_stopwords = set(stopwords.words('english'))
for i, sentence in enumerate(sentences):
  words = nltk.word_tokenize(sentence) #word/punctuation tokens of this sentence
  words = [stemmer.stem(word) for word in words if word not in english_stopwords] #drop stop words, stem the rest
  sentences[i] = ' '.join(words)

In [100]:
sentences

['It curiou thing , harri , perhap best suit power never sought .',
 'those , like , leadership thrust upon , take mantl must , find surpris wear well .',
 '— albu dumbledor']

## Lemmatizing
##### Reducing words to their base form (lemma), which, unlike a stem, is always a meaningful word
###### history, historical → history
###### final, finally, finalized → final

In [101]:
from nltk.stem import WordNetLemmatizer #for lemmatization
from nltk.corpus import stopwords #stop words are common words that add little value here, e.g. on, from, the

In [102]:
"""para->sentences->words"""

'para->sentences->words'

In [103]:
# Same sample text again; `sentences` is rebuilt from it in the next cell because the stemming loop overwrote that list in place.
paragraph = "It is a curious thing, Harry, but perhaps those who are best suited to power are those who have never sought it. Those who, like you, have leadership thrust upon them, and take up the mantle because they must, and find to their own surprise that they wear it well. — Albus Dumbledore"


In [104]:
# Start again from the untouched sentences before lemmatizing.
sentences = nltk.sent_tokenize(paragraph) #paragraph -> list of sentences
sentences

['It is a curious thing, Harry, but perhaps those who are best suited to power are those who have never sought it.',
 'Those who, like you, have leadership thrust upon them, and take up the mantle because they must, and find to their own surprise that they wear it well.',
 '— Albus Dumbledore']

In [105]:
lemmatizer = WordNetLemmatizer() #single lemmatizer instance used by the loop below


In [106]:
# Fetch the WordNet data that WordNetLemmatizer looks words up in.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [107]:
# Lemmatize every non-stop-word token of each sentence and overwrite the
# sentence with the space-joined lemmas.
# The stop-word set is built once up front; previously
# set(stopwords.words('english')) was rebuilt for every single token.
# NOTE(review): the membership test is case-sensitive, so capitalised stop
# words such as "It" and "Those" are not removed (visible in the recorded output).
# NOTE(review): `sentences` is modified in place, so re-running this cell
# processes the already-filtered text again.
english_stopwords = set(stopwords.words('english'))
for i, sentence in enumerate(sentences):
  words = nltk.word_tokenize(sentence) #word/punctuation tokens of this sentence
  words = [lemmatizer.lemmatize(word) for word in words if word not in english_stopwords] #drop stop words, lemmatize the rest
  sentences[i] = ' '.join(words)

In [108]:
sentences

['It curious thing , Harry , perhaps best suited power never sought .',
 'Those , like , leadership thrust upon , take mantle must , find surprise wear well .',
 '— Albus Dumbledore']

# Bag of Words
Every word gets equal weight: a sentence is represented only by how many times each vocabulary word occurs in it.

In [109]:
import re #regular expression(re) for cleaning the texts->punctuation marks, capital letter etc

In [110]:
# Same sample text again, used as the raw input of the bag-of-words section.
paragraph = "It is a curious thing, Harry, but perhaps those who are best suited to power are those who have never sought it. Those who, like you, have leadership thrust upon them, and take up the mantle because they must, and find to their own surprise that they wear it well. — Albus Dumbledore"


In [111]:
# Rebuild the raw sentences; the lemmatization loop overwrote the previous list in place.
sentences = nltk.sent_tokenize(paragraph) #paragraph -> list of sentences
sentences

['It is a curious thing, Harry, but perhaps those who are best suited to power are those who have never sought it.',
 'Those who, like you, have leadership thrust upon them, and take up the mantle because they must, and find to their own surprise that they wear it well.',
 '— Albus Dumbledore']

In [112]:
# Collects one cleaned, stemmed string per sentence; filled by the loop in the next cell.
corpus = []

In [113]:
# Build the cleaned corpus: one lower-cased, stop-word-free, stemmed string
# per sentence.
# The stop-word set is built once instead of once per token, and `corpus` is
# reset here so that re-running this cell does not append duplicate rows.
english_stopwords = set(stopwords.words('english'))
corpus = []
for sentence in sentences:
    # Replace everything that is not an ASCII letter with a space, then
    # lower-case and split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Lower-casing happens before the filter, so capitalised stop words are removed too.
    words = ' '.join(stemmer.stem(word) for word in words if word not in english_stopwords)
    corpus.append(words)

In [114]:
corpus

['curiou thing harri perhap best suit power never sought',
 'like leadership thrust upon take mantl must find surpris wear well',
 'albu dumbledor']

In [116]:
#Bag of words feature extraction
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
X

array([[0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

# TF-IDF
##### Term Frequency (TF) = number of occurrences of a word in a sentence / number of words in that sentence
##### Inverse Document Frequency (IDF) = log(number of sentences / number of sentences containing that word)
##### TF-IDF = TF * IDF
Pros: the relative importance of a word is preserved.
Cons: semantic information and word order are not stored.

In [117]:
# Same sample text again, used as the raw input of the TF-IDF section.
paragraph = "It is a curious thing, Harry, but perhaps those who are best suited to power are those who have never sought it. Those who, like you, have leadership thrust upon them, and take up the mantle because they must, and find to their own surprise that they wear it well. — Albus Dumbledore"


In [118]:
# Split the paragraph into a list of sentence strings for the TF-IDF corpus.
sentences = nltk.sent_tokenize(paragraph) #paragraph -> list of sentences
sentences

['It is a curious thing, Harry, but perhaps those who are best suited to power are those who have never sought it.',
 'Those who, like you, have leadership thrust upon them, and take up the mantle because they must, and find to their own surprise that they wear it well.',
 '— Albus Dumbledore']

In [119]:
# Start from an empty corpus; the loop in the next cell appends one cleaned string per sentence.
corpus = []

In [120]:
# Build the cleaned corpus: one lower-cased, stop-word-free, stemmed string
# per sentence.
# The stop-word set is built once instead of once per token, and `corpus` is
# reset here so that re-running this cell does not append duplicate rows.
english_stopwords = set(stopwords.words('english'))
corpus = []
for sentence in sentences:
    # Replace everything that is not an ASCII letter with a space, then
    # lower-case and split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Lower-casing happens before the filter, so capitalised stop words are removed too.
    words = ' '.join(stemmer.stem(word) for word in words if word not in english_stopwords)
    corpus.append(words)

In [121]:
corpus

['curiou thing harri perhap best suit power never sought',
 'like leadership thrust upon take mantl must find surpris wear well',
 'albu dumbledor']

In [122]:
#TFIDF feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X = cv.fit_transform(corpus).toarray()
X

array([[0.        , 0.33333333, 0.33333333, 0.        , 0.        ,
        0.33333333, 0.        , 0.        , 0.        , 0.        ,
        0.33333333, 0.33333333, 0.33333333, 0.33333333, 0.33333333,
        0.        , 0.        , 0.33333333, 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.30151134,
        0.        , 0.30151134, 0.30151134, 0.30151134, 0.30151134,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.30151134, 0.30151134, 0.        , 0.30151134, 0.30151134,
        0.30151134, 0.30151134],
       [0.70710678, 0.        , 0.        , 0.70710678, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ]])

# Word2Vec
### Each word is represented as a vector of 32 or more dimensions
##### Semantic information is preserved
##### Relations between different words are also preserved
##### Intended for large amounts of data

In [129]:
from gensim.models import Word2Vec

In [130]:
# Same sample text again, used as the raw input of the Word2Vec section.
paragraph = "It is a curious thing, Harry, but perhaps those who are best suited to power are those who have never sought it. Those who, like you, have leadership thrust upon them, and take up the mantle because they must, and find to their own surprise that they wear it well. — Albus Dumbledore"


In [146]:
# Preprocessing the data
text = re.sub(r'\[[0-9]*\]',' ',paragraph)
text = re.sub(r'\s+',' ',text)
text = text.lower()
text = re.sub(r'\d',' ',text)
text = re.sub(r'\s+',' ',text)
text

'it is a curious thing, harry, but perhaps those who are best suited to power are those who have never sought it. those who, like you, have leadership thrust upon them, and take up the mantle because they must, and find to their own surprise that they wear it well. — albus dumbledore'

In [147]:
# Build the Word2Vec training input: a list of sentences, each a list of tokens.
# NOTE(review): this tokenizes the raw `paragraph`, not the cleaned `text`
# from the previous cell, so the lower-casing/clean-up is never applied (the
# recorded output still contains 'It', 'Harry', ...). Later cells look up the
# keys 'Harry' and 'Albus', so switching to `text` would require changing
# those lookups as well.
sentences = nltk.sent_tokenize(paragraph) #paragraph -> list of sentences
sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
sentences

[['It',
  'is',
  'a',
  'curious',
  'thing',
  ',',
  'Harry',
  ',',
  'but',
  'perhaps',
  'those',
  'who',
  'are',
  'best',
  'suited',
  'to',
  'power',
  'are',
  'those',
  'who',
  'have',
  'never',
  'sought',
  'it',
  '.'],
 ['Those',
  'who',
  ',',
  'like',
  'you',
  ',',
  'have',
  'leadership',
  'thrust',
  'upon',
  'them',
  ',',
  'and',
  'take',
  'up',
  'the',
  'mantle',
  'because',
  'they',
  'must',
  ',',
  'and',
  'find',
  'to',
  'their',
  'own',
  'surprise',
  'that',
  'they',
  'wear',
  'it',
  'well',
  '.'],
 ['—', 'Albus', 'Dumbledore']]

In [153]:
# Train Word2Vec on the tokenized sentences. min_count=1 keeps tokens that
# occur only once, which is most of this tiny corpus.
# NOTE(review): no seed or worker count is pinned, so the learned vectors may
# differ between runs — confirm whether reproducibility matters here.
model = Word2Vec(sentences,min_count=1)

In [154]:
# Vocabulary learned by the model, as a mapping keyed by token.
# gensim 4 removed `KeyedVectors.vocab` (accessing it raises AttributeError)
# and exposes the token -> index mapping as `key_to_index`; use that when it
# exists and fall back to `.vocab` on gensim 3.x.
words = model.wv.key_to_index if hasattr(model.wv, 'key_to_index') else model.wv.vocab
words

{',': <gensim.models.keyedvectors.Vocab at 0x7f89426bcd10>,
 '.': <gensim.models.keyedvectors.Vocab at 0x7f89426bc610>,
 'Albus': <gensim.models.keyedvectors.Vocab at 0x7f89426c7210>,
 'Dumbledore': <gensim.models.keyedvectors.Vocab at 0x7f89426c7250>,
 'Harry': <gensim.models.keyedvectors.Vocab at 0x7f89426bcd90>,
 'It': <gensim.models.keyedvectors.Vocab at 0x7f89426fb350>,
 'Those': <gensim.models.keyedvectors.Vocab at 0x7f89426bc310>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7f89426fb610>,
 'and': <gensim.models.keyedvectors.Vocab at 0x7f89426bc850>,
 'are': <gensim.models.keyedvectors.Vocab at 0x7f89426bc490>,
 'because': <gensim.models.keyedvectors.Vocab at 0x7f89426bcf90>,
 'best': <gensim.models.keyedvectors.Vocab at 0x7f89426bce10>,
 'but': <gensim.models.keyedvectors.Vocab at 0x7f89426bcdd0>,
 'curious': <gensim.models.keyedvectors.Vocab at 0x7f89426fba10>,
 'find': <gensim.models.keyedvectors.Vocab at 0x7f89426bccd0>,
 'have': <gensim.models.keyedvectors.Vocab at 0x7f8942

In [156]:
# Finding Word Vectors
vector = model.wv['Harry']
vector

array([ 3.9526876e-03,  1.5921223e-03, -1.3191490e-03, -1.8989183e-03,
        3.3463233e-03, -1.1378512e-03, -2.2288687e-03,  4.7575180e-05,
        3.6962344e-03,  2.8708293e-03, -1.5139026e-03, -4.5045237e-03,
        4.8050203e-04, -2.4802426e-03,  2.5282789e-03, -1.5259833e-03,
        2.0056148e-04, -9.7129692e-04, -1.9343839e-03, -2.9319010e-03,
        3.8353859e-03,  4.7365939e-03,  2.0140016e-03, -2.6226109e-03,
        1.0625650e-03,  4.2408104e-03, -3.1763522e-03, -2.2276549e-03,
       -7.5605931e-04, -4.0825736e-03, -1.4007360e-03,  3.4350662e-03,
        5.0195714e-04, -8.2094508e-04,  4.2661894e-03,  2.4861263e-03,
        2.5394286e-03, -4.1102073e-03, -3.7130830e-03, -3.0281932e-05,
        3.6196497e-03, -4.6496307e-03, -4.9489764e-03,  4.4391310e-04,
       -8.5361105e-05, -1.3290483e-03, -1.9469671e-03,  2.5937138e-03,
       -2.3872515e-03,  4.8614652e-03, -2.9620251e-03,  1.7729253e-03,
       -5.3148979e-04, -2.2150481e-03,  3.6895126e-03,  2.6991349e-03,
      

In [158]:
# Most similar words
similar = model.wv.most_similar('Albus')
similar

[('leadership', 0.19194930791854858),
 ('take', 0.17726856470108032),
 ('have', 0.15853197872638702),
 ('wear', 0.15545016527175903),
 ('sought', 0.13782142102718353),
 ('the', 0.12873026728630066),
 ('thrust', 0.12019222974777222),
 ('Harry', 0.11921392381191254),
 ('to', 0.1022951751947403),
 ('.', 0.0954347550868988)]