In [3]:
#! pip install nltk

In [4]:
#! pip show nltk

In [5]:
ls

nlp_TP1.ipynb


Import Libraries

In [6]:
import pandas as pd
import nltk
import re   # Regular expression
#nltk.download('punkt')  # Used for tokenisation
#nltk.download('wordnet')
#nltk.download('omw-1.4')
#nltk.download('stopwords')

from nltk.corpus import stopwords   # used for stop words
from nltk.stem.porter import PorterStemmer  # Used for stemming
from nltk.stem.wordnet import WordNetLemmatizer # Used for Lemmatization 
from nltk.tokenize import sent_tokenize, word_tokenize 


Input text

In [7]:
text="Natural Language Processing is an exciting area. Huge budget have been allocated for this." 

Tokenisation

In [8]:
# Sentence tokenize
print(sent_tokenize(text))
# Word tokenize
print(word_tokenize(text))

['Natural Language Processing is an exciting area.', 'Huge budget have been allocated for this.']
['Natural', 'Language', 'Processing', 'is', 'an', 'exciting', 'area', '.', 'Huge', 'budget', 'have', 'been', 'allocated', 'for', 'this', '.']


Lower case convertion

In [9]:
text = re.sub(r"[^a-zA-Z0-9]"," ",text.lower()) 
words = text.split() 
print(words) 


['natural', 'language', 'processing', 'is', 'an', 'exciting', 'area', 'huge', 'budget', 'have', 'been', 'allocated', 'for', 'this']


Stop words removal

In [10]:
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
final_words=[]
for w in words: 
    if w not in stopwords.words("english"):
        final_words.append(w)

In [12]:
words = [w for w in words if w not in stopwords.words("english")]

In [13]:
print(words)

['natural', 'language', 'processing', 'exciting', 'area', 'huge', 'budget', 'allocated']


Stemming

In [14]:
stemmer = PorterStemmer()
stemmer.stem('allocated')

'alloc'

In [15]:
stemmed = [PorterStemmer().stem(w) for w in words]
print(stemmed)

['natur', 'languag', 'process', 'excit', 'area', 'huge', 'budget', 'alloc']


Lemmatisation

In [16]:
lemmed = [WordNetLemmatizer().lemmatize(w) for w in words]
print(lemmed)

['natural', 'language', 'processing', 'exciting', 'area', 'huge', 'budget', 'allocated']


Executing in a single cell

In [17]:
text="Natural Language Processing is an exciting area. Huge budget have been allocated for this." 
text = re.sub(r"[^a-zA-Z0-9]"," ",text.lower()) # Removing special charaters and lowering 
words = text.split() # Tokenization activity executed 
stopText =[w for w in words if w not in stopwords.words("english")] # stopwords executed 
finalWords=[WordNetLemmatizer().lemmatize(w) for w in stopText] # Lemmatization executed 

Parts of speech

In [18]:
#nltk.download('averaged_perceptron_tagger') 


In [19]:
text="Natural Language Processing is an exciting area. Huge budget have been allocated for this." 
tokenized = sent_tokenize(text) 
for i in tokenized:
    wordList = word_tokenize(i) 
    wordList = [w for w in wordList if w not in stopwords.words("english")] 
    tagged = nltk.pos_tag(wordList) 
    print(tagged) 

[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('exciting', 'VBG'), ('area', 'NN'), ('.', '.')]
[('Huge', 'NNP'), ('budget', 'NN'), ('allocated', 'VBD'), ('.', '.')]


Bag of words

In [20]:
doc1 ="Game of Thrones is an amazing tv series!"
doc2 ="Game of Thrones is the best tv series!"
doc3 ="Game of Thrones is so great"

In [21]:
l_doc1 = re.sub(r"[^a-zA-Z0-9]"," ",doc1.lower()).split()
l_doc2 = re.sub(r"[^a-zA-Z0-9]"," ",doc2.lower()).split()
l_doc3 = re.sub(r"[^a-zA-Z0-9]"," ",doc3.lower()).split()

In [22]:
l_doc1

['game', 'of', 'thrones', 'is', 'an', 'amazing', 'tv', 'series']

In [23]:
from sklearn.feature_extraction.text import CountVectorizer 
vectorizer = CountVectorizer(stop_words='english') 
x = vectorizer.fit_transform([doc1,doc2,doc3]) 
dfBow= pd.DataFrame(x.toarray(),columns=vectorizer.get_feature_names_out()) 
dfBow.head() 

Unnamed: 0,amazing,best,game,great,series,thrones,tv
0,1,0,1,0,1,1,1
1,0,1,1,0,1,1,1
2,0,0,1,1,0,1,0


In [24]:
vectorizer.vocabulary_

{'game': 2,
 'thrones': 5,
 'amazing': 0,
 'tv': 6,
 'series': 4,
 'best': 1,
 'great': 3}

TF-IDF

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
d1 = "petrol cars are cheaper than diesel cars"
d2 = "diesel is cheaper than petrol"

In [28]:
doc_corpus=[d1,d2]

In [38]:
tfVec = TfidfVectorizer(stop_words="english")
tfVec_Fit = tfVec.fit_transform(doc_corpus)
print(f"feature name found - {tfVec.get_feature_names_out()}")

feature name found - ['cars' 'cheaper' 'diesel' 'petrol']


In [39]:
print(f"feature name found - {tfVec.get_feature_names_out()}")

feature name found - ['cars' 'cheaper' 'diesel' 'petrol']


In [35]:
pd.DataFrame(tfVec_Fit.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.616664,0.308332,0.308332,0.219381,0.308332,0.308332,0.0,0.219381,0.308332,0.219381
1,0.0,0.0,0.0,0.448321,0.0,0.0,0.630099,0.448321,0.0,0.448321


Executing in a single cell

In [33]:
d1="petrol cars are cheaper than diesel cars" 
d2="diesel is cheaper than petrol" 
doc_corpus=[d1,d2] 
tfVec=TfidfVectorizer(stop_words='english',ngram_range=(1,3),max_features=10) 
tfVec_Fit =tfVec.fit_transform(doc_corpus) 
print(f"Feature Name found - {tfVec.get_feature_names_out()}") 
dfTFIDF= pd.DataFrame(tfVec_Fit.toarray(),columns=tfVec.get_feature_names_out()) 
dfTFIDF.head() 

Feature Name found - ['cars' 'cars cheaper' 'cars cheaper diesel' 'cheaper' 'cheaper diesel'
 'cheaper diesel cars' 'cheaper petrol' 'diesel' 'diesel cars' 'petrol']


Unnamed: 0,cars,cars cheaper,cars cheaper diesel,cheaper,cheaper diesel,cheaper diesel cars,cheaper petrol,diesel,diesel cars,petrol
0,0.616664,0.308332,0.308332,0.219381,0.308332,0.308332,0.0,0.219381,0.308332,0.219381
1,0.0,0.0,0.0,0.448321,0.0,0.0,0.630099,0.448321,0.0,0.448321


Cosine Similarity

In [42]:
import numpy as np
from numpy.linalg import norm

d1 = "The food is good and great"
d2 = "The food is not good"

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit_transform([d1,d2]) 
dfBow= pd.DataFrame(x.toarray(),columns=vectorizer.get_feature_names_out()) 
dfBow.head() 

tfVec=TfidfVectorizer(stop_words='english') 
tfVec_Fit =tfVec.fit_transform([d1,d2]) 
print(f"Feature Name found - {tfVec.get_feature_names_out()}") 
dfTFIDF= pd.DataFrame(tfVec_Fit.toarray(),columns=tfVec.get_feature_names_out()) 
dfTFIDF.head()

d1_count=[0.000000,0.449436,0.631667,0.631667] 
d2_count=[0.814802,0.579739,0.000000,0.000000] 
cosine = np.dot(d1_count,d2_count)/(norm(d1_count)*norm(d2_count)) 
print(f"Cosine Similarity is {cosine}") 

Feature Name found - ['food' 'good' 'great']
Cosine Similarity is 0.2605557435429248


Unnamed: 0,and,food,good,great,is,not,the
0,1,0,1,0,1,1,1
1,0,1,1,0,1,1,1
2,0,0,1,1,0,1,0
