In [2]:
import nltk

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**Tokenizing**

In [5]:
from nltk import sent_tokenize,word_tokenize

In [6]:
sentence1="This is a sample document for text preprocessing. We will apply various methods like tokenization, POS tagging, stop words removal, stemming, and lemmatization."
sentence2="It seems you've mentioned sentence but didn't specify any particular task or operation you'd like to perform on it."

In [8]:
sent_tokenize(sentence1)


['This is a sample document for text preprocessing.',
 'We will apply various methods like tokenization, POS tagging, stop words removal, stemming, and lemmatization.']

In [9]:
word_tokenize(sentence1)

['This',
 'is',
 'a',
 'sample',
 'document',
 'for',
 'text',
 'preprocessing',
 '.',
 'We',
 'will',
 'apply',
 'various',
 'methods',
 'like',
 'tokenization',
 ',',
 'POS',
 'tagging',
 ',',
 'stop',
 'words',
 'removal',
 ',',
 'stemming',
 ',',
 'and',
 'lemmatization',
 '.']

**POS Tagging**

In [11]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [12]:
from nltk import pos_tag


In [14]:
token=word_tokenize(sentence1)+word_tokenize(sentence2)
tagged=pos_tag(token)
tagged

[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('sample', 'JJ'),
 ('document', 'NN'),
 ('for', 'IN'),
 ('text', 'NN'),
 ('preprocessing', 'NN'),
 ('.', '.'),
 ('We', 'PRP'),
 ('will', 'MD'),
 ('apply', 'VB'),
 ('various', 'JJ'),
 ('methods', 'NNS'),
 ('like', 'IN'),
 ('tokenization', 'NN'),
 (',', ','),
 ('POS', 'NNP'),
 ('tagging', 'NN'),
 (',', ','),
 ('stop', 'VB'),
 ('words', 'NNS'),
 ('removal', 'JJ'),
 (',', ','),
 ('stemming', 'VBG'),
 (',', ','),
 ('and', 'CC'),
 ('lemmatization', 'NN'),
 ('.', '.'),
 ('It', 'PRP'),
 ('seems', 'VBZ'),
 ('you', 'PRP'),
 ("'ve", 'VBP'),
 ('mentioned', 'VBN'),
 ('sentence', 'NN'),
 ('but', 'CC'),
 ('did', 'VBD'),
 ("n't", 'RB'),
 ('specify', 'VB'),
 ('any', 'DT'),
 ('particular', 'JJ'),
 ('task', 'NN'),
 ('or', 'CC'),
 ('operation', 'NN'),
 ('you', 'PRP'),
 ("'d", 'MD'),
 ('like', 'VB'),
 ('to', 'TO'),
 ('perform', 'VB'),
 ('on', 'IN'),
 ('it', 'PRP'),
 ('.', '.')]

Stop **Word** Removal

In [16]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [20]:
from nltk.corpus import stopwords
stwd=stopwords.words('english')
token=word_tokenize(sentence1)
cleaned_token=[]
for words in token:
  if words  not in stwd:
    cleaned_token.append(words)

token
cleaned_token

['This',
 'sample',
 'document',
 'text',
 'preprocessing',
 '.',
 'We',
 'apply',
 'various',
 'methods',
 'like',
 'tokenization',
 ',',
 'POS',
 'tagging',
 ',',
 'stop',
 'words',
 'removal',
 ',',
 'stemming',
 ',',
 'lemmatization',
 '.']

**Stemming**

In [22]:
from nltk.stem import PorterStemmer
stemmer=PorterStemmer()
token=word_tokenize(sentence1)
stemmed=[stemmer.stem(word) for word in token]
stemmed

['thi',
 'is',
 'a',
 'sampl',
 'document',
 'for',
 'text',
 'preprocess',
 '.',
 'we',
 'will',
 'appli',
 'variou',
 'method',
 'like',
 'token',
 ',',
 'po',
 'tag',
 ',',
 'stop',
 'word',
 'remov',
 ',',
 'stem',
 ',',
 'and',
 'lemmat',
 '.']

**Lemmatization**

In [23]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [24]:
from nltk.stem import WordNetLemmatizer
lemtizer=WordNetLemmatizer()
token=word_tokenize(sentence1)
lem=[lemtizer.lemmatize(word) for word in token]

lem

['This',
 'is',
 'a',
 'sample',
 'document',
 'for',
 'text',
 'preprocessing',
 '.',
 'We',
 'will',
 'apply',
 'various',
 'method',
 'like',
 'tokenization',
 ',',
 'POS',
 'tagging',
 ',',
 'stop',
 'word',
 'removal',
 ',',
 'stemming',
 ',',
 'and',
 'lemmatization',
 '.']

**SECTION B**

In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
document1="This is a sample document for text preprocessing.We will apply various methods like tokenization, POS tagging, stop words removal, stemming, and lemmatization."
document2="We preprocess the sample document using various methods such as tokenization, POS tagging, stop words removal, stemming, and lemmatization.We then calculate the TF-IDF representation of the document using scikit-learn's TfidfVectorizer class."

**Creating Bag of Words**

In [35]:
bagofwordsA=document1.split(' ')
bagofwordsB=document2.split(' ')

In [37]:
uniquewords=set(bagofwordsA).union(set(bagofwordsB))

In [46]:
noofwordsA=dict.fromkeys(uniquewords,0)
for word in bagofwordsA:
  noofwordsA[word]+=1


noofwordsB=dict.fromkeys(uniquewords,0)
for word in bagofwordsB:
  noofwordsB[word]+=1


**Computing Term Frequency (TF)**

In [49]:
def comptf(worddic,bagofwords):
  tfdict={}
  bagofwordscnt=len(bagofwords)
  for word,count in worddic.items():
    tfdict[word]=count/float(bagofwordscnt)
  return tfdict

In [50]:
tfA=comptf(noofwordsA,bagofwordsA)
tfB=comptf(noofwordsB,bagofwordsB)


In [51]:
tfA

{'TfidfVectorizer': 0.0,
 'various': 0.045454545454545456,
 'We': 0.0,
 'class.': 0.0,
 'like': 0.045454545454545456,
 'sample': 0.045454545454545456,
 'text': 0.045454545454545456,
 'as': 0.0,
 'This': 0.045454545454545456,
 'such': 0.0,
 'will': 0.045454545454545456,
 'words': 0.045454545454545456,
 'preprocess': 0.0,
 'then': 0.0,
 "scikit-learn's": 0.0,
 'stop': 0.045454545454545456,
 'a': 0.045454545454545456,
 'tokenization,': 0.045454545454545456,
 'preprocessing.We': 0.045454545454545456,
 'the': 0.0,
 'is': 0.045454545454545456,
 'lemmatization.We': 0.0,
 'using': 0.0,
 'for': 0.045454545454545456,
 'calculate': 0.0,
 'of': 0.0,
 'document': 0.045454545454545456,
 'removal,': 0.045454545454545456,
 'apply': 0.045454545454545456,
 'stemming,': 0.045454545454545456,
 'TF-IDF': 0.0,
 'and': 0.045454545454545456,
 'POS': 0.045454545454545456,
 'lemmatization.': 0.045454545454545456,
 'methods': 0.045454545454545456,
 'representation': 0.0,
 'tagging,': 0.045454545454545456}

In [52]:
tfB

{'TfidfVectorizer': 0.03225806451612903,
 'various': 0.03225806451612903,
 'We': 0.03225806451612903,
 'class.': 0.03225806451612903,
 'like': 0.0,
 'sample': 0.03225806451612903,
 'text': 0.0,
 'as': 0.03225806451612903,
 'This': 0.0,
 'such': 0.03225806451612903,
 'will': 0.0,
 'words': 0.03225806451612903,
 'preprocess': 0.03225806451612903,
 'then': 0.03225806451612903,
 "scikit-learn's": 0.03225806451612903,
 'stop': 0.03225806451612903,
 'a': 0.0,
 'tokenization,': 0.03225806451612903,
 'preprocessing.We': 0.0,
 'the': 0.0967741935483871,
 'is': 0.0,
 'lemmatization.We': 0.03225806451612903,
 'using': 0.06451612903225806,
 'for': 0.0,
 'calculate': 0.03225806451612903,
 'of': 0.03225806451612903,
 'document': 0.06451612903225806,
 'removal,': 0.03225806451612903,
 'apply': 0.0,
 'stemming,': 0.03225806451612903,
 'TF-IDF': 0.03225806451612903,
 'and': 0.03225806451612903,
 'POS': 0.03225806451612903,
 'lemmatization.': 0.0,
 'methods': 0.03225806451612903,
 'representation': 0.03

**Computing Inverse Document Frequecy**

In [53]:
def compidf(docs):
  import math
  N=len(docs)
  idfdict=dict.fromkeys(docs[0].keys(),0)
  for doc in docs:
    for word,val in doc.items():
      if val>0:
        idfdict[word]+=1
  for word,val in idfdict.items():
    idfdict[word]=math.log(N/float(val))
  return idfdict

In [54]:
idf=compidf([noofwordsA,noofwordsB])
idf

{'TfidfVectorizer': 0.6931471805599453,
 'various': 0.0,
 'We': 0.6931471805599453,
 'class.': 0.6931471805599453,
 'like': 0.6931471805599453,
 'sample': 0.0,
 'text': 0.6931471805599453,
 'as': 0.6931471805599453,
 'This': 0.6931471805599453,
 'such': 0.6931471805599453,
 'will': 0.6931471805599453,
 'words': 0.0,
 'preprocess': 0.6931471805599453,
 'then': 0.6931471805599453,
 "scikit-learn's": 0.6931471805599453,
 'stop': 0.0,
 'a': 0.6931471805599453,
 'tokenization,': 0.0,
 'preprocessing.We': 0.6931471805599453,
 'the': 0.6931471805599453,
 'is': 0.6931471805599453,
 'lemmatization.We': 0.6931471805599453,
 'using': 0.6931471805599453,
 'for': 0.6931471805599453,
 'calculate': 0.6931471805599453,
 'of': 0.6931471805599453,
 'document': 0.0,
 'removal,': 0.0,
 'apply': 0.6931471805599453,
 'stemming,': 0.0,
 'TF-IDF': 0.6931471805599453,
 'and': 0.0,
 'POS': 0.0,
 'lemmatization.': 0.6931471805599453,
 'methods': 0.0,
 'representation': 0.6931471805599453,
 'tagging,': 0.0}

**Computing TF/IDF**

In [56]:
def comptfidf(bagofwords,idf):
  tfidf={}
  for word,val in tfidf.items():
    tfidf[word]=val*idf[word]
  return tfidf

In [58]:
tfidfA=comptfidf(bagofwordsA,idf)
tfidfB=comptfidf(bagofwordsB,idf)
df=pd.DataFrame(tfidfA,tfidfB)
df