In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shree\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shree\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shree\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\shree\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [3]:
corpus = "This assignment is a text analytics one. Here, processes like tokenization, POS-tagging, Stemming, Lemmatization, etc. have been applied. It also includes TFIDF."

In [4]:
# Tokenization
from nltk import word_tokenize, sent_tokenize
print(word_tokenize(corpus))

['This', 'assignment', 'is', 'a', 'text', 'analytics', 'one', '.', 'Here', ',', 'processes', 'like', 'tokenization', ',', 'POS-tagging', ',', 'Stemming', ',', 'Lemmatization', ',', 'etc', '.', 'have', 'been', 'applied', '.', 'It', 'also', 'includes', 'TFIDF', '.']


In [5]:
print(sent_tokenize(corpus))

['This assignment is a text analytics one.', 'Here, processes like tokenization, POS-tagging, Stemming, Lemmatization, etc.', 'have been applied.', 'It also includes TFIDF.']


In [6]:
# POS Tagging
from nltk import pos_tag
tokens = word_tokenize(corpus)
print(pos_tag(tokens))

[('This', 'DT'), ('assignment', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('text', 'JJ'), ('analytics', 'NN'), ('one', 'CD'), ('.', '.'), ('Here', 'RB'), (',', ','), ('processes', 'NNS'), ('like', 'IN'), ('tokenization', 'NN'), (',', ','), ('POS-tagging', 'NNP'), (',', ','), ('Stemming', 'NNP'), (',', ','), ('Lemmatization', 'NNP'), (',', ','), ('etc', 'NN'), ('.', '.'), ('have', 'VBP'), ('been', 'VBN'), ('applied', 'VBN'), ('.', '.'), ('It', 'PRP'), ('also', 'RB'), ('includes', 'VBZ'), ('TFIDF', 'NNP'), ('.', '.')]


In [11]:
# Stopword removal
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
tokens = word_tokenize(corpus)
cleaned_tokens = []
for token in tokens:
    if token not in stop_words:
        cleaned_tokens.append(token)
print(cleaned_tokens)

['This', 'assignment', 'text', 'analytics', 'one', '.', 'Here', ',', 'processes', 'like', 'tokenization', ',', 'POS-tagging', ',', 'Stemming', ',', 'Lemmatization', ',', 'etc', '.', 'applied', '.', 'It', 'also', 'includes', 'TFIDF', '.']


In [12]:
# Stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
tokens = word_tokenize(corpus)
stemmed_tokens = []
for token in tokens:
    stem_token = stemmer.stem(token)
    stemmed_tokens.append(stem_token)
print(stemmed_tokens)

['thi', 'assign', 'is', 'a', 'text', 'analyt', 'one', '.', 'here', ',', 'process', 'like', 'token', ',', 'pos-tag', ',', 'stem', ',', 'lemmat', ',', 'etc', '.', 'have', 'been', 'appli', '.', 'it', 'also', 'includ', 'tfidf', '.']


In [13]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
tokens = word_tokenize(corpus)
lemmatized_tokens = []
for token in tokens:
    lemm_token = lemmatizer.lemmatize(token)
    lemmatized_tokens.append(lemm_token)
print(lemmatized_tokens)

['This', 'assignment', 'is', 'a', 'text', 'analytics', 'one', '.', 'Here', ',', 'process', 'like', 'tokenization', ',', 'POS-tagging', ',', 'Stemming', ',', 'Lemmatization', ',', 'etc', '.', 'have', 'been', 'applied', '.', 'It', 'also', 'includes', 'TFIDF', '.']


In [16]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ["This assignment is a text analytics one.", 
          "Here, processes like tokenization, POS-tagging, Stemming, Lemmatization, etc. have been applied." ,
          "It also includes TFIDF."]
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit(corpus)
matrix.vocabulary_

{'this': 20,
 'assignment': 3,
 'is': 9,
 'text': 18,
 'analytics': 1,
 'one': 13,
 'here': 7,
 'processes': 15,
 'like': 12,
 'tokenization': 21,
 'pos': 14,
 'tagging': 17,
 'stemming': 16,
 'lemmatization': 11,
 'etc': 5,
 'have': 6,
 'been': 4,
 'applied': 2,
 'it': 10,
 'also': 0,
 'includes': 8,
 'tfidf': 19}

In [17]:
tfidf_matrix = vectorizer.transform(corpus)

In [18]:
print(tfidf_matrix)

  (0, 20)	0.4082482904638631
  (0, 18)	0.4082482904638631
  (0, 13)	0.4082482904638631
  (0, 9)	0.4082482904638631
  (0, 3)	0.4082482904638631
  (0, 1)	0.4082482904638631
  (1, 21)	0.2886751345948129
  (1, 17)	0.2886751345948129
  (1, 16)	0.2886751345948129
  (1, 15)	0.2886751345948129
  (1, 14)	0.2886751345948129
  (1, 12)	0.2886751345948129
  (1, 11)	0.2886751345948129
  (1, 7)	0.2886751345948129
  (1, 6)	0.2886751345948129
  (1, 5)	0.2886751345948129
  (1, 4)	0.2886751345948129
  (1, 2)	0.2886751345948129
  (2, 19)	0.5
  (2, 10)	0.5
  (2, 8)	0.5
  (2, 0)	0.5


In [19]:
print(vectorizer.get_feature_names_out())

['also' 'analytics' 'applied' 'assignment' 'been' 'etc' 'have' 'here'
 'includes' 'is' 'it' 'lemmatization' 'like' 'one' 'pos' 'processes'
 'stemming' 'tagging' 'text' 'tfidf' 'this' 'tokenization']
