In [1]:
import nltk
nltk.download('stopwords')
# nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rohandoshi21/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
sentence1 = "Stemming and lemmatization are different techniques used to reduce words to their root form, but they produce varying results. Lemmatization is better than stemming"

In [3]:
import string
def Tokenise(sentence: str):
    punctuation = string.punctuation + '[]{}()<>'  # Add brackets to punctuation marks
    for char in punctuation:
        sentence = sentence.replace(char, " ")
    sentence = sentence.lower()
    tokens = sentence.split()
    return tokens
    
tokens = Tokenise(sentence1)
tokens

['stemming',
 'and',
 'lemmatization',
 'are',
 'different',
 'techniques',
 'used',
 'to',
 'reduce',
 'words',
 'to',
 'their',
 'root',
 'form',
 'but',
 'they',
 'produce',
 'varying',
 'results',
 'lemmatization',
 'is',
 'better',
 'than',
 'stemming']

In [4]:
def RemoveStopWords(token):
  stop_words = set(stopwords.words('english'))
  filtered_sentence=[word for word in tokens if not word in stop_words]

  return filtered_sentence

tokens = RemoveStopWords(tokens)
tokens

['stemming',
 'lemmatization',
 'different',
 'techniques',
 'used',
 'reduce',
 'words',
 'root',
 'form',
 'produce',
 'varying',
 'results',
 'lemmatization',
 'better',
 'stemming']

In [5]:
pos_tag_list = pos_tag(tokens)
pos_tag_list

[('stemming', 'VBG'),
 ('lemmatization', 'NN'),
 ('different', 'JJ'),
 ('techniques', 'NNS'),
 ('used', 'VBN'),
 ('reduce', 'VB'),
 ('words', 'NNS'),
 ('root', 'VBP'),
 ('form', 'NN'),
 ('produce', 'VBP'),
 ('varying', 'VBG'),
 ('results', 'NNS'),
 ('lemmatization', 'NN'),
 ('better', 'RBR'),
 ('stemming', 'NN')]

In [6]:
stemmer = PorterStemmer()
print("Stemming Words")
for w in tokens:
    print(f"{w} : {stemmer.stem(w)}")

Stemming Words
stemming : stem
lemmatization : lemmat
different : differ
techniques : techniqu
used : use
reduce : reduc
words : word
root : root
form : form
produce : produc
varying : vari
results : result
lemmatization : lemmat
better : better
stemming : stem


In [7]:
lemmatizer = WordNetLemmatizer()

for w in tokens:
    print(f"{w} : {lemmatizer.lemmatize(w)}")

stemming : stemming
lemmatization : lemmatization
different : different
techniques : technique
used : used
reduce : reduce
words : word
root : root
form : form
produce : produce
varying : varying
results : result
lemmatization : lemmatization
better : better
stemming : stemming


In [8]:
def calculateTF(token):
    term_freq = {}
    for word in token:
        if word not in term_freq:
            term_freq[word] = token.count(word) / len(token)

    return term_freq

calculateTF(tokens)

{'stemming': 0.13333333333333333,
 'lemmatization': 0.13333333333333333,
 'different': 0.06666666666666667,
 'techniques': 0.06666666666666667,
 'used': 0.06666666666666667,
 'reduce': 0.06666666666666667,
 'words': 0.06666666666666667,
 'root': 0.06666666666666667,
 'form': 0.06666666666666667,
 'produce': 0.06666666666666667,
 'varying': 0.06666666666666667,
 'results': 0.06666666666666667,
 'better': 0.06666666666666667}

In [9]:
def calculateTF_IDF(documents):
    documents = sent_tokenize(documents)
    document_map = {}
    document_tf = {}
    unique_words = set()
    word_idf = {}
    
    for i, document in enumerate(documents):
        tokenizedWords  = Tokenise(document)
        document_map[i] = tokenizedWords

        document_tf[i] = calculateTF(tokenizedWords)

        for word in tokenizedWords:
            unique_words.add(word)

    for word in unique_words:
        count = 0
        for _, tokenedWords in document_map.items():
            if word in tokenedWords:
                count += 1

        word_idf[word] = count

    return word_idf, document_tf
        

word_idf, document_tf = calculateTF_IDF(sentence1)
print(word_idf)

{'produce': 1, 'words': 1, 'techniques': 1, 'different': 1, 'reduce': 1, 'and': 1, 'is': 1, 'varying': 1, 'their': 1, 'are': 1, 'used': 1, 'root': 1, 'stemming': 2, 'but': 1, 'than': 1, 'they': 1, 'better': 1, 'to': 1, 'form': 1, 'results': 1, 'lemmatization': 2}
