In [1]:
import nltk
nltk.download('stopwords')
# nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
sentence1 = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s"

In [3]:
import string
def Tokenise(sentence: str):
    punctuation = string.punctuation + '[]{}()<>'
    for char in punctuation:
        sentence = sentence.replace(char, " ")
    sentence = sentence.lower()
    tokens = sentence.split()
    return tokens

tokens = Tokenise(sentence1)
tokens

['lorem',
 'ipsum',
 'is',
 'simply',
 'dummy',
 'text',
 'of',
 'the',
 'printing',
 'and',
 'typesetting',
 'industry',
 'lorem',
 'ipsum',
 'has',
 'been',
 'the',
 'industry',
 's',
 'standard',
 'dummy',
 'text',
 'ever',
 'since',
 'the',
 '1500s']

In [4]:
def RemoveStopWords(token):
  stop_words = set(stopwords.words('english'))
  filtered_sentence=[word for word in tokens if not word in stop_words]

  return filtered_sentence

tokens = RemoveStopWords(tokens)
tokens

['lorem',
 'ipsum',
 'simply',
 'dummy',
 'text',
 'printing',
 'typesetting',
 'industry',
 'lorem',
 'ipsum',
 'industry',
 'standard',
 'dummy',
 'text',
 'ever',
 'since',
 '1500s']

In [5]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [6]:
pos_tag_list = pos_tag(tokens)
pos_tag_list

[('lorem', 'NN'),
 ('ipsum', 'NN'),
 ('simply', 'RB'),
 ('dummy', 'JJ'),
 ('text', 'NN'),
 ('printing', 'VBG'),
 ('typesetting', 'VBG'),
 ('industry', 'NN'),
 ('lorem', 'VBZ'),
 ('ipsum', 'JJ'),
 ('industry', 'NN'),
 ('standard', 'NN'),
 ('dummy', 'NN'),
 ('text', 'NN'),
 ('ever', 'RB'),
 ('since', 'IN'),
 ('1500s', 'CD')]

In [7]:
stemmer = PorterStemmer()
print("Stemming Words")
for w in tokens:
    print(f"{w} : {stemmer.stem(w)}")

Stemming Words
lorem : lorem
ipsum : ipsum
simply : simpli
dummy : dummi
text : text
printing : print
typesetting : typeset
industry : industri
lorem : lorem
ipsum : ipsum
industry : industri
standard : standard
dummy : dummi
text : text
ever : ever
since : sinc
1500s : 1500


In [8]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [9]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [10]:
lemmatizer = WordNetLemmatizer()

for w in tokens:
    print(f"{w} : {lemmatizer.lemmatize(w)}")

lorem : lorem
ipsum : ipsum
simply : simply
dummy : dummy
text : text
printing : printing
typesetting : typesetting
industry : industry
lorem : lorem
ipsum : ipsum
industry : industry
standard : standard
dummy : dummy
text : text
ever : ever
since : since
1500s : 1500s


In [11]:
def calculateTF(token):
    term_freq = {}
    for word in token:
        if word not in term_freq:
            term_freq[word] = token.count(word) / len(token)

    return term_freq

print(calculateTF(tokens))

{'lorem': 0.11764705882352941, 'ipsum': 0.11764705882352941, 'simply': 0.058823529411764705, 'dummy': 0.11764705882352941, 'text': 0.11764705882352941, 'printing': 0.058823529411764705, 'typesetting': 0.058823529411764705, 'industry': 0.11764705882352941, 'standard': 0.058823529411764705, 'ever': 0.058823529411764705, 'since': 0.058823529411764705, '1500s': 0.058823529411764705}


In [12]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
def calculateTF_IDF(documents):
    documents = sent_tokenize(documents)
    document_map = {}
    document_tf = {}
    unique_words = set()
    word_idf = {}

    for i, document in enumerate(documents):
        tokenizedWords  = Tokenise(document)
        document_map[i] = tokenizedWords

        document_tf[i] = calculateTF(tokenizedWords)

        for word in tokenizedWords:
            unique_words.add(word)

    for word in unique_words:
        count = 0
        for _, tokenedWords in document_map.items():
            if word in tokenedWords:
                count += 1

        word_idf[word] = count

    return word_idf, document_tf


word_idf, document_tf = calculateTF_IDF(sentence1)
print(word_idf)

{'the': 2, 'industry': 2, 'text': 2, 'has': 1, 'of': 1, 's': 1, 'been': 1, 'standard': 1, 'ipsum': 2, 'and': 1, 'is': 1, '1500s': 1, 'dummy': 2, 'ever': 1, 'printing': 1, 'typesetting': 1, 'lorem': 2, 'since': 1, 'simply': 1}
