# *Tokenization, Lemmatization & Stemming - Using NLTK*
### *`Code by- @Om`*

## *Tokenization, Lemmatization & Stemming - Single sentence*

## *Tokenization - Using NLTK*

In [1]:
# Functions/Dependencies
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Stopwords removal function
def remove_stopw_nltk(sent):
    stopws = stopwords.words('english')
    words = sent.split()
    print("Words before stopwords removal: ",len(words))
    sent = ""
    for w in words:
        if w in stopws: words.remove(w)
    for w in words:
        sent=sent+w+" "
    print("Words after stopwords removal: ",len(words),"\n")
    return sent

# Tokenization function
def tokenize(sent):
    sent = remove_stopw_nltk(sent)
    print("\nModified Sentence: ",sent,"\n")
    tokens = word_tokenize(sent)
    return tokens

In [2]:
sent = "This is a new test sentence for checking tokenization using NLTK package"
print("\n----Tokenization of given sentence - With NLTK------\n")
print("Original Sentence: ",sent,"\n")

tokens = tokenize(sent)

# Results
print("Number of tokens found - ", len(tokens))
print("Tokens: ",tokens)


----Tokenization of given sentence - With NLTK------

Original Sentence:  This is a new test sentence for checking tokenization using NLTK package 

Words before stopwords removal:  12
Words after stopwords removal:  10 


Modified Sentence:  This a new test sentence checking tokenization using NLTK package  

Number of tokens found -  10
Tokens:  ['This', 'a', 'new', 'test', 'sentence', 'checking', 'tokenization', 'using', 'NLTK', 'package']


## *Stemming using NLTK*

In [3]:
# Stemming function
from nltk.stem import PorterStemmer

def stem_sentence(sent):
    ps = PorterStemmer()
    sent = remove_stopw_nltk(sent)
    words = word_tokenize(sent)
    stemmed = {}
    for w in words:
        stemmed[w] = ps.stem(w)
    return stemmed

In [4]:
sent = "This is a new test sentence for checking tokenization using NLTK package"
print("\n----Stemming of given sentence - With NLTK------\n")
print("Original Sentence: ",sent,"\n")

stem_dict = stem_sentence(sent)
print("\nStemming Results: ")
print("------------------")
for k in stem_dict:
    print(k," : ", stem_dict[k])


----Stemming of given sentence - With NLTK------

Original Sentence:  This is a new test sentence for checking tokenization using NLTK package 

Words before stopwords removal:  12
Words after stopwords removal:  10 


Stemming Results: 
------------------
This  :  thi
a  :  a
new  :  new
test  :  test
sentence  :  sentenc
checking  :  check
tokenization  :  token
using  :  use
NLTK  :  nltk
package  :  packag


## *Lemmatization using NLTK*

In [5]:
# Lemmatization function
from nltk.stem import WordNetLemmatizer

def lemmatize(sent):
    lemmatizer = WordNetLemmatizer()
    sent = remove_stopw_nltk(sent)
    words = word_tokenize(sent)
    lem_dict = {}
    for w in words:
        lem_dict[w] = lemmatizer.lemmatize(w, pos="a")
    return lem_dict

In [6]:
# Calling functions for lemmatization
sent = "This is a better test sentence for checking tokenization using NLTK package"
print("\n----Lemmatization of given sentence - With NLTK------\n")
print("Original Sentence: ",sent,"\n")

lem_dict = lemmatize(sent)

print("Lemmatization Results: ")
print("------------------")
for k in lem_dict:
    print(k," : ", lem_dict[k])


----Lemmatization of given sentence - With NLTK------

Original Sentence:  This is a better test sentence for checking tokenization using NLTK package 

Words before stopwords removal:  12
Words after stopwords removal:  10 

Lemmatization Results: 
------------------
This  :  This
a  :  a
better  :  good
test  :  test
sentence  :  sentence
checking  :  checking
tokenization  :  tokenization
using  :  using
NLTK  :  NLTK
package  :  package


## *Tokenization, Lemmatization & Stemming - Paragraph (Multiple Sentences)(Using .txt file)*

In [7]:
print("\n----Tokenization of paragraph - With NLTK------\n")
data = open("Data_multi.txt")
s = ""

# Get data from text file
for l in data: s+=l;

# Tokenization
res = list(set(tokenize(s)))
print("Tokens found are: ")
print("--------------------\n", res)


----Tokenization of paragraph - With NLTK------

Words before stopwords removal:  51
Words after stopwords removal:  39 


Modified Sentence:  Google LLC an American multinational technology company specializes Internet-related services products, include online advertising technologies, search engine, cloud computing, software, hardware. It considered one the Big Five companies the American information technology industry, along Amazon, Apple, Meta (Facebook) Microsoft.  

Tokens found are: 
--------------------
 ['online', ')', 'the', 'services', ',', 'Apple', 'Microsoft', 'technology', 'companies', 'advertising', 'cloud', '.', 'Internet-related', 'Big', 'It', 'Amazon', 'Facebook', 'an', 'company', 'search', 'industry', 'along', 'technologies', 'considered', 'multinational', 'specializes', 'LLC', 'hardware', 'engine', 'American', 'Google', 'one', 'software', 'products', 'include', 'Meta', '(', 'information', 'Five', 'computing']


In [8]:
print("\n----Stemming of paragraph - With NLTK------\n")
data = open("Data_multi.txt")
s = ""

# Get data from text file
for l in data: s+=l;

# Stemming
res = stem_sentence(s)
print("\nStemming results: ")
print("--------------------\n")
for k in res:
    print(k," : ",res[k])


----Stemming of paragraph - With NLTK------

Words before stopwords removal:  51
Words after stopwords removal:  39 


Stemming results: 
--------------------

Google  :  googl
LLC  :  llc
an  :  an
American  :  american
multinational  :  multin
technology  :  technolog
company  :  compani
specializes  :  special
Internet-related  :  internet-rel
services  :  servic
products  :  product
,  :  ,
include  :  includ
online  :  onlin
advertising  :  advertis
technologies  :  technolog
search  :  search
engine  :  engin
cloud  :  cloud
computing  :  comput
software  :  softwar
hardware  :  hardwar
.  :  .
It  :  it
considered  :  consid
one  :  one
the  :  the
Big  :  big
Five  :  five
companies  :  compani
information  :  inform
industry  :  industri
along  :  along
Amazon  :  amazon
Apple  :  appl
Meta  :  meta
(  :  (
Facebook  :  facebook
)  :  )
Microsoft  :  microsoft


In [9]:
print("\n----Lemmatization of paragraph - With NLTK------\n")
data = open("Data_multi.txt")
s = ""

# Get data from text file
for l in data: s+=l;

# Stemming
res = lemmatize(s)
print("\nLemmatization results: ")
print("--------------------\n")
for k in res:
    print(k," : ",res[k])


----Lemmatization of paragraph - With NLTK------

Words before stopwords removal:  51
Words after stopwords removal:  39 


Lemmatization results: 
--------------------

Google  :  Google
LLC  :  LLC
an  :  an
American  :  American
multinational  :  multinational
technology  :  technology
company  :  company
specializes  :  specializes
Internet-related  :  Internet-related
services  :  services
products  :  products
,  :  ,
include  :  include
online  :  online
advertising  :  advertising
technologies  :  technologies
search  :  search
engine  :  engine
cloud  :  cloud
computing  :  computing
software  :  software
hardware  :  hardware
.  :  .
It  :  It
considered  :  considered
one  :  one
the  :  the
Big  :  Big
Five  :  Five
companies  :  companies
information  :  information
industry  :  industry
along  :  along
Amazon  :  Amazon
Apple  :  Apple
Meta  :  Meta
(  :  (
Facebook  :  Facebook
)  :  )
Microsoft  :  Microsoft


## *Tokenization, Lemmatization & Stemming - Multiple Documents (Multiple Sentences)(Using .txt files)*

In [10]:
print("\n----Tokenization of multiple documents - With NLTK------\n")
doc_names = ["Data_multi.txt", "Data_2.txt", "Data_3.txt"]

c=1
# Loop to get data and tokenize
for doc in doc_names:
    print("\n\n--------- Document ",c,"-------------\n")
    data = open(doc)
    s = ""
    
    # Get data from text file
    for l in data: s+=l;
    
    # Tokenization
    res = list(set(tokenize(s)))
    print("Tokens found: ", len(res))
    print("--------------------\n", res,"\n\n")
    c+=1


----Tokenization of multiple documents - With NLTK------



--------- Document  1 -------------

Words before stopwords removal:  51
Words after stopwords removal:  39 


Modified Sentence:  Google LLC an American multinational technology company specializes Internet-related services products, include online advertising technologies, search engine, cloud computing, software, hardware. It considered one the Big Five companies the American information technology industry, along Amazon, Apple, Meta (Facebook) Microsoft.  

Tokens found:  40
--------------------
 ['online', ')', 'the', 'services', ',', 'Apple', 'Microsoft', 'technology', 'companies', 'advertising', 'cloud', '.', 'Internet-related', 'Big', 'It', 'Amazon', 'Facebook', 'an', 'company', 'search', 'industry', 'along', 'technologies', 'considered', 'multinational', 'specializes', 'LLC', 'hardware', 'engine', 'American', 'Google', 'one', 'software', 'products', 'include', 'Meta', '(', 'information', 'Five', 'computing'] 




---

In [11]:
print("\n----Stemming of multiple documents - With NLTK------\n")
doc_names = ["Data_multi.txt", "Data_2.txt", "Data_3.txt"]

c=1
# Loop to get data and tokenize
for doc in doc_names:
    print("\n\n--------- Document ",c,"-------------\n")
    data = open(doc)
    s = ""
    
    # Get data from text file
    for l in data: s+=l;
    
    # Tokenization
    res = stem_sentence(s)
    print("Stemming results: ")
    print("--------------------\n", res,"\n\n")
    c+=1


----Stemming of multiple documents - With NLTK------



--------- Document  1 -------------

Words before stopwords removal:  51
Words after stopwords removal:  39 

Stemming results: 
--------------------
 {'Google': 'googl', 'LLC': 'llc', 'an': 'an', 'American': 'american', 'multinational': 'multin', 'technology': 'technolog', 'company': 'compani', 'specializes': 'special', 'Internet-related': 'internet-rel', 'services': 'servic', 'products': 'product', ',': ',', 'include': 'includ', 'online': 'onlin', 'advertising': 'advertis', 'technologies': 'technolog', 'search': 'search', 'engine': 'engin', 'cloud': 'cloud', 'computing': 'comput', 'software': 'softwar', 'hardware': 'hardwar', '.': '.', 'It': 'it', 'considered': 'consid', 'one': 'one', 'the': 'the', 'Big': 'big', 'Five': 'five', 'companies': 'compani', 'information': 'inform', 'industry': 'industri', 'along': 'along', 'Amazon': 'amazon', 'Apple': 'appl', 'Meta': 'meta', '(': '(', 'Facebook': 'facebook', ')': ')', 'Microsoft': 'm

In [12]:
print("\n----Lemmatization of multiple documents - With NLTK------\n")
doc_names = ["Data_multi.txt", "Data_2.txt", "Data_3.txt"]

c=1
# Loop to get data and tokenize
for doc in doc_names:
    print("\n\n--------- Document ",c,"-------------\n")
    data = open(doc)
    s = ""
    
    # Get data from text file
    for l in data: s+=l;
    
    # Tokenization
    res = lemmatize(s)
    print("Lemmatization results: ")
    print("--------------------\n", res,"\n\n")
    c+=1


----Lemmatization of multiple documents - With NLTK------



--------- Document  1 -------------

Words before stopwords removal:  51
Words after stopwords removal:  39 

Lemmatization results: 
--------------------
 {'Google': 'Google', 'LLC': 'LLC', 'an': 'an', 'American': 'American', 'multinational': 'multinational', 'technology': 'technology', 'company': 'company', 'specializes': 'specializes', 'Internet-related': 'Internet-related', 'services': 'services', 'products': 'products', ',': ',', 'include': 'include', 'online': 'online', 'advertising': 'advertising', 'technologies': 'technologies', 'search': 'search', 'engine': 'engine', 'cloud': 'cloud', 'computing': 'computing', 'software': 'software', 'hardware': 'hardware', '.': '.', 'It': 'It', 'considered': 'considered', 'one': 'one', 'the': 'the', 'Big': 'Big', 'Five': 'Five', 'companies': 'companies', 'information': 'information', 'industry': 'industry', 'along': 'along', 'Amazon': 'Amazon', 'Apple': 'Apple', 'Meta': 'Meta', '('