# 1. Tokenization

In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Make sure the 'punkt' package is available before tokenizing.
nltk.download('punkt')

text = "Natural Language Processing (NLP) is amazing! Let's explore it."

# Split the sample into sentences and into word-level tokens.
# The two calls are independent of each other.
sentence_tokens = sent_tokenize(text)
word_tokens = word_tokenize(text)

# Report the word tokens first, then the sentences.
print("Word Tokens:", word_tokens)
print("Sentence Tokens:", sentence_tokens)


Word Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'amazing', '!', 'Let', "'s", 'explore', 'it', '.']
Sentence Tokens: ['Natural Language Processing (NLP) is amazing!', "Let's explore it."]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 2. Lowercasing in NLP (Text Preprocessing Example)

In [34]:
import nltk
from nltk.tokenize import word_tokenize

# Make sure the 'punkt' package is available before tokenizing.
nltk.download('punkt')

text = "Hello World! NLP is Fun."

# Tokenize first, then map every token to its lower-case form.
tokens = word_tokenize(text)
lower_tokens = list(map(str.lower, tokens))

print(lower_tokens)


['hello', 'world', '!', 'nlp', 'is', 'fun', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 3. Lowercasing Using pandas (Useful for Large Text Datasets)

In [35]:
import pandas as pd

# Build the sample frame and derive the lower-cased column from 'Text'
# in a single chained expression.
data = (
    pd.DataFrame({'Text': ["HELLO World!", "THIS is NLP.", "Machine Learning"]})
    .assign(Lowercase_Text=lambda frame: frame['Text'].str.lower())
)

print(data)


               Text    Lowercase_Text
0      HELLO World!      hello world!
1      THIS is NLP.      this is nlp.
2  Machine Learning  machine learning


# 4. Stop Word Removal

In [24]:
import string

from nltk.corpus import stopwords

# Fetch the stop-word corpus that stopwords.words('english') reads.
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
text = "This is a simple example to demonstrate stopword removal."

# Keep the token list under its own name. Previously `word` was used both for
# the list and for the comprehension's loop variable, which obscured the code.
words = text.split()

# Whitespace splitting leaves punctuation attached to tokens (e.g. "removal."),
# so a stop word followed by punctuation would slip past a plain membership
# test. Compare the lower-cased, punctuation-stripped form, but keep the
# original token in the result.
filtered_words = [
    token
    for token in words
    if token.lower().strip(string.punctuation) not in stop_words
]

print("Filtered Words:", filtered_words)

Filtered Words: ['simple', 'example', 'demonstrate', 'stopword', 'removal.']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 5. Stemming (Reducing Words to Their Root Form)

In [25]:
from nltk.stem import PorterStemmer

# Apply the Porter stemmer to every token produced by the tokenization step.
ps = PorterStemmer()
stemmed_words = list(map(ps.stem, word_tokens))

print("Stemmed Words:", stemmed_words)


Stemmed Words: ['natur', 'languag', 'process', '(', 'nlp', ')', 'is', 'amaz', '!', 'let', "'s", 'explor', 'it', '.']


# 6. Lemmatization (More Advanced Root Word Extraction)

In [28]:
from nltk.stem import WordNetLemmatizer

# Fetch the corpora used for lemmatization. 'omw-1.4' is downloaded here too so
# that this cell works when the notebook is run top to bottom on a fresh
# kernel; the notebook otherwise only fetches it in a cell placed after this one.
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

# lemmatize() is called with the token only, so its default part of speech
# applies to every token.
lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokens]

print("Lemmatized Words:", lemmatized_words)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Lemmatized Words: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'amazing', '!', 'Let', "'s", 'explore', 'it', '.']


In [27]:
# Download the 'omw-1.4' NLTK package. The call is the cell's last expression,
# so its return value is displayed as the cell result.
# NOTE(review): this cell's execution count (27) is lower than the
# lemmatization cell's (28) although it appears after it -- confirm cell order.
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...


True

# 7. Frequency Distribution of Words

In [31]:
from nltk.probability import FreqDist

# Count how often each token occurs, then report the five most frequent ones.
fdist = FreqDist(word_tokens)
most_common_words = fdist.most_common(5)

print("Most Common Words:", most_common_words)


Most Common Words: [('Natural', 1), ('Language', 1), ('Processing', 1), ('(', 1), ('NLP', 1)]


# 8. Punctuation Removal

In [2]:
import string

# Raw text
text = "Hello, World! NLP is exciting; isn't it?"

# Keep every character that is not listed in string.punctuation,
# preserving the original order.
text_without_punctuation = ''.join(
    char for char in text if char not in string.punctuation
)

print("Original Text:", text)
print("Text Without Punctuation:", text_without_punctuation)


Original Text: Hello, World! NLP is exciting; isn't it?
Text Without Punctuation: Hello World NLP is exciting isnt it


# 9. Special Character Removal

In [4]:
import re

# Raw text
text = "Hello, World! This text contains special characters like @, #, $, %, and &."

# Replace each occurrence of the listed symbols with an empty string.
# Other punctuation (commas, periods, '!') is left untouched.
text_without_special_chars = re.compile(r'[@#$%&]').sub('', text)

print("Original Text:", text)
print("Text Without Special Characters:", text_without_special_chars)


Original Text: Hello, World! This text contains special characters like @, #, $, %, and &.
Text Without Special Characters: Hello, World! This text contains special characters like , , , , and .


# 10. Whitespace Removal

In [6]:
import re

# Raw text
text = "Hello,     World!   This is   a sample   text.  "

# Collapse each run of whitespace to a single space, then trim both ends.
normalized_text = (
    re.compile(r'\s+')
    .sub(' ', text)
    .strip()
)

print("Original Text:", text)
print("Text Without Extra Whitespaces:", normalized_text)


Original Text: Hello,     World!   This is   a sample   text.  
Text Without Extra Whitespaces: Hello, World! This is a sample text.


# 11. Named Entity Recognition (NER)

In [30]:
from nltk import pos_tag
from nltk.chunk import ne_chunk

# Resources for the chunker, plus the tagger model needed by pos_tag().
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')

# ne_chunk() is fed POS-tagged tokens. This cell previously read a `pos_tags`
# variable that no cell in the notebook defines, so it only worked with
# leftover kernel state. Build the tags here from the tokens produced in the
# tokenization step so the cell runs on a fresh kernel.
pos_tags = pos_tag(word_tokens)

ner_tree = ne_chunk(pos_tags)
print("Named Entities:", ner_tree)

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...


Named Entities: (S
  Natural/JJ
  Language/NNP
  Processing/NNP
  (/(
  (ORGANIZATION NLP/NNP)
  )/)
  is/VBZ
  amazing/JJ
  !/.
  Let/NNP
  's/POS
  explore/VB
  it/PRP
  ./.)


[nltk_data]   Unzipping corpora\words.zip.


# 12. Part-of-Speech (POS) Tagging

In [3]:
import nltk
from nltk import word_tokenize, pos_tag

# Fetch the tagger model needed by pos_tag(), as the other NLTK cells do for
# their own resources; without it this cell depends on the model already
# being installed on the machine.
nltk.download('averaged_perceptron_tagger')

# Example sentence
sentence = "The quick brown fox jumps over the lazy dog."

# Tokenize the sentence, then assign a part-of-speech tag to each token.
tokens = word_tokenize(sentence)
tags = pos_tag(tokens)

# Display the (token, tag) pairs.
print(tags)


[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]
