### Text Preprocessing using NLTK

In [11]:
# import NLTK Library
import nltk

In [12]:
#nltk.download()

In [13]:
from nltk.book import *

### Word Tokenizer & Sentence Tokenizer

In [14]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [15]:
# Sample three-sentence paragraph shared by the tokenizer and stop-word cells below.
text = "my name is raj dalsaniya. I am R&D engineer. I have intersted in data science and AI."

## Word Tokenizer

#### Word Tokenization is the process of breaking down a sentence or text into individual words, called tokens. It helps in natural language processing (NLP) by making text easier to analyze and process.

#### For example, given the sentence: 
"Hello! How are you?"
Word tokenization splits it into:
['Hello', '!', 'How', 'are', 'you', '?']

In [16]:
from nltk.tokenize import word_tokenize 

# Break the sample `text` into word and punctuation tokens, then show the list.
tokens = word_tokenize(text)
print(tokens)

['my', 'name', 'is', 'raj', 'dalsaniya', '.', 'I', 'am', 'R', '&', 'D', 'engineer', '.', 'I', 'have', 'intersted', 'in', 'data', 'science', 'and', 'AI', '.']


## 🔹 Sentence Tokenization

#### Sentence Tokenization (or Sentence Segmentation) is the process of dividing a text into individual sentences. It helps in natural language processing (NLP) by breaking down large texts into manageable parts.

### For example, given the text:
👉 "Hello! How are you? I hope you're doing well."

### Sentence tokenization splits it into:
✅ ["Hello!", "How are you?", "I hope you're doing well."]

In [17]:
# Split the sample `text` into a list of sentence strings and show it.
sent_token = sent_tokenize(text)
print(sent_token)

['my name is raj dalsaniya.', 'I am R&D engineer.', 'I have intersted in data science and AI.']


## StopWords

#### Stopwords are common words (like "is", "the", "and", "in") that do not add much meaning to a sentence and are often removed during text processing to improve efficiency in NLP tasks like text analysis and search engines.

In [18]:
from nltk.corpus import stopwords

In [19]:
# Fetch the stop-word corpus into the local nltk_data directory; the call
# reports "already up-to-date" and returns True when nothing needs downloading.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# Lower-case the text first: the stop-word filter further down does an exact
# string membership test, so "I" has to become "i" before it can be matched.
lower_text = text.lower()

# Tokenize words (this overwrites the `tokens` list from the word-tokenizer cell)
tokens = word_tokenize(lower_text)

In [21]:
# English stop words from the NLTK corpus, built once for the filtering cell below.
stopwords_list = set(stopwords.words('english'))  # Use set for faster lookup

In [22]:
# Keep every token that is not an English stop word, preserving token order.
filter_text = list(filter(lambda token: token not in stopwords_list, tokens))


# Show what remains after stop-word removal (punctuation tokens are kept)
print(filter_text)

['name', 'raj', 'dalsaniya', '.', 'r', '&', 'engineer', '.', 'intersted', 'data', 'science', 'ai', '.']


## 🔹 Stemming in NLP
#### Stemming is the process of reducing words to their root form by removing suffixes. It helps in normalizing words, making NLP tasks more efficient.

#### For example:
✅ "running" → "run"
✅ "happily" → "happili" (not always a real word)

#### Stemming is faster but sometimes produces non-dictionary words because it simply chops off suffixes without understanding meaning.

In [23]:
from nltk.stem import PorterStemmer

In [31]:
stemming = PorterStemmer()
text = "my name is raj dalsaniya. I am R&D engineer. I have intersted in data science and AI."

# Porter-stem every token and write them on one line, each followed by a single space.
print("".join(stemming.stem(token) + " " for token in word_tokenize(text)), end="")

my name is raj dalsaniya . i am r & d engin . i have interst in data scienc and ai . 

### Lemmatizer

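#### A lemmatizer is a tool or algorithm that reduces words to their base or lemma form. Unlike a stemmer, which simply chops off affixes (like "-ing" or "-ed"), a lemmatizer uses the word's part of speech to return a proper dictionary word. Note that NLTK's `WordNetLemmatizer.lemmatize(word, pos='n')` treats every word as a noun unless told otherwise, so the verb and adjective examples below only work when the matching tag is passed (e.g. `pos='v'` for verbs, `pos='a'` for adjectives).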

#### For example:

#### <li>Running → run </li>
#### <li>Better → good (lemmatization considers word meaning) </li>
#### <li>Was → be </li>

In [32]:
from nltk.stem import WordNetLemmatizer

In [34]:
# Download the WordNet corpus into nltk_data ahead of the lemmatizer cells below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\rajda/nltk_data...


True

In [35]:
# Instantiate through the class imported in the previous cell (it was otherwise
# unused), mirroring how the stemmers in this notebook are created.
# `lem` is reused by the lemmatization loop below.
lem = WordNetLemmatizer()

In [44]:
# New sample sentence for the lemmatizer and POS-tagging cells; replaces the earlier `text`.
text = "i am teaching in my classes."

In [39]:
# Lemmatize each token and write them on one line, each followed by a single space.
# No `pos` argument is given, so every token is looked up as a noun: plural nouns
# are reduced ("classes" -> "class") while verb forms such as "teaching" pass through.
print("".join(lem.lemmatize(token) + " " for token in word_tokenize(text)), end="")

i am teaching in my class 

## Stemmer - LancasterStemmer

In [42]:
from nltk.stem import LancasterStemmer

lanc = LancasterStemmer()

# A cell only displays the value of its final expression, so a bare
# `lanc.stem('running')` in the middle of the cell would be silently discarded.
# Collect both stems in one mapping so each input is shown next to its result.
{word: lanc.stem(word) for word in ('running', 'Better')}

{'running': 'run', 'Better': 'bet'}

## POS Tag (Part of Speech Tagging)

#### POS tagging is the process of labeling words in a sentence with their correct part of speech (noun, verb, adjective, etc.) based on context.

## Example:
### Sentence:
#### 👉 "The quick brown fox jumps over the lazy dog."

### POS Tags:

#### <li>The → Determiner (DT) </li>
#### <li>quick → Adjective (JJ) </li>
#### <li>brown → Adjective (JJ) </li>
#### <li>fox → Noun (NN) </li>
#### <li>jumps → Verb (VBZ) </li>
#### <li>over → Preposition (IN) </li>
#### <li>the → Determiner (DT) </li>
#### <li>lazy → Adjective (JJ) </li>
#### <li>dog → Noun (NN) </li>

In [45]:
# Tag every token of the current `text` with its part of speech; the cell's
# value is a list of (token, tag) pairs.
# NOTE(review): this notebook only downloads 'stopwords' and 'wordnet'. The
# tokenizer and tagger presumably need their own NLTK resources, so a fresh
# environment may raise LookupError here — confirm and add the downloads if so.
nltk.pos_tag(word_tokenize(text))

[('i', 'NN'),
 ('am', 'VBP'),
 ('teaching', 'VBG'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('classes', 'NNS'),
 ('.', '.')]