# **Text Preprocessing**

In [2]:
%pip install nltk

Collecting nltk
  Using cached nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting click (from nltk)
  Using cached click-8.3.1-py3-none-any.whl.metadata (2.6 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2026.1.15-cp314-cp314-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached nltk-3.9.2-py3-none-any.whl (1.5 MB)
Downloading regex-2026.1.15-cp314-cp314-win_amd64.whl (280 kB)
Using cached click-8.3.1-py3-none-any.whl (108 kB)
Downloading joblib-1.5.3-py3-none-any.whl (309 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, joblib, click, nltk

   ---------------------------------------- 0/5 [tqdm]
   -------- ------------------------------- 1/5 [regex]
   ---------------- ----------------------- 2/5 [joblib]
   ---------------- ---------------------

## **Tokenization**

In [1]:
# Sample text for the tokenizer demos. It contains an initial ("D."), a name
# with an apostrophe ("O'Hara"), a hyphenated word and a colon; the last
# sentence has no closing period (the string ends with a newline instead).
corpus="""I am Spy D. Veloper. I am a developer by essence. I resemble Miguel O'Hara from Spider-Man: Across the Spiderverse
"""

### Sentence Tokenization

In [13]:
import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize (and word_tokenize, which calls it) load the Punkt sentence
# model from nltk_data and raise LookupError when it is missing. Fetch it here
# so the notebook also runs on a fresh environment; nothing is re-downloaded
# when the resource is already up to date. The trailing ';' hides the `True`
# return value from the cell output.
nltk.download('punkt_tab', quiet=True);

In [4]:
# Split the corpus into sentences. The period of the initial "D." does not end
# a sentence in the result below.
sent_tokenize(corpus)

['I am Spy D. Veloper.',
 'I am a developer by essence.',
 "I resemble Miguel O'Hara from Spider-Man: Across the Spiderverse"]

### Word Tokenization

In [14]:
from nltk.tokenize import word_tokenize

In [5]:
# Tokenize the whole corpus in one call. In the result below, "D.", "O'Hara"
# and "Spider-Man" stay single tokens, while sentence-final periods and the
# colon become tokens of their own.
word_tokenize(corpus)

['I',
 'am',
 'Spy',
 'D.',
 'Veloper',
 '.',
 'I',
 'am',
 'a',
 'developer',
 'by',
 'essence',
 '.',
 'I',
 'resemble',
 'Miguel',
 "O'Hara",
 'from',
 'Spider-Man',
 ':',
 'Across',
 'the',
 'Spiderverse']

In [8]:
# Two-step tokenization: segment the corpus into sentences first, then split
# each sentence into word tokens and print one token list per sentence.
documents = sent_tokenize(corpus)
for sentence in documents:
    print(word_tokenize(sentence))

['I', 'am', 'Spy', 'D.', 'Veloper', '.']
['I', 'am', 'a', 'developer', 'by', 'essence', '.']
['I', 'resemble', 'Miguel', "O'Hara", 'from', 'Spider-Man', ':', 'Across', 'the', 'Spiderverse']


### WordPunct Tokenization

In [10]:
from nltk.tokenize import wordpunct_tokenize
# wordpunct_tokenize separates punctuation from the surrounding letters, so
# "D.", "O'Hara" and "Spider-Man" are each broken into several tokens below.
wordpunct_tokenize(corpus)

['I',
 'am',
 'Spy',
 'D',
 '.',
 'Veloper',
 '.',
 'I',
 'am',
 'a',
 'developer',
 'by',
 'essence',
 '.',
 'I',
 'resemble',
 'Miguel',
 'O',
 "'",
 'Hara',
 'from',
 'Spider',
 '-',
 'Man',
 ':',
 'Across',
 'the',
 'Spiderverse']

### TreeBank Word Tokenization

In [12]:
from nltk.tokenize import TreebankWordTokenizer
# Bind the tokenizer's bound method to a function-style name so it can be
# called like the other tokenizers in this notebook.
treebank_word_tokenize = TreebankWordTokenizer().tokenize
# NOTE: the whole multi-sentence corpus is passed in a single call, and the
# output keeps 'Veloper.' and 'essence.' with their periods attached. The NLTK
# docs describe this tokenizer as expecting sentence-segmented input, so run
# sent_tokenize first if per-sentence period splitting is wanted.
treebank_word_tokenize(corpus)

['I',
 'am',
 'Spy',
 'D.',
 'Veloper.',
 'I',
 'am',
 'a',
 'developer',
 'by',
 'essence.',
 'I',
 'resemble',
 'Miguel',
 "O'Hara",
 'from',
 'Spider-Man',
 ':',
 'Across',
 'the',
 'Spiderverse']

## **Stemming**

In [16]:
# Sample inflected forms (verb forms plus the adverbs "easily" and "fairly")
# fed to each stemmer demo in this section.
words=["eating","eaten","eats","easily","fairly","playing","reading","doing","walking"]

### PorterStemmer

In [15]:
from nltk.stem import PorterStemmer
# Single Porter stemmer instance, reused by the comparison loop that follows.
ps = PorterStemmer()

In [18]:
# Print each word next to its Porter stem. A stem is not guaranteed to be a
# dictionary word: see "easili" and "fairli" in the output.
for word in words:
    print(f"{word} --> {ps.stem(word)}")

eating --> eat
eaten --> eaten
eats --> eat
easily --> easili
fairly --> fairli
playing --> play
reading --> read
doing --> do
walking --> walk


### RegexpStemmer

In [19]:
from nltk.stem import RegexpStemmer
# Stemmer driven purely by the given pattern: it removes a trailing "ing",
# "s", "e" or "able" and applies no other linguistic rules.
rs = RegexpStemmer('ing$|s$|e$|able$')

In [20]:
# Print each word next to its regex-based stem. Words that end in none of the
# listed suffixes ("eaten", "easily", "fairly") come back unchanged.
for word in words:
    print(f"{word} --> {rs.stem(word)}")

eating --> eat
eaten --> eaten
eats --> eat
easily --> easily
fairly --> fairly
playing --> play
reading --> read
doing --> do
walking --> walk


### Snowball Stemmer

In [18]:
from nltk.stem import SnowballStemmer
# Snowball stemmer configured for English. `ss` is also used later by the
# stopword-filtering cell, so that cell depends on this one having run.
ss = SnowballStemmer('english')

In [23]:
# Print each word next to its Snowball stem. Compared with the Porter output,
# "fairly" becomes "fair" here, while "easily" still becomes "easili".
for word in words:
    print(f"{word} --> {ss.stem(word)}")

eating --> eat
eaten --> eaten
eats --> eat
easily --> easili
fairly --> fair
playing --> play
reading --> read
doing --> do
walking --> walk


## **Lemmatization**

In [24]:
# Same sample words as in the Stemming section, so the lemmatizer output can
# be compared directly with the stemmer outputs.
words=["eating","eaten","eats","easily","fairly","playing","reading","doing","walking"]

In [26]:
import nltk
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer looks words up in the WordNet corpus, which is not shipped
# with the nltk package; without it lemmatize() raises LookupError. Download it
# here so this section runs on a fresh environment (nothing is re-downloaded
# when the corpus is already up to date).
nltk.download('wordnet', quiet=True)
wnl = WordNetLemmatizer()

In [27]:
# pos='v' makes the lemmatizer treat every word as a verb (its default is
# noun). That is why "eaten" maps to "eat", while the adverbs "easily" and
# "fairly" come back unchanged.
for word in words:
    print(f"{word} --> {wnl.lemmatize(word, pos='v')}")

eating --> eat
eaten --> eat
eats --> eat
easily --> easily
fairly --> fairly
playing --> play
reading --> read
doing --> do
walking --> walk


## **Stopwords**

In [1]:
# Multi-paragraph sample text (an excerpt from Martin Luther King Jr.'s
# "I Have a Dream" speech) used as input for the stopword-removal demo.
paragraph = """I have a dream that one day this nation will rise up and live out the true meaning of its creed: "We hold these truths to be self-evident, that all men are created equal." 

I have a dream that one day on the red hills of Georgia, the sons of former slaves and the sons of former slave owners will be able to sit down together at the table of brotherhood.

I have a dream that one day even the state of Mississippi, a state sweltering with the heat of injustice, sweltering with the heat of oppression, will be transformed into an oasis of freedom and justice.

I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character.

I have a dream today!"""

In [2]:
import nltk
# Fetch the stopword lists into the local nltk_data directory. download()
# is the last expression in the cell, so its return value (True) is displayed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cherr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
from nltk.corpus import stopwords
# Display the English stopword list. The entries are lowercase, which is why
# the filtering cell lower-cases each token before the membership test.
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [19]:
# Remove English stopwords from every sentence of `paragraph`, stem the
# remaining tokens with the Snowball stemmer `ss`, and print the result.
#
# The stopword list is loaded once and turned into a set: calling
# stopwords.words('english') inside the comprehension rebuilt the list and
# scanned it linearly for every single token.
english_stopword_set = set(stopwords.words('english'))

sentences = nltk.sent_tokenize(paragraph)
for i, sentence in enumerate(sentences):
    # Stopwords are lowercase, so compare case-insensitively, but stem the
    # original token (the stemmer does its own lower-casing).
    words = [
        ss.stem(word)
        for word in nltk.word_tokenize(sentence)
        if word.lower() not in english_stopword_set
    ]
    # Replace the raw sentence with its filtered, stemmed form in place.
    sentences[i] = ' '.join(words)
    print(sentences[i])

dream one day nation rise live true mean creed : `` hold truth self-evid , men creat equal . ''
dream one day red hill georgia , son former slave son former slave owner abl sit togeth tabl brotherhood .
dream one day even state mississippi , state swelter heat injustic , swelter heat oppress , transform oasi freedom justic .
dream four littl children one day live nation judg color skin content charact .
dream today !
