# Text Preprocessing

In [1]:
%pip install -q nltk
import nltk
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

Note: you may need to restart the kernel to use updated packages.


True

## **Tokenization**

In [2]:
# Three-sentence sample text; it contains an initial ("D."), an apostrophe
# and a hyphenated name so the tokenizers below can be compared on them.
corpus = (
    "I am Spy D. Veloper. "
    "I am a developer by essence. "
    "I resemble Miguel O'Hara from Spider-Man: Across the Spiderverse"
)

### Sentence Tokenization

In [3]:
from nltk.tokenize import sent_tokenize

# Split the corpus into sentences. The language is spelled out here;
# 'english' is also what sent_tokenize uses when it is omitted.
sent_tokenize(corpus, language='english')

['I am Spy D. Veloper.',
 'I am a developer by essence.',
 "I resemble Miguel O'Hara from Spider-Man: Across the Spiderverse"]

### Word Tokenization

In [4]:
from nltk.tokenize import word_tokenize

# Split the whole corpus into word and punctuation tokens. The language is
# spelled out here; 'english' is also the default when it is omitted.
word_tokenize(corpus, language='english')

['I',
 'am',
 'Spy',
 'D.',
 'Veloper',
 '.',
 'I',
 'am',
 'a',
 'developer',
 'by',
 'essence',
 '.',
 'I',
 'resemble',
 'Miguel',
 "O'Hara",
 'from',
 'Spider-Man',
 ':',
 'Across',
 'the',
 'Spiderverse']

In [5]:
# First split the corpus into sentences, then word-tokenize each sentence
# and print its token list on its own line.
documents = sent_tokenize(corpus)
tokenized_documents = map(word_tokenize, documents)
for tokens in tokenized_documents:
    print(tokens)

['I', 'am', 'Spy', 'D.', 'Veloper', '.']
['I', 'am', 'a', 'developer', 'by', 'essence', '.']
['I', 'resemble', 'Miguel', "O'Hara", 'from', 'Spider-Man', ':', 'Across', 'the', 'Spiderverse']


### WordPunct Tokenization

In [6]:
from nltk.tokenize import wordpunct_tokenize

# Tokenize the corpus with wordpunct_tokenize and display the result so it
# can be compared with the word_tokenize output above.
wordpunct_tokens = wordpunct_tokenize(corpus)
wordpunct_tokens

['I',
 'am',
 'Spy',
 'D',
 '.',
 'Veloper',
 '.',
 'I',
 'am',
 'a',
 'developer',
 'by',
 'essence',
 '.',
 'I',
 'resemble',
 'Miguel',
 'O',
 "'",
 'Hara',
 'from',
 'Spider',
 '-',
 'Man',
 ':',
 'Across',
 'the',
 'Spiderverse']

### Treebank Word Tokenization

In [7]:
from nltk.tokenize import TreebankWordTokenizer

# Run the Treebank tokenizer directly on the un-split corpus and display
# the resulting token list.
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokenizer.tokenize(corpus)

['I',
 'am',
 'Spy',
 'D.',
 'Veloper.',
 'I',
 'am',
 'a',
 'developer',
 'by',
 'essence.',
 'I',
 'resemble',
 'Miguel',
 "O'Hara",
 'from',
 'Spider-Man',
 ':',
 'Across',
 'the',
 'Spiderverse']

## **Stemming**

In [8]:
# Sample vocabulary shared by the stemming and lemmatization cells below.
words = [
    # inflections of "eat"
    'eating',
    'eaten',
    'eats',
    # "-ly" words
    'easily',
    'fairly',
    # "-ing" forms
    'playing',
    'reading',
    'doing',
    'walking',
]

### PorterStemmer

In [9]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

# Lazily build one "word --> stem" line per sample word, then print them
# in order.
porter_lines = (f"{word} --> {ps.stem(word)}" for word in words)
for porter_line in porter_lines:
    print(porter_line)

eating --> eat
eaten --> eaten
eats --> eat
easily --> easili
fairly --> fairli
playing --> play
reading --> read
doing --> do
walking --> walk


### RegexpStemmer

In [10]:
from nltk.stem import RegexpStemmer

# The stemmer removes whatever the regular expression matches; here that is
# a trailing "ing", "s", "e" or "able". min=0 (the default) means no
# minimum word length is required before stemming.
rs = RegexpStemmer(regexp='ing$|s$|e$|able$', min=0)

regexp_lines = [f"{word} --> {rs.stem(word)}" for word in words]
for regexp_line in regexp_lines:
    print(regexp_line)

eating --> eat
eaten --> eaten
eats --> eat
easily --> easily
fairly --> fairly
playing --> play
reading --> read
doing --> do
walking --> walk


### SnowballStemmer

In [11]:
from nltk.stem import SnowballStemmer

ss = SnowballStemmer(language='english')


def _snowball_line(word):
    """Format one sample word together with its Snowball stem."""
    return f"{word} --> {ss.stem(word)}"


# Print one "word --> stem" line per sample word, in list order.
for snowball_line in map(_snowball_line, words):
    print(snowball_line)

eating --> eat
eaten --> eaten
eats --> eat
easily --> easili
fairly --> fair
playing --> play
reading --> read
doing --> do
walking --> walk


## **Lemmatization**

In [12]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

# Every word is lemmatized with pos='v', i.e. it is looked up as a verb.
lemma_lines = (f"{word} --> {wnl.lemmatize(word, pos='v')}" for word in words)
for lemma_line in lemma_lines:
    print(lemma_line)

eating --> eat
eaten --> eat
eats --> eat
easily --> easily
fairly --> fairly
playing --> play
reading --> read
doing --> do
walking --> walk


## **Stopwords**

In [13]:
# Sample text for the stop-word demo: five paragraphs, each separated from
# the next by one blank line.
paragraph = '\n\n'.join([
    'I have a dream that one day this nation will rise up and live out the true meaning of its creed: "We hold these truths to be self-evident, that all men are created equal."',
    'I have a dream that one day on the red hills of Georgia, the sons of former slaves and the sons of former slave owners will be able to sit down together at the table of brotherhood.',
    'I have a dream that one day even the state of Mississippi, a state sweltering with the heat of injustice, sweltering with the heat of oppression, will be transformed into an oasis of freedom and justice.',
    'I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character.',
    'I have a dream today!',
])

In [14]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# For every sentence of the paragraph: tokenize, keep only purely alphabetic
# tokens that are not English stop words, stem what is left and re-join the
# stems into one cleaned string.
# Loop-local names (raw_sentence / token / stems) are used on purpose so the
# module-level `words` list defined for the stemming demos is not
# overwritten by this cell.
# Note: str.isalpha() rejects punctuation and also any token containing a
# hyphen, so a token such as "self-evident" is dropped entirely.
sentences = []
for raw_sentence in sent_tokenize(paragraph):
    stems = [
        ps.stem(token)
        for token in word_tokenize(raw_sentence)
        if token.isalpha() and token.lower() not in stop_words
    ]
    sentences.append(' '.join(stems))
    print(sentences[-1])

dream one day nation rise live true mean creed hold truth men creat equal
dream one day red hill georgia son former slave son former slave owner abl sit togeth tabl brotherhood
dream one day even state mississippi state swelter heat injustic swelter heat oppress transform oasi freedom justic
dream four littl children one day live nation judg color skin content charact
dream today
