<b>Source</b> : https://medium.com/@datamonsters/text-preprocessing-in-python-steps-tools-and-examples-bf025f872908

### Convert to Lowercase

In [1]:
# Keep the raw sentence under its own name; input_str holds the lower-cased copy.
raw_sentence = "The 5 biggest countries by population in 2017 are China, India, United States, Indonesia, and Brazil."
input_str = raw_sentence.lower()
print(input_str)

### Remove numbers

In [2]:
import re

input_str = "Box A contains 3 red and 5 white balls, while Box B contains 4 red and 2 blue balls."

# Compile the digit-run pattern once, then delete every match.
# The spaces that surrounded each number are left in place.
digit_runs = re.compile(r'\d+')
result = digit_runs.sub('', input_str)
print(result)

Box A contains  red and  white balls, while Box B contains  red and  blue balls.


### Remove punctuation

In [3]:
import string

# Sample string
input_str = "This &is [an] example? {of} string. with.? punctuation!!!!"

# Translation table whose third argument lists the characters to delete:
# every ASCII punctuation mark in string.punctuation.
punctuation_stripper = str.maketrans("", "", string.punctuation)
result = input_str.translate(punctuation_stripper)
print(result)

This is an example of string with punctuation


### Remove leading and trailing whitespace

In [4]:
# strip() trims whitespace (here a tab and a space) from both ends only;
# the single spaces between the words are kept.
padded_text = "\t a string example\t"
input_str = padded_text.strip()
input_str

'a string example'

### Remove stop words

In [12]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

input_str = "NLTK is a leading platform for building Python programs to work with human language data."

# Hold the stop words in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

tokens = word_tokenize(input_str)

# NLTK's English stop-word list is lower-case, so compare on the lower-cased
# token; otherwise capitalised stop words ("The", "Is", ...) would slip
# through. The token's original casing is kept in the output.
result = [token for token in tokens if token.lower() not in stop_words]
print(result)

['NLTK', 'leading', 'platform', 'building', 'Python', 'programs', 'work', 'human', 'language', 'data', '.']


### Stemming

In [14]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
input_str = "There are several types of stemming algorithms."

# Keep the token list under its own name instead of overwriting input_str,
# so input_str stays a string.
tokens = word_tokenize(input_str)

# Print the Porter stem of each token, one per line.
for word in tokens:
    print(stemmer.stem(word))

there
are
sever
type
of
stem
algorithm
.


### Lemmatization

In [18]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()
input_str = "been had done languages cities mice"

# Keep the token list under its own name instead of overwriting input_str,
# so input_str stays a string.
tokens = word_tokenize(input_str)

# lemmatize() is called without a part-of-speech argument, so every word is
# looked up as a noun. The plural nouns are reduced to their singular form,
# but the verb forms (been, had, done) come back unchanged.
# NOTE(review): pass pos='v' for verbs if be/have/do is the intended output.
for word in tokens:
    print(lemmatizer.lemmatize(word))

been
had
done
language
city
mouse


### Part-of-speech (POS) tagging

In [22]:
import nltk
from textblob import TextBlob

# Fetch NLTK's averaged-perceptron tagger data before asking for POS tags.
nltk.download('averaged_perceptron_tagger')

input_str = "Parts of speech examples: an article, to write, interesting, easily, and, of"

# .tags is a list of (word, tag) pairs.
result = TextBlob(input_str)
word_tag_pairs = result.tags
print(word_tag_pairs)

[('Parts', 'NNS'), ('of', 'IN'), ('speech', 'NN'), ('examples', 'NNS'), ('an', 'DT'), ('article', 'NN'), ('to', 'TO'), ('write', 'VB'), ('interesting', 'VBG'), ('easily', 'RB'), ('and', 'CC'), ('of', 'IN')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mahasiswa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Chunking (shallow parsing)

In [23]:
from textblob import TextBlob

input_str = "A black television and a white stove were bought for the new apartment of John."

# POS-tag the sentence; these (word, tag) pairs are the input for noun-phrase chunking.
result = TextBlob(input_str)
pos_pairs = result.tags
print(pos_pairs)

[('A', 'DT'), ('black', 'JJ'), ('television', 'NN'), ('and', 'CC'), ('a', 'DT'), ('white', 'JJ'), ('stove', 'NN'), ('were', 'VBD'), ('bought', 'VBN'), ('for', 'IN'), ('the', 'DT'), ('new', 'JJ'), ('apartment', 'NN'), ('of', 'IN'), ('John', 'NNP')]


In [24]:
# Chunk grammar: a noun phrase (NP) is an optional determiner, any number of
# adjectives, then a noun tagged NN.
reg_exp = "NP: {<DT>?<JJ>*<NN>}"
rp = nltk.RegexpParser(reg_exp)

# Tag input_str afresh instead of reading result.tags: this cell rebinds
# `result` to the parse tree, so result.tags no longer exists on a second run.
# Deriving the tags from input_str lets the cell be re-run safely.
result = rp.parse(TextBlob(input_str).tags)
print(result)

(S
  (NP A/DT black/JJ television/NN)
  and/CC
  (NP a/DT white/JJ stove/NN)
  were/VBD
  bought/VBN
  for/IN
  (NP the/DT new/JJ apartment/NN)
  of/IN
  John/NNP)


In [25]:
# Render the chunk tree held in `result` graphically.
# NOTE(review): NLTK's Tree.draw() is expected to open a separate GUI window
# rather than drawing inline — confirm a display is available before running headless.
result.draw()

### Named entity recognition

In [31]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

# Download the two NLTK resources this cell fetches, in the same order as before.
for package in ('maxent_ne_chunker', 'words'):
    nltk.download(package)

input_str = "Bill works for Apple so he went to Boston for a conference."

# tokenise -> POS-tag -> named-entity chunk, one stage per line.
tokens = word_tokenize(input_str)
tagged_tokens = pos_tag(tokens)
print(ne_chunk(tagged_tokens))

(S
  (PERSON Bill/NNP)
  works/VBZ
  for/IN
  Apple/NNP
  so/IN
  he/PRP
  went/VBD
  to/TO
  (GPE Boston/NNP)
  for/IN
  a/DT
  conference/NN
  ./.)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Mahasiswa\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Mahasiswa\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


### Collocation extraction

In [37]:
from ICE import CollocationExtractor

# Named `sentences` rather than `input` so the built-in input() is not
# shadowed for the rest of the notebook session.
sentences = ["he and Chazz duel with all keys on the line."]

# NOTE(review): "Temp" looks like a placeholder for bing_key — confirm, and
# read a real key from the environment instead of hard-coding it.
extractor = CollocationExtractor.with_collocation_pipeline("T1", bing_key="Temp", pos_check=False)

# Print the collocations of length 3 found in the sentences.
print(extractor.get_collocations_of_length(sentences, length=3))

ModuleNotFoundError: No module named 'ICE'