# Text Preprocessing

In [1]:
#Download and Install nltk using pip
!pip install nltk
!pip install textblob



In [None]:
#download all packages from nltk tool-kit
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


### Terminology
- Corpus
    - collection of texts
- Lexicon
    - vocabulary and its meaning
- Token
    - entity part of a rule based split

Text normalization includes:
- converting all letters to lower or upper case
- converting numbers into words or removing numbers
- removing punctuations, accent marks and other diacritics
- removing white spaces
- expanding abbreviations
- removing stop words, sparse terms, and particular words
- text canonicalization
    - text canonicalization (tumor = tumour, it's = it is)

In [24]:
# Convert text to lowercase
input_str = "The 5 biggest countries by population in 2017 are China, India, United States, Indonesia, and Brazil."
input_str = input_str.lower()
print(input_str)


the 5 biggest countries by population in 2017 are china, india, united states, indonesia, and brazil.


In [27]:
# Remove numbers
import re
input_str = "Box A contains 3 red and 5 white balls, while Box B contains 4 red and 2 blue balls."
result = re.sub(r'\d+', '', input_str)
print(result)

Box A contains  red and  white balls, while Box B contains  red and  blue balls.


In [43]:
# Remove punctuation
from string import punctuation
text = "This &is [an] example? {of} string. with.? punctuation!!!!" # Sample string
def remove_punct(text):
    """
    take string input and clean string without punctuations.
    use regex to remove the punctuations.
    """
    return ''.join(c for c in text if c not in punctuation)

text = remove_punct(text)
print(text)

This is an example of string with punctuation


In [44]:
#remove whitespaces
input_str = "\t a string example\t"
input_str = input_str.strip()
input_str

'a string example'

### Tokenization
Tokenization is the process of splitting the given text into smaller pieces called tokens. Words, numbers, punctuation marks, and others can be considered as tokens. 

In [47]:
from nltk.tokenize import sent_tokenize, word_tokenize
EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."

In [48]:
print(sent_tokenize(EXAMPLE_TEXT))

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]


In [46]:
print(word_tokenize(EXAMPLE_TEXT))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard', '.']


In [49]:
from nltk.corpus import stopwords
set(stopwords.words("english"))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [52]:
# Removing stop words
# We don’t want these words to take up space in our database or take up valuable processing time. 
#Therefore, we call these words “useless words” because they are useless and we want to treat them. 
#Another version of the word “stop word” can be written more: the words we stop at.

input_str = "NLTK is a leading platform for building Python programs to work with human language data."
stop_words = set(stopwords.words("english"))
from nltk.tokenize import word_tokenize
tokens = word_tokenize(input_str)
result = [i for i in tokens if not i in stop_words]
print (result)

['NLTK', 'leading', 'platform', 'building', 'Python', 'programs', 'work', 'human', 'language', 'data', '.']


### Stemming
- Stemming is a process of reducing words to their word stem, base or root form (for example, books — book, looked — look). 
- Porter stemming algorithm (removes common morphological and inflexional endings from words 
- Lancaster stemming algorithm (a more aggressive stemming algorithm). 

In [54]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
stemmer = PorterStemmer()
input_str = "There are several types of stemming algorithms."
input_str = word_tokenize(input_str)
for word in input_str:
    print(stemmer.stem(word))

there
are
sever
type
of
stem
algorithm
.


### Lemmatization
- The aim of lemmatization, like stemming, is to reduce inflectional forms to a common base form. As opposed to stemming, lemmatization does not simply chop off inflections. Instead it uses lexical knowledge bases to get the correct base forms of words.



In [55]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer = WordNetLemmatizer()
input_str = "been had done languages cities mice"
input_str = word_tokenize(input_str)
for word in input_str:
    print(lemmatizer.lemmatize(word))

been
had
done
language
city
mouse


### Part of speech tagging POS
- Part-of-speech tagging aims to assign parts of speech to each word of a given text (such as nouns, verbs, adjectives, and others) based on its definition and its context.

In [58]:
input_str = "Parts of speech examples: an article, to write, interesting, easily, and, of"
from textblob import TextBlob
result = TextBlob(input_str)
print(result.tags)

[('Parts', 'NNS'), ('of', 'IN'), ('speech', 'NN'), ('examples', 'NNS'), ('an', 'DT'), ('article', 'NN'), ('to', 'TO'), ('write', 'VB'), ('interesting', 'VBG'), ('easily', 'RB'), ('and', 'CC'), ('of', 'IN')]


### Chunking (shallow parsing)
- Chunking is a natural language process that identifies constituent parts of sentences (nouns, verbs, adjectives, etc.) and links them to higher order units that have discrete grammatical meanings (noun groups or phrases, verb groups, etc.)

In [59]:
input_str = "A black television and a white stove were bought for the new apartment of John."
from textblob import TextBlob
result = TextBlob(input_str)
print(result.tags)

[('A', 'DT'), ('black', 'JJ'), ('television', 'NN'), ('and', 'CC'), ('a', 'DT'), ('white', 'JJ'), ('stove', 'NN'), ('were', 'VBD'), ('bought', 'VBN'), ('for', 'IN'), ('the', 'DT'), ('new', 'JJ'), ('apartment', 'NN'), ('of', 'IN'), ('John', 'NNP')]


### Named entity recognition
- Named-entity recognition (NER) aims to find named entities in text and classify them into pre-defined categories (names of persons, locations, organizations, times, etc.).

In [63]:
from nltk import word_tokenize, pos_tag, ne_chunk
input_str = "Bill works for Apple so he went to Boston for a conference."
print(ne_chunk(pos_tag(word_tokenize(input_str))))

(S
  (PERSON Bill/NNP)
  works/VBZ
  for/IN
  Apple/NNP
  so/IN
  he/PRP
  went/VBD
  to/TO
  (GPE Boston/NNP)
  for/IN
  a/DT
  conference/NN
  ./.)
