In [1]:
import nltk
#nltk.download()
##### python -m nltk.downloader all
##### python -m nltk.downloader -d /usr/local/share/nltk_data all


In [2]:
nltk.__version__

'3.2.4'

# Text Cleaning
The data scraped from the website is mostly in the raw text form same applies for other unstructured data. The data needs to be cleaned before analyzing it or fitting a model to it. Cleaning or pre-processing the data consists of a few steps based on the data and the requirement.


# 1. Removing extra spaces
Most of the time the text data that you have may contain extra spaces in between the words, after or before a sentence. So to start with we will remove these extra spaces from each sentence by using regular expressions.

In [4]:
import re
doc = "NLP  is an interesting     field.  "
new_doc = re.sub("\s+"," ", doc)
print(new_doc)

NLP is an interesting field. 


# 2. Removing Punctuations
Sometimes (not always) punctuations do not add any value to data. During such times its better to remove the punctuations

In [6]:
"I like NLP." == 'I like NLP'

False

In [7]:
text = "Hello! How are you!! I'm very excited that you're going for a trip to Europe!! Yayy!"
re.sub("[^-9A-Za-z ]", "" , text)

'Hello How are you Im very excited that youre going for a trip to Europe Yayy'

In [8]:
# Punctuations can also be removed by using a package from the string library.
import string
text = "Hello! How are you!! I'm very excited that you're going for a trip to Europe!! Yayy!"
text_clean = "".join([i for i in text if i not in string.punctuation])
text_clean

'Hello How are you Im very excited that youre going for a trip to Europe Yayy'

# 3. Case Normalization
Convert the case of all characters in the text to either upper or lower case. As python is a case sensitive language so it will treat NLP and nlp differently.

In [9]:
import string
text = "Hello! How are you!! I'm very excited that you're going for a trip to Europe!! Yayy!"
text_clean = "".join([i.lower() for i in text if i not in string.punctuation])
text_clean

'hello how are you im very excited that youre going for a trip to europe yayy'

# 4. Tokenization: 
Splitting a sentence into words and creating a list, ie each sentence is a list of words.
Three types of tokenization
- Word Tokenization 

In [10]:
text = "Hello! How are you!! I'm very excited that you're going for a trip to Europe!! Yayy!"
nltk.tokenize.word_tokenize(text)

['Hello',
 '!',
 'How',
 'are',
 'you',
 '!',
 '!',
 'I',
 "'m",
 'very',
 'excited',
 'that',
 'you',
 "'re",
 'going',
 'for',
 'a',
 'trip',
 'to',
 'Europe',
 '!',
 '!',
 'Yayy',
 '!']

- Tweet tokenization
specifically used while dealing with text data from social media consisting of #,@, emoticons.

In [11]:
text = "Hello! How are you!! I'm very excited that you're going for a trip to Europe!! Yayy!"
from nltk.tokenize import TweetTokenizer
tweet = TweetTokenizer()
tweet.tokenize(text)

['Hello',
 '!',
 'How',
 'are',
 'you',
 '!',
 '!',
 "I'm",
 'very',
 'excited',
 'that',
 "you're",
 'going',
 'for',
 'a',
 'trip',
 'to',
 'Europe',
 '!',
 '!',
 'Yayy',
 '!']

- regexp_tokenize: 
It can be used when we want to separate words of our interests which follows a common pattern like extracting all hashtags from tweets, addresses from tweets, or hyperlinks from the text. In this, you can use the normal regular expression functions to separate the words.

In [13]:
import re
a = 'What are your views related to US elections @nitin'
re.split('\s@', a)

['What are your views related to US elections', 'nitin']

# Stopwords
Stopwords are the English words which does not add much meaning to a sentence. They can safely be ignored without sacrificing the meaning of the sentence. For example, the words like the, he, have etc.

In [14]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [15]:
from nltk.corpus import stopwords
text="Today is a great day. It is even better than yesterday. And yesterday was the best day ever!"
stopwords=set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
words=word_tokenize(text)
wordsFiltered=[]
for w in words:
        if w not in stopwords:
                 wordsFiltered.append(w)
wordsFiltered

['Today',
 'great',
 'day',
 '.',
 'It',
 'even',
 'better',
 'yesterday',
 '.',
 'And',
 'yesterday',
 'best',
 'day',
 'ever',
 '!']

# Lemmatization & Stemming
Final step of pre-processing is usually Lemmatization & Stemming