# Basic Text Preprocessing

In [3]:
import numpy as np
import pandas as pd

In [4]:
#About Dataset--
#IMDB dataset having 50K movie reviews for natural language processing or Text analytics.
#This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training and 25,000 for testing. So, predict the number of positive and negative reviews using either classification or deep learning algorithms.
df=pd.read_csv('IMDB Dataset.csv')

In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
df.shape

(50000, 2)

In [7]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [8]:
df.review[3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

# Lowercasing

In [9]:
#Selected portion
df.review[3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [10]:
#Whole column in the dataset
df.review.str.lower()

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

# Remove HTML Tags

In [11]:
import re
def rem_html_tags(txt):
    pattern=re.compile('<.*?>') #Regular expression
    return pattern.sub(r'',txt)

In [12]:
df.review.apply(rem_html_tags)

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. The filming tec...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

# Remove URL


In [13]:
import re
def remove_url(txt):
    pattern=re.compile('https?://\S+|www\.\S+')
    return pattern.sub(r'',txt)

In [14]:
df.review.apply(remove_url)

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

# Remove Punctuation

In [15]:
import string,time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [16]:
punc_set=string.punctuation

In [17]:
def remove_punctuation(txt):
    for char in punc_set:
        txt=txt.replace(char,'')
    return txt

In [18]:
txt='text? with , punctuation!'


In [19]:
#How much time to execute
start=time.time()
print(remove_punctuation(txt))
t1=time.time()-start
print(t1)  #It;s slow

text with  punctuation
0.0


In [20]:
def rem_punc(txt):
    return txt.translate(str.maketrans('','',punc_set))

In [21]:
#How much time to execute
start=time.time()
print(rem_punc(txt))
t1=time.time()-start
print(t1)  #It;s fast

text with  punctuation
0.0


# Spelling Correction

In [22]:
from textblob import TextBlob


AttributeError: partially initialized module 'nltk' has no attribute 'data' (most likely due to a circular import)

In [None]:
incr_txt='I am 24 yeers olld.'
txtblb=TextBlob(incr_txt)
txtblb.correct().string

'I am 24 years old.'

In [None]:
incr_txt=df.review[3]
print(incr_txt)

Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them.


In [None]:
txtblb=TextBlob(incr_txt)
txtblb.correct().string

"Basically there's a family where a little boy (Take) thinks there's a combine in his closet & his parents are fighting all the time.<br /><br />His movie is slower than a soap opera... and suddenly, Take decides to become Lumbo and kill the combine.<br /><br />of, first of all when you're going to make a film you must Decide if its a thrilled or a drama! Is a drama the movie is watchable. Parents are diverting & arguing like in real life. And then we have Take with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thrilled spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogue. Is for the shots with Take: just ignore them."

# Remove Stop Word

In [None]:
from nltk.corpus import stopwords
stopwords.words('bengali')

['অতএব',
 'অথচ',
 'অথবা',
 'অনুযায়ী',
 'অনেক',
 'অনেকে',
 'অনেকেই',
 'অন্তত',
 'অন্য',
 'অবধি',
 'অবশ্য',
 'অর্থাত',
 'আই',
 'আগামী',
 'আগে',
 'আগেই',
 'আছে',
 'আজ',
 'আদ্যভাগে',
 'আপনার',
 'আপনি',
 'আবার',
 'আমরা',
 'আমাকে',
 'আমাদের',
 'আমার',
 'আমি',
 'আর',
 'আরও',
 'ই',
 'ইত্যাদি',
 'ইহা',
 'উচিত',
 'উত্তর',
 'উনি',
 'উপর',
 'উপরে',
 'এ',
 'এঁদের',
 'এঁরা',
 'এই',
 'একই',
 'একটি',
 'একবার',
 'একে',
 'এক্',
 'এখন',
 'এখনও',
 'এখানে',
 'এখানেই',
 'এটা',
 'এটাই',
 'এটি',
 'এত',
 'এতটাই',
 'এতে',
 'এদের',
 'এব',
 'এবং',
 'এবার',
 'এমন',
 'এমনকী',
 'এমনি',
 'এর',
 'এরা',
 'এল',
 'এস',
 'এসে',
 'ঐ',
 'ও',
 'ওঁদের',
 'ওঁর',
 'ওঁরা',
 'ওই',
 'ওকে',
 'ওখানে',
 'ওদের',
 'ওর',
 'ওরা',
 'কখনও',
 'কত',
 'কবে',
 'কমনে',
 'কয়েক',
 'কয়েকটি',
 'করছে',
 'করছেন',
 'করতে',
 'করবে',
 'করবেন',
 'করলে',
 'করলেন',
 'করা',
 'করাই',
 'করায়',
 'করার',
 'করি',
 'করিতে',
 'করিয়া',
 'করিয়ে',
 'করে',
 'করেই',
 'করেছিলেন',
 'করেছে',
 'করেছেন',
 'করেন',
 'কাউকে',
 'কাছ',
 'কাছে',
 'কাজ',
 'কাজে',
 'কারও',
 '

In [None]:
from nltk.corpus import stopwords
stopwords.words('english')


AttributeError: partially initialized module 'nltk' has no attribute 'internals' (most likely due to a circular import)

In [None]:
def rem_stop_words(txt):
    new_txt=[]
    for words in txt:
        if words in stopwords.words('english'):
            new_txt.append('')
        else:
            new_txt.append(words)
    x=new_txt[:]
    new_txt.clear()
    return " ".join(x)
    

In [23]:
df.review.apply(lambda x: rem_stop_words(x))

NameError: name 'rem_stop_words' is not defined

# Tokenization

##use split function

In [None]:
#Word Tokenize
txt1='I am a bad boy'
txt1.split()

['I', 'am', 'a', 'bad', 'boy']

In [None]:
#sentence tokenize
txt2='I am showman.I am from Chittagong.I am 24.'
txt2.split('.')


['I am showman', 'I am from Chittagong', 'I am 24', '']

In [None]:
#Now Problems with split function
txt3='Where are you going?I am singing.'
txt3.split('.')

['Where are you going?I am singing', '']

## NLTK

In [None]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [None]:
txt4='I am from delhi!'
word_tokenize(txt4)

['I', 'am', 'from', 'delhi', '!']

In [None]:
txt5='I have a M.sc. in A.I'
word_tokenize(txt5)


['I', 'have', 'a', 'M.sc', '.', 'in', 'A.I']

In [None]:
txt6='What are you doing man? I am dancing now.'
sent_tokenize(txt6)

['What are you doing man?', 'I am dancing now.']

# Spacy

In [None]:
import spacy
nlp=spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
doc1=nlp(txt5)
for token in doc1:
    print(token)

I
have
a
M.sc
.
in
A.I


In [None]:
doc1=nlp(txt2)
for token in doc1:
    print(token)

I
am
showman
.
I
am
from
Chittagong
.
I
am
24
.


In [None]:
sent='My email id is das@gmail.com'
doc2=nlp(sent)
for token in doc2:
    print(token)

My
email
i
d
is
das@gmail.com


# Stemming

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
ps=PorterStemmer()
def stem_words(txt):
    return " ".join([ps.stem(word) for word in txt.split()])

In [None]:
words='walk walks walking'
stem_words(words)

'walk walk walk'

In [None]:
txt='Send text online without worrying about phone bills. Free SMS to hundreds of GSM operators worldwide'
stem_words(txt)

'send text onlin without worri about phone bills. free sm to hundr of gsm oper worldwid'

# Lemmatizing

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
paragraph="""There's a thin line between likably old-fashioned and fuddy-duddy, and The Count of Monte Cristo ... never quite settles on either side.
The Rotten Tomatoes movie review dataset is a corpus of movie reviews used for sentiment analysis, originally collected by Pang and Lee [1]. In their work on sentiment treebanks, Socher et al. [2] used Amazon's Mechanical Turk to create fine-grained labels for all parsed phrases in the corpus. This competition presents a chance to benchmark your sentiment-analysis ideas on the Rotten Tomatoes dataset. You are asked to label phrases on a scale of five values: negative, somewhat negative, neutral, somewhat positive, positive. Obstacles like sentence negation, sarcasm, terseness, language ambiguity, and many others make this task very challenging."""


In [None]:
#convert the whole paragraph in sentences
sent=sent_tokenize(paragraph)
lemmatizer=WordNetLemmatizer()

In [None]:
#lemmatizing
for i in range(len(sent)):
    words=word_tokenize(sent[i])
    words=[lemmatizer.lemmatize(word) for word in words  if word not in set(stopwords.words('english'))]
    sent[i]= ' '.join(words)
    print(sent)

["There 's thin line likably old-fashioned fuddy-duddy , The Count Monte Cristo ... never quite settle either side .", 'The Rotten Tomatoes movie review dataset corpus movie review used sentiment analysis , originally collected Pang Lee [ 1 ] .', 'In work sentiment treebanks , Socher et al .', "[ 2 ] used Amazon 's Mechanical Turk create fine-grained label parsed phrase corpus .", 'This competition present chance benchmark sentiment-analysis idea Rotten Tomatoes dataset .', 'You asked label phrase scale five value : negative , somewhat negative , neutral , somewhat positive , positive .', 'Obstacles like sentence negation , sarcasm , terseness , language ambiguity , many others make task challenging .']
["There 's thin line likably old-fashioned fuddy-duddy , The Count Monte Cristo ... never quite settle either side .", 'The Rotten Tomatoes movie review dataset corpus movie review used sentiment analysis , originally collected Pang Lee [ 1 ] .', 'In work sentiment treebanks , Socher et