# Basic Data Preprocessing in NLP

### Import libraries

In [1]:
import pandas as pd
import re
import nltk
# nltk.word_tokenize needs the Punkt tokenizer data. Older NLTK releases load
# the 'punkt' package, while newer releases look up 'punkt_tab' instead;
# fetching both keeps a fresh "Restart & Run All" working on either.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pooja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Load the Dataset

In [2]:
# Read the reviews from IMDB.csv (path is relative to the working directory)
# and preview the first rows.
data = pd.read_csv("IMDB.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Total no. of Rows & Columns

In [3]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(50000, 2)

### Missing Values

In [4]:
# Count the missing entries in each column.
data.isnull().sum()

review       0
sentiment    0
dtype: int64

### Duplicate Values

In [5]:
# Count rows that are exact repeats of an earlier row.
data.duplicated().sum()

418

### Drop Duplicate Values

In [6]:
# Keep the first occurrence of every row and discard the repeats.
data = data.drop_duplicates()

In [7]:
# Re-check after dropping: no duplicated rows should remain.
data.duplicated().sum()

0

### sort_index()

In [8]:
# Number of reviews per sentiment label, most frequent label first.
data['sentiment'].value_counts()

sentiment
positive    24884
negative    24698
Name: count, dtype: int64

In [9]:
# Same counts, ordered by the label (the index) instead of by frequency.
data['sentiment'].value_counts().sort_index()

sentiment
negative    24698
positive    24884
Name: count, dtype: int64

Here, the sentiment labels (`negative`, `positive`) form the index of the counts, so `sort_index()` orders the rows alphabetically by label rather than by frequency.

### To Remove HTML Tags 

In [10]:
# Compiled once at definition time instead of on every call.
_HTML_TAG_RE = re.compile('<.*?>')


def remove_html(text):
    """Strip HTML tags from ``text`` without gluing neighbouring words together.

    Each tag is replaced by a space rather than by the empty string, so markup
    that sits directly between two words (e.g. ``"end.<br /><br />Next"``) does
    not merge them into ``"end.Next"``. Runs of whitespace are then collapsed
    to a single space and leading/trailing whitespace is removed, so text that
    already had a space next to a tag ends up with exactly one space there.

    Args:
        text: The raw string, possibly containing tags such as ``<br />``.

    Returns:
        The text with every ``<...>`` span removed and whitespace normalised.
    """
    without_tags = _HTML_TAG_RE.sub(' ', text)
    # split() with no argument drops empty pieces, which collapses the extra
    # spaces introduced by the substitution above.
    return ' '.join(without_tags.split())

In [11]:
# Pull the review stored under index label 1 to use as a worked example.
eg = data.loc[1, 'review']
eg

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [12]:
# Run the tag stripper on the single example review and show the cleaned text.
result = remove_html(eg)
result

'A wonderful little production. The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well done.'

##### Apply it to the whole dataset

In [13]:
# Strip the tags from every review in the column, not just the one example.
data['review'] = data['review'].map(remove_html)

In [14]:
# Preview the first rows after the HTML tags have been removed.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### To Convert to Lower Case

In [15]:
def lowerCase(text):
    """Return a copy of ``text`` with all cased characters lower-cased."""
    lowered = text.lower()
    return lowered

In [16]:
# Lower-case every review so later comparisons ignore capitalisation.
data['review'] = data['review'].map(lowerCase)

In [17]:
# Preview the first rows after lower-casing.
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### To Remove Punctuation Marks

In [18]:
import string
exclude = string.punctuation
# Build the deletion table once. It depends only on `exclude`, so rebuilding
# it on every call (once per review under .apply()) is wasted work.
_PUNCTUATION_TABLE = str.maketrans('', '', exclude)


def remove_punctuation(text):
    """Return ``text`` with every character listed in ``exclude`` deleted.

    ``exclude`` is ``string.punctuation``, i.e. the ASCII punctuation
    characters. Characters are deleted outright rather than replaced by a
    space, so ``"there's"`` becomes ``"theres"``.

    Args:
        text: The string to clean.

    Returns:
        A new string without the punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [19]:
# Delete punctuation characters from every review.
data['review'] = data['review'].map(remove_punctuation)

In [20]:
# Preview the first rows after punctuation removal.
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


### To Remove Stopwords

In [21]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# Load the English stop-word list once. Building the list and the set inside
# the function repeated that work for every review passed through .apply().
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is split on whitespace, each word is compared case-insensitively
    against NLTK's English stop-word list, and the surviving words are joined
    back together with single spaces.

    NOTE(review): in this notebook punctuation is stripped before this step,
    so any stop-word entries that contain an apostrophe can no longer match
    their punctuation-free form in the text. Consider whether that is intended.

    Args:
        text: The string to filter.

    Returns:
        The remaining words joined by single spaces.
    """
    kept_words = [
        word for word in text.split()
        if word.lower() not in _ENGLISH_STOP_WORDS
    ]
    return ' '.join(kept_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pooja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Filter the stop words out of every review.
data['review'] = data['review'].map(remove_stopwords)

In [23]:
# Preview the first rows after stop-word removal.
data.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive


### word_tokenize()

In [24]:
def word_tokenize(text):
    """Split ``text`` into tokens with NLTK's word tokenizer.

    Note: this definition rebinds the name ``word_tokenize`` that the import
    cell brought in from ``nltk.tokenize``. The call below goes through the
    ``nltk`` package namespace, so it does not recurse into this wrapper.
    """
    tokens = nltk.word_tokenize(text)
    return tokens

In [25]:
# Replace each review string with its tokens. This cell expects the column to
# still hold strings, so it is meant to be run once per kernel session.
data['review'] = data['review'].map(word_tokenize)

In [26]:
# Preview the first rows; each review now holds its tokens.
data.head()

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",positive
1,"[wonderful, little, production, filming, techn...",positive
2,"[thought, wonderful, way, spend, time, hot, su...",positive
3,"[basically, theres, family, little, boy, jake,...",negative
4,"[petter, matteis, love, time, money, visually,...",positive
