# Import the Required Packages

In [56]:
import numpy as np
import pandas as pd
# For visualizations
import matplotlib.pyplot as plt
# For regular expressions
import re
# For handling string
import string
# For performing mathematical operations
import math
#for natural language processing
import nltk

In [42]:
# Importing dataset
df=pd.read_excel('Iphone_4s_Reviews.xlsx') 
print("Shape of data=>",df.shape)

Shape of data=> (36440, 2)


In [43]:
df

Unnamed: 0,Ratings,Reviews
0,5,Big billion day makes this phone even more val...
1,5,Every thing is fine n more then expected...
2,2,Has many drawbacks. Apps developed for iPhone ...
3,5,The phone was in superb condition... im lovin ...
4,5,Great Product.
...,...,...
36435,5,This product is just amazingðŸ˜™
36436,5,Very tactile\n \n \n\n\n\n
36437,5,Very good according to price and features!! On...
36438,5,Original!! Rest no need to explain.. Apple Nam...


# Remove the Null, missing values

In [44]:
df.isnull().sum()

Ratings      0
Reviews    755
dtype: int64

In [45]:
df.dropna(inplace=True)

In [46]:
df.isnull().sum()

Ratings    0
Reviews    0
dtype: int64

In [47]:
df

Unnamed: 0,Ratings,Reviews
0,5,Big billion day makes this phone even more val...
1,5,Every thing is fine n more then expected...
2,2,Has many drawbacks. Apps developed for iPhone ...
3,5,The phone was in superb condition... im lovin ...
4,5,Great Product.
...,...,...
36435,5,This product is just amazingðŸ˜™
36436,5,Very tactile\n \n \n\n\n\n
36437,5,Very good according to price and features!! On...
36438,5,Original!! Rest no need to explain.. Apple Nam...


# Labelling Reviews:

Now we have 35,685 reviews. The reviews with star rating 4,5,3 are labelled as positive reviews and 1,2 are labelled as negative reviews.

In [48]:
df['Ratings']=df['Ratings'].astype(int) #convert the star_rating column to int
df=df[df['Ratings']!=0]
df['label']=np.where(df['Ratings']>=3,1,0) #1-Positve,0-Negative

In [49]:
df

Unnamed: 0,Ratings,Reviews,label
0,5,Big billion day makes this phone even more val...,1
1,5,Every thing is fine n more then expected...,1
2,2,Has many drawbacks. Apps developed for iPhone ...,0
3,5,The phone was in superb condition... im lovin ...,1
4,5,Great Product.,1
...,...,...,...
36435,5,This product is just amazingðŸ˜™,1
36436,5,Very tactile\n \n \n\n\n\n,1
36437,5,Very good according to price and features!! On...,1
36438,5,Original!! Rest no need to explain.. Apple Nam...,1


# Pre-Processing

In [50]:
# convert the all reviews into the lower case.

In [51]:
data['pre_process'] = data['Reviews'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))

In [52]:
# Remove the HTML tags and URLs from the reviews.

In [53]:
from bs4 import BeautifulSoup
data['pre_process']=data['pre_process'].apply(lambda x: BeautifulSoup(x).get_text())
import re
data['pre_process']=data['pre_process'].apply(lambda x: re.sub(r"http\S+", "", x))

In [54]:
# Perform the Contractions on the reviews.

In [55]:
def contractions(s):
 s = re.sub(r"won't", "will not",s)
 s = re.sub(r"would't", "would not",s)
 s = re.sub(r"could't", "could not",s)
 s = re.sub(r"\'d", "would",s)
 s = re.sub(r"can\'t", "can not",s)
 s = re.sub(r"n\'t", " not", s)
 s= re.sub(r"\'re", " are", s)
 s = re.sub(r"\'s", " is", s)
 s = re.sub(r"\'ll", " will", s)
 s = re.sub(r"\'t", " not", s)
 s = re.sub(r"\'ve", " have", s)
 s = re.sub(r"\'m", " am", s)
 return s
data['pre_process']=data['pre_process'].apply(lambda x:contractions(x))

# Remove non-alpha characters

In [57]:
data['pre_process']=data['pre_process'].apply(lambda x: " ".join([re.sub('[^A-Za-z]+','', x) for x in nltk.word_tokenize(x)]))

# Remove the extra spaces between the words

In [59]:
data['pre_process']=data['pre_process'].apply(lambda x: re.sub(' +', ' ', x))

# Remove the stop words by using the NLTK package

In [61]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
data['pre_process']=data['pre_process'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))

# Perform lemmatization using the wordnet lemmatizer

In [62]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
data['pre_process']=data['pre_process'].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)]))

In [63]:
data

Unnamed: 0,Ratings,Reviews,label,pre_process
0,1,Such waste mobile from apple. It will hang and...,0,waste mobile apple hang u use app space issue ...
1,1,Within a month charger didn't work ......,0,within month charger work
2,1,Battery not working properly gets discharged b...,0,battery working properly get discharged hour use
3,2,Planning to order iphone 4sS Hows the bat...,0,planning order iphone s hows battery life comp...
4,1,4 inches.\n \n 2013 design.\n \n No NFC.\n...,0,inch design nfc bluetooth wireless charging ba...
...,...,...,...,...
19403,5,iPhone is always master in mobiles. It's reall...,1,iphone always master mobile really amazing tha...
19404,4,"Battery has been the problem, 10% battery heal...",1,battery problem battery health month buy phone...
19405,5,Just have it to feel it\n \n \n\n\n\n,1,feel
19406,5,Well for a Day I was on Gas as my delivery for...,1,well day gas delivery delayed day already paid...
