In [120]:
import re
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

## change default settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', -1)

In [121]:
reviews = pd.read_csv('reviews.csv')
reviews.head()

Unnamed: 0.1,Unnamed: 0,content
0,0,Very good
1,1,can't wait to watch it
2,2,Your ugly
3,3,Owen W0
4,4,thank you will take care of this matter


In [122]:
mydata = reviews.drop('Unnamed: 0', axis=1)
mydata.head()

Unnamed: 0,content
0,Very good
1,can't wait to watch it
2,Your ugly
3,Owen W0
4,thank you will take care of this matter


In [123]:
mydata.rename(columns = {'content':'review'}, inplace = True)
mydata.head(5)

Unnamed: 0,review
0,Very good
1,can't wait to watch it
2,Your ugly
3,Owen W0
4,thank you will take care of this matter


In [124]:
# Define a function to clean the text
def clean(text):
# Removes all special characters and numericals leaving the alphabets
    text = re.sub('[^A-Za-z]+', ' ', text)
    return text

# Cleaning the text in the review column
mydata['Cleaned Reviews'] = mydata['review'].apply(clean)
mydata.head()

Unnamed: 0,review,Cleaned Reviews
0,Very good,Very good
1,can't wait to watch it,can t wait to watch it
2,Your ugly,Your ugly
3,Owen W0,Owen W
4,thank you will take care of this matter,thank you will take care of this matter


##### Tokenization, POS tagging, Stopwords removal

In [125]:
import nltk
nltk.download('omw-1.4')
nltk.download('point')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Error loading point: Package 'point' not found in index
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [126]:
stop_words = stopwords.words('english')
mydata['stopword_coun'] = mydata['review'].apply(lambda x: len([x for x in x.split() if x in stop_words]))
mydata['stopword_coun']

0      0 
1      2 
2      0 
3      0 
4      4 
      .. 
496    6 
497    22
498    0 
499    11
500    0 
Name: stopword_coun, Length: 501, dtype: int64

In [127]:
mydata.describe()

Unnamed: 0,stopword_coun
count,501.0
mean,3.323353
std,5.806482
min,0.0
25%,0.0
50%,1.0
75%,4.0
max,44.0


In [128]:
mydata.head()

Unnamed: 0,review,Cleaned Reviews,stopword_coun
0,Very good,Very good,0
1,can't wait to watch it,can t wait to watch it,2
2,Your ugly,Your ugly,0
3,Owen W0,Owen W,0
4,thank you will take care of this matter,thank you will take care of this matter,4


In [129]:
# Lower case all words
mydata['review_lower'] = mydata['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [130]:
# Remove Punctuation
mydata['review_nopunc'] = mydata['review_lower'].str.replace('[^\w\s]', '')

In [131]:
# Import stopwords
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# Remove Stopwords
mydata['review_nopunc_nostop'] = mydata['review_nopunc'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))
mydata.head()

Unnamed: 0,review,Cleaned Reviews,stopword_coun,review_lower,review_nopunc,review_nopunc_nostop
0,Very good,Very good,0,very good,very good,good
1,can't wait to watch it,can t wait to watch it,2,can't wait to watch it,cant wait to watch it,cant wait watch
2,Your ugly,Your ugly,0,your ugly,your ugly,ugly
3,Owen W0,Owen W,0,owen w0,owen w0,owen w0
4,thank you will take care of this matter,thank you will take care of this matter,4,thank you will take care of this matter,thank you will take care of this matter,thank take care matter


In [147]:
# Return frequency of values
freq= pd.Series(" ".join(mydata['review_nopunc_nostop']).split()).value_counts()
freq

love           129
app            98 
disney         96 
great          80 
good           69 
               .. 
hasnt          1  
glitched       1  
magnificent    1  
hwve           1  
ecxelent       1  
Length: 881, dtype: int64

In [148]:
other_stopwords = ['get', 'us', 'see', 'use', 'said', 'asked', 'day', 'go' \
  'even', 'ive', 'right', 'left', 'always', 'would', 'told', \
  'get', 'us', 'would', 'get', 'one', 'ive', 'go', 'even', \
  'also', 'ever', 'x', 'take', 'let' ]
mydata['lemma'] = mydata['review_nopunc_nostop'].apply(lambda x: "".join(" ".join(x for x in x.split() if x not in other_stopwords)))

In [149]:
mydata

Unnamed: 0,review,Cleaned Reviews,stopword_coun,review_lower,review_nopunc,review_nopunc_nostop,lemma
0,Very good,Very good,0,very good,very good,good,good
1,can't wait to watch it,can t wait to watch it,2,can't wait to watch it,cant wait to watch it,cant wait watch,cant wait watch
2,Your ugly,Your ugly,0,your ugly,your ugly,ugly,ugly
3,Owen W0,Owen W,0,owen w0,owen w0,owen w0,owen w0
4,thank you will take care of this matter,thank you will take care of this matter,4,thank you will take care of this matter,thank you will take care of this matter,thank take care matter,thank care matter
...,...,...,...,...,...,...,...
496,they put all of the Star Wars movies and TV shows in storyline order,they put all of the Star Wars movies and TV shows in storyline order,6,they put all of the star wars movies and tv shows in storyline order,they put all of the star wars movies and tv shows in storyline order,put star wars movies tv shows storyline order,put star wars movies tv shows storyline order
497,I've had this app for 2 years now and all of a sudden i can't watch anything cuz it keeps say error 83. I try and fix it but still nothing. My kids and myself love this app but now what do we do,I ve had this app for years now and all of a sudden i can t watch anything cuz it keeps say error I try and fix it but still nothing My kids and myself love this app but now what do we do,22,i've had this app for 2 years now and all of a sudden i can't watch anything cuz it keeps say error 83. i try and fix it but still nothing. my kids and myself love this app but now what do we do,ive had this app for 2 years now and all of a sudden i cant watch anything cuz it keeps say error 83 i try and fix it but still nothing my kids and myself love this app but now what do we do,ive app 2 years sudden cant watch anything cuz keeps say error 83 try fix still nothing kids love app,app 2 years sudden cant watch anything cuz keeps say error 83 try fix still nothing kids love app
498,Amazing,Amazing,0,amazing,amazing,amazing,amazing
499,"2/3rds the price of Netflix and way better stuff to watch even if you don't like Marvel or Star Wars because they also own Fox, ABC, others...",rds the price of Netflix and way better stuff to watch even if you don t like Marvel or Star Wars because they also own Fox ABC others,11,"2/3rds the price of netflix and way better stuff to watch even if you don't like marvel or star wars because they also own fox, abc, others...",23rds the price of netflix and way better stuff to watch even if you dont like marvel or star wars because they also own fox abc others,23rds price netflix way better stuff watch even dont like marvel star wars also fox abc others,23rds price netflix way better stuff watch dont like marvel star wars fox abc others


In [150]:
mydata[['review','lemma']]

Unnamed: 0,review,lemma
0,Very good,good
1,can't wait to watch it,cant wait watch
2,Your ugly,ugly
3,Owen W0,owen w0
4,thank you will take care of this matter,thank care matter
...,...,...
496,they put all of the Star Wars movies and TV shows in storyline order,put star wars movies tv shows storyline order
497,I've had this app for 2 years now and all of a sudden i can't watch anything cuz it keeps say error 83. I try and fix it but still nothing. My kids and myself love this app but now what do we do,app 2 years sudden cant watch anything cuz keeps say error 83 try fix still nothing kids love app
498,Amazing,amazing
499,"2/3rds the price of Netflix and way better stuff to watch even if you don't like Marvel or Star Wars because they also own Fox, ABC, others...",23rds price netflix way better stuff watch dont like marvel star wars fox abc others


## Sentiment Analysis using TextBlob:

In [151]:
from textblob import TextBlob

def getSubjectivity(review):
    return TextBlob(review).sentiment.subjectivity
    # function to calculate polarity
def getPolarity(review):
        return TextBlob(review).sentiment.polarity
def analysis(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

In [152]:
fin_data = pd.DataFrame(mydata[['review', 'lemma']])


In [153]:
# fin_data['Subjectivity'] = fin_data['Lemma'].apply(getSubjectivity) 
fin_data['Polarity'] = fin_data['lemma'].apply(getPolarity) 
fin_data['Analysis'] = fin_data['Polarity'].apply(analysis)
fin_data

Unnamed: 0,review,lemma,Polarity,Analysis
0,Very good,good,0.70,Positive
1,can't wait to watch it,cant wait watch,0.00,Neutral
2,Your ugly,ugly,-0.70,Negative
3,Owen W0,owen w0,0.00,Neutral
4,thank you will take care of this matter,thank care matter,0.00,Neutral
...,...,...,...,...
496,they put all of the Star Wars movies and TV shows in storyline order,put star wars movies tv shows storyline order,0.00,Neutral
497,I've had this app for 2 years now and all of a sudden i can't watch anything cuz it keeps say error 83. I try and fix it but still nothing. My kids and myself love this app but now what do we do,app 2 years sudden cant watch anything cuz keeps say error 83 try fix still nothing kids love app,0.25,Positive
498,Amazing,amazing,0.60,Positive
499,"2/3rds the price of Netflix and way better stuff to watch even if you don't like Marvel or Star Wars because they also own Fox, ABC, others...",23rds price netflix way better stuff watch dont like marvel star wars fox abc others,0.50,Positive


In [154]:
tb_counts = fin_data.Analysis.value_counts()

tb_counts

Positive    398
Neutral     93 
Negative    10 
Name: Analysis, dtype: int64