In [103]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Setup Installing nltk
!pip install -q wordcloud
import wordcloud

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [104]:
import pandas as pd

amazon_file = 'amazon_file.csv'
df = pd.read_csv(amazon_file)

df.head()

Unnamed: 0,comments,0/1
0,"Addictive game Very good game, the graphics ar...",1
1,Where to start? I came on here because I wante...,0
2,Really Disappointed in this Film I have to say...,0
3,"Rubbish game When I tried to open it, it said ...",0
4,One Star Come on,0


# 1. Data Exploration

We have to make sure there aren't any null values.

In [105]:
pd.isnull(df).any()

comments    False
0/1         False
dtype: bool

At this point, we are going to figure out how many comments are labelled positive and negative.

In [106]:
df['0/1'].unique()

array([1, 0])

In [107]:
df['0/1'].value_counts()

0    5046
1    4953
Name: 0/1, dtype: int64

In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   comments  9999 non-null   object
 1   0/1       9999 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 156.4+ KB


# 2. Data Cleaning

There are several stuff that we have to do before we start this sentiment analysis:
1. remove lowercase letters
2. remove punctuations
3. remove stop words in English

### Remove URL & html tags

In [109]:
import re

def remove_URL(text) :
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"",text)

def remove_html(text) :
    html= re.compile("<.*?>")
    return html.sub(r"",text)

In [110]:
df['comments'] = df.apply(lambda row: remove_URL(row['comments']), axis=1)
df['comments'] = df.apply(lambda row: remove_html(row['comments']), axis=1)

### Remove Emoji

In [132]:

import re


def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)



In [112]:
df['comments'] = df.apply(lambda row: deEmojify(row['comments']), axis=1)

In [113]:
df.head()

Unnamed: 0,comments,0/1
0,"Addictive game Very good game, the graphics ar...",1
1,Where to start? I came on here because I wante...,0
2,Really Disappointed in this Film I have to say...,0
3,"Rubbish game When I tried to open it, it said ...",0
4,One Star Come on,0


## 2.1 Removing Lowercase Letters

In [114]:
df['comments'] = df.apply(lambda row: str(row['comments']).lower(), axis=1)

In [115]:
df.head()

Unnamed: 0,comments,0/1
0,"addictive game very good game, the graphics ar...",1
1,where to start? i came on here because i wante...,0
2,really disappointed in this film i have to say...,0
3,"rubbish game when i tried to open it, it said ...",0
4,one star come on,0


## 2.2 Removing Punctuations

In [116]:
from string import punctuation
def remove_punctuations(string):
    return ''.join(c for c in string if c not in punctuation)

In [117]:
df['comments'] = df.apply(lambda row: remove_punctuations(row['comments']), axis=1)

In [118]:
df.head()

Unnamed: 0,comments,0/1
0,addictive game very good game the graphics are...,1
1,where to start i came on here because i wanted...,0
2,really disappointed in this film i have to say...,0
3,rubbish game when i tried to open it it said i...,0
4,one star come on,0


## 2.3 Removing Stopwords & Stemming

In [119]:
 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

stop_words = set(stopwords.words('english'))

def remove_stopwords(string):    
    tokenized = word_tokenize(string)
    filtered_sentence = [word for word in tokenized if not word in stop_words]
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in filtered_sentence]
    return ' '.join(c for c in stemmed)
 

In [120]:
df['comments'] = df.apply(lambda row: remove_stopwords(row['comments']), axis=1)


In [121]:
df.head()

Unnamed: 0,comments,0/1
0,addict game good game graphic brilliant great ...,1
1,start came want get flappi bird saw thought ba...,0
2,realli disappoint film say look forward remak ...,0
3,rubbish game tri open said stoppedfix higher rate,0
4,one star come,0


In [122]:
def convert(integer):
    if(integer == 1):
        return 'Positive'
    else:
        return 'Negative'

In [123]:
df['0/1'] = df.apply(lambda row: convert(row['0/1']), axis=1)

In [124]:
from sklearn.model_selection import train_test_split

X = df['comments']
y = df['0/1']



In [125]:
one_hot_encoded_label = pd.get_dummies(y)
one_hot_encoded_label.head()

Unnamed: 0,Negative,Positive
0,0,1
1,1,0
2,1,0
3,1,0
4,1,0


In [126]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

### Count Vectorizer

In [127]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(min_df=2, ngram_range=(1, 1))
X_train = vect.fit(X_train).transform(X_train) 
X_test = vect.transform(X_test)

### Logistic Regression

In [128]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

c_val = [0.05,0.1,0.2,0.25,0.5,0.75, 1]

for c in c_val:
    logreg = LogisticRegression(C=c)
    logreg.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" % (c, accuracy_score(y_test, logreg.predict(X_test))))

Accuracy for C=0.05: 0.896
Accuracy for C=0.1: 0.896
Accuracy for C=0.2: 0.8972
Accuracy for C=0.25: 0.896
Accuracy for C=0.5: 0.8976
Accuracy for C=0.75: 0.8952
Accuracy for C=1: 0.8924


### Support Vector Machine

In [129]:
from sklearn.svm import SVC
classifier = SVC(random_state = 10)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cm
accuracy_score(y_test, y_pred)

0.898

### Decision Tree Classifier

In [130]:
from sklearn.tree import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier(criterion ='entropy', max_depth = 10, random_state = 2)
dt_classifier.fit(X_train, y_train)

pred_train = dt_classifier.predict(X_train)
pred_test = dt_classifier.predict(X_test)
from sklearn.metrics import accuracy_score
print('Training Accuracy: ', accuracy_score(y_train, pred_train))
print('Testing Accuracy: ', accuracy_score(y_test, pred_test))

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 2)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

Training Accuracy:  0.8153087078277104
Testing Accuracy:  0.7912
(7499,)
(7499,)
(2500,)
(2500,)


### XGBoost Classifier

In [131]:
import xgboost as xgb
xg_classifier = xgb.XGBClassifier(n_estimators = 20)
xg_classifier.fit(X_train, y_train)
pred_train = xg_classifier.predict(X_train)
pred_test = xg_classifier.predict(X_test)



from sklearn.metrics import accuracy_score
print('Training Accuracy: ', accuracy_score(y_train, pred_train))
print('Testing Accuracy: ', accuracy_score(y_test, pred_test))



Training Accuracy:  0.549406587545006
Testing Accuracy:  0.4904
