# Stage 1: Business Understanding

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
import matplotlib.pyplot as plt

# Stage 2: Data Understanding

In [2]:
# df = pd.read_csv("Output.csv", sep='\t', error_bad_lines= False, nrows=1000) # testing for purpose only
df = pd.read_csv("Output.csv", sep='\t', error_bad_lines= False)
df.shape

(346355, 10)

# Stage 3: Data Preparation

In [3]:
df['review_class'] = [1 if x > 3 else 0 for x in df.overall]
df.head(2)

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,review_class
0,0,159985130X,"[1, 1]",5.0,This is a great little gadget to have around. ...,"01 5, 2011",ALC5GH8CAMAI7,AnnN,Handy little gadget,1294185600,1
1,1,159985130X,"[1, 1]",4.0,I would recommend this for a travel magnifier ...,"02 18, 2012",AHKSURW85PJUE,"AZ buyer ""AZ buyer""",Small & may need to encourage battery,1329523200,1


# Stage 4: Modeling

In [4]:
x_train, x_test, y_train, y_test = train_test_split(df.reviewText, df.review_class, test_size=0.3, random_state=0)
 
print("x_train shape: {}".format(x_train.shape), end='\n')
print("y_train shape: {}".format(y_train.shape), end='\n\n')
print("x_test shape: {}".format(x_test.shape), end='\n')
print("y_test shape: {}".format(y_test.shape), end='\n\n')


# Convert a collection of raw documents to a matrix
vectorizer = TfidfVectorizer(decode_error='replace', encoding='utf-8', max_features=5000)
# Learn vocabulary and idf from training set.
vectorizer.fit(x_train.values.astype('U'))
# Learn vocabulary and idf, return term-document matrix. 
# Uses the vocabulary and document frequencies (df) learned by fit.
x_train_tv = vectorizer.transform(x_train.values.astype('U'))
x_test_tv = vectorizer.transform(x_test.values.astype('U'))

x_train shape: (242448,)
y_train shape: (242448,)

x_test shape: (103907,)
y_test shape: (103907,)



In [5]:
from keras.models import Sequential
from keras.layers import Dense, Embedding

model = Sequential()
model.add(Dense(units=6, kernel_initializer="uniform", activation='relu', input_dim=5000))
model.add(Dense(units=6, kernel_initializer="uniform", activation='relu'))
model.add(Dense(units=1, kernel_initializer="uniform", activation='sigmoid'))

Using TensorFlow backend.


As this is binary classification problem, we'll use binary_crossentropy loss function adn the Adam optimizer is usually good choice(feel fre to try others)

In [7]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [8]:
model.fit(x_train_tv, y_train, batch_size=100, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x105beb828>

# Stage 5: Evaluation


In [11]:
y_predict = model.predict(x_test_tv)

print('Test score:', y_predict)

Test score: [[0.99689937]
 [0.6822423 ]
 [0.99395436]
 ...
 [0.497269  ]
 [0.7349057 ]
 [0.9369279 ]]


ValueError: Classification metrics can't handle a mix of binary and continuous targets

# Stage 6: Deployment