In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
# importing Natural Language Toolkit 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [4]:
pd.options.mode.chained_assignment = None

In [5]:
data = pd.read_csv("./Complete.csv",low_memory=False)

In [6]:
data =  data[['brand','categories','manufacturer','reviews.username','reviews.title','reviews.text','reviews.numHelpful',
                   'reviews.rating']]

In [7]:
data.head()

Unnamed: 0,brand,categories,manufacturer,reviews.username,reviews.title,reviews.text,reviews.numHelpful,reviews.rating
0,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Amazon,Adapter,Kindle,This product so far has not disappointed. My c...,0.0,5.0
1,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Amazon,truman,very fast,great for beginner or experienced person. Boug...,0.0,5.0
2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Amazon,DaveZ,Beginner tablet for our 9 year old son.,Inexpensive tablet for him to use and learn on...,0.0,5.0
3,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Amazon,Shacks,Good!!!,I've had my Fire HD 8 two weeks now and I love...,0.0,4.0
4,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Amazon,explore42,Fantastic Tablet for kids,I bought this for my grand daughter when she c...,0.0,5.0


In [28]:
file = data[data["reviews.rating"]==1.0]

In [31]:
file["reviews.text"].iloc[0]

'i Bought this around black friday for $60 hoping it would be awesome... it failed so hard i tried multiple different micro SD cards none of which were recognized and YES i formated them with every format i could think of ... Fat32, NTFS, Fat, Xfat... i even tried to have the tablet do it... didnt work... to make matters worse half the apps i wanted to use werent in the app store and i came to find out that it isnt linked to the normal google play store this tablet has its own app store which is missing many common apps... the main reason i bought this was to play clash of clans and i cant because it wasnt on the app store... i tried to also use aftermarket play stores to play COC but it didnt work... launched and played 1 time but didnt work or update after that... needless to say i returned it and bought a $250 samsung galaxy tab A 10.1 (2016 version) with S-pen and its WAYYYYY better... bottom line you get what you pay for... also hint the s-pen version has an extra 1 GB of ram over

In [8]:
data['reviews.clean'] = data['reviews.text'].str.replace('\d+', '')
data = data.dropna()

In [13]:
from nltk.corpus import stopwords
stopset = set(stopwords.words("english"))
vectorizer = TfidfVectorizer(stop_words=stopset,binary=True)

In [14]:
X = vectorizer.fit_transform(data['reviews.clean'])
y = data['reviews.rating'].map({1.0:0,2.0:0,3.0:0,4.0:1,5.0:1})

<34127x13078 sparse matrix of type '<class 'numpy.float64'>'
	with 471824 stored elements in Compressed Sparse Row format>

In [19]:
y.value_counts()

1    31889
0     2238
Name: reviews.rating, dtype: int64

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, train_size=0.80, random_state=42)

In [16]:
# Import the models from sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# Initialize the three models
model = MultinomialNB(alpha=1.0,fit_prior=True)

model.fit(X_train,y_train)
print(f1_score(y_test,model.predict(X_test)))

0.9657471961200363


In [13]:
import pickle
from sklearn.externals import joblib

# save vertorizer information
with open('TfidfVectorizerModel.pkl','wb') as TfidfVectorizerModel:
    pickle.dump(vectorizer.vocabulary_, TfidfVectorizerModel)

# save trained model
with open('MultinomialNBModel.pkl','wb') as MultinomialNBModel:
    joblib.dump(model, MultinomialNBModel)

In [28]:
%%writefile score.py
# import libraries
import json, os, pickle
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# define init function
def init():
    # define global variable
    global vocab, model, trainedVectorizer, transformer

    # load machine learning model and meta data
    vocab =  pickle.load(open('TfidfVectorizerModel.pkl', 'rb'))
    model = joblib.load(open('MultinomialNBModel.pkl','rb'))

    # init stages using meta data
    trainedVectorizer = CountVectorizer(decode_error='replace',vocabulary=vocab)
    transformer = TfidfTransformer()

# define run function to execute the ml model
def run(raw_data):
    # init meta data
    init()

    # define y_hat (result) dictionary
    y_hat = dict()

    # transform feature to feature vector
    featureVector = trainedVectorizer.fit_transform([raw_data])
    featureVector_fit = transformer.fit_transform(featureVector)

    # save result into y_hat
    y_hat['prediction']  = model.predict(featureVector_fit).astype(dtype=float)[0]
    y_hat['probability'] = model.predict_proba(featureVector_fit).astype(dtype=float).tolist()

    # return JSON
    return(json.dumps(y_hat))


if __name__ == "__main__":
    test = "This is to check machine learning model"
    result = run(test)
    print("Data: {}\nResult: {}".format(test,result))

Overwriting score.py
