In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Load the train and test splits of the review dataset.
# lines=True tells pandas to read each file as JSON Lines (one JSON object per line).
initial_train_data = pd.read_json("~/SageMaker/data/dataset_en_train.json", lines=True)
initial_test_data = pd.read_json("~/SageMaker/data/dataset_en_test.json", lines=True)

Reference — scikit-learn tutorial "Working With Text Data": https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [3]:
# Summary statistics of the training data; `stars` is the only numeric column,
# so it is the only one that appears in the result.
initial_train_data.describe()

Unnamed: 0,stars
count,200000.0
mean,3.0
std,1.414217
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,5.0


In [4]:
# Column names, non-null counts and dtypes of the training data.
initial_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   review_id         200000 non-null  object
 1   product_id        200000 non-null  object
 2   reviewer_id       200000 non-null  object
 3   stars             200000 non-null  int64 
 4   review_body       200000 non-null  object
 5   review_title      200000 non-null  object
 6   language          200000 non-null  object
 7   product_category  200000 non-null  object
dtypes: int64(1), object(7)
memory usage: 12.2+ MB


In [5]:
# Count missing values per column; every count is zero for the training data.
initial_train_data.isna().sum()

review_id           0
product_id          0
reviewer_id         0
stars               0
review_body         0
review_title        0
language            0
product_category    0
dtype: int64

In [6]:
# Check whether any language other than English appears in either split.
# A notebook cell only displays the value of its last expression, so the two
# counts are returned together as a (train, test) tuple; as separate statements
# the train-set result would be computed and then silently discarded.
initial_train_data['language'].nunique(), initial_test_data['language'].nunique()

1

In [10]:
# Assumption: the id columns, the language and the product category are not
# needed to predict the star rating. Rather than dropping those columns one by
# one, select only the review text and the rating.
initial_training_data = initial_train_data.loc[:, ['review_body', 'stars']]
initial_training_data.head()

Unnamed: 0,review_body,stars
0,Arrived broken. Manufacturer defect. Two of th...,1
1,the cabinet dot were all detached from backing...,1
2,I received my first order of this product and ...,1
3,This product is a piece of shit. Do not buy. D...,1
4,went through 3 in one day doesn't fit correct ...,1


In [11]:
# Trim the test frame as well, keeping the review text, the star rating and
# the review title.
initial_test_data = initial_test_data.loc[:, ['review_body', 'stars', 'review_title']]
# Peek at the review stored under row label 1 as a quick sanity check.
initial_test_data.loc[1, 'review_body']

'I bought 4 and NONE of them worked. Yes I used new batteries!'

In [25]:
# Split the training frame into target and features: `stars` is the label to
# predict, so it is pulled out as y and removed from X.
y = initial_train_data['stars']
X = initial_train_data.drop(columns=['stars'])

# Apply the same target/feature split to the test frame.
y_test = initial_test_data['stars']
X_test = initial_test_data.drop(columns=['stars'])

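Background reading on whether 1-to-5 star ratings should be modelled as classification or regression: https://towardsdatascience.com/1-to-5-star-ratings-classification-or-regression-b0462708a4df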

In [26]:
# Word vectorization: create a bag-of-words vectorizer with default settings.
# It is fitted on the training reviews in the next cell.
count_vect = CountVectorizer()

In [27]:
# Learn the vocabulary from the training reviews and build the matching
# document-term count matrix in one pass.
train_reviews = X['review_body']
X_train_counts = count_vect.fit_transform(train_reviews)
X_train_counts.shape

(200000, 42677)

In [28]:
# Fit the TF-IDF weighting on the training counts, then re-weight those same
# counts with it. `fit` returns the transformer itself, so the fitted instance
# is what ends up bound to `tfidf_transformer`.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(200000, 42677)

In [29]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the TF-IDF features, using the
# star ratings as labels. The trailing semicolon keeps the cell output empty.
model = MultinomialNB()
model.fit(X_train_tfidf, y);

In [30]:
# Build the test features with the vectorizer and TF-IDF transformer that were
# fitted on the training data. Only `transform` is called here (no re-fitting),
# so the test matrix has the same number of feature columns as the training one.
X_test_counts = count_vect.transform(X_test['review_body'])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
X_test_tfidf.shape

(5000, 42677)

In [31]:
import pickle

# Persist the fitted classifier to disk. The context manager guarantees that
# the file handle is flushed and closed, even if pickling raises; a bare
# open() left the handle open for the lifetime of the kernel.
# NOTE(review): only the classifier is saved. Predicting on new text also needs
# the fitted `count_vect` and `tfidf_transformer` — confirm whether they should
# be persisted alongside it.
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [32]:
# Predict a star rating for every test review from its TF-IDF features.
predicted = model.predict(X_test_tfidf)

In [33]:
# Accuracy: the fraction of test reviews whose predicted star rating exactly
# matches the true rating.
is_correct = predicted == y_test
is_correct.mean()

0.4766