# Multi class Amazon product reviews classification using machine learning

In [1]:
# Import necessary library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

%matplotlib inline

In [2]:
# Mount google drive
from google.colab import drive
drive.mount('/content/gdrive')
# this creates a symbolic link so that now the path /content/gdrive/My\ Drive/ is equal to /mydrive
!ln -s /content/gdrive/My\ Drive/ /mydrive
!ls /mydrive/multi_class_text_classification

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
ln: failed to create symbolic link '/mydrive/My Drive': File exists
'Copy of delete now.ipynb'
 deep_learning_multi_class_text_classification.ipynb
'delete now.ipynb'
 machine_learning_multi_class_text_classification.ipynb
 text_cls
 Transformer_multi_class_text_classification.ipynb


In [3]:
# Install spacy en_core_web_lg 
!python -m spacy download en_core_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [4]:
# Read data
df = pd.read_csv('/content/gdrive/MyDrive/multi_class_text_classification/text_cls/train_40k.csv')[['Text','Cat1']]
df.head()

Unnamed: 0,Text,Cat1
0,The description and photo on this product need...,grocery gourmet food
1,This was a great book!!!! It is well thought t...,toys games
2,"I am a first year teacher, teaching 5th grade....",toys games
3,I got the book at my bookfair at school lookin...,toys games
4,Hi! I'm Martine Redman and I created this puzz...,toys games


In [5]:
df['Text'][1]

'This was a great book!!!! It is well thought through, and you can easily imagine the events happening. The Westing Game itself is a great way to tie two things together. The events are well sequenced and exciting. Ellen Raskin wrote a wonderful book'

In [6]:
df.shape

(40000, 2)

In [7]:
# Drop duplicates
df.drop_duplicates(inplace=True)
df.shape

(39512, 2)

In [8]:
df['Cat1'].value_counts()

toys games              10246
health personal care     9524
beauty                   5722
baby products            5602
pet supplies             4820
grocery gourmet food     3598
Name: Cat1, dtype: int64

In [9]:
# Balancing all classes
df = df.groupby('Cat1').sample(n=3598)
df.shape

(21588, 2)

In [10]:
# load en_core_web_lg
import spacy
nlp = spacy.load('en_core_web_lg')

In [11]:
def tokenizer(sentence):
  """
  Tokenize the sentence and remove stop words and punctuation 
  """
  doc = nlp(sentence)

  tokens = []
  for token in doc:
      if token.lemma_ != "-PRON-":
          temp = token.lemma_.lower().strip()
      else:
          temp = token.lower_
      tokens.append(temp)

  cleaned_tokens = []
  for token in tokens:
      if token not in STOP_WORDS and token not in punctuation:
          cleaned_tokens.append(token)
  return cleaned_tokens    

In [12]:
tokenizer('This was a great book!!!! It is well thought through, and you can easily imagine the events happening. The Westing Game itself is a great way to tie two things together. The events are well sequenced and exciting. Ellen Raskin wrote a wonderful book')

['great',
 'book',
 'think',
 'easily',
 'imagine',
 'event',
 'happen',
 'westing',
 'game',
 'great',
 'way',
 'tie',
 'thing',
 'event',
 'sequenced',
 'exciting',
 'ellen',
 'raskin',
 'write',
 'wonderful',
 'book']

In [13]:
# Train test split of data
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Cat1'], test_size = 0.2, random_state=300)
X_train.shape, X_test.shape

((17270,), (4318,))

## Model training

In [14]:
# Initialize the ML models
tfidf = TfidfVectorizer(tokenizer = tokenizer, max_features= 20000)
svc_classifier = SVC()
nb_classifier = MultinomialNB()
xgb_classifier = XGBClassifier()

In [15]:
# tfidf transform
X_train_vector=tfidf.fit_transform(X_train)
X_test_vector = tfidf.transform(X_test)

In [16]:
svc_classifier.fit(X_train_vector,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [17]:
nb_classifier.fit(X_train_vector,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
xgb_classifier.fit(X_train_vector,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

## Model evaluation

In [19]:
# Prediction on test values
y_pred_svc = svc_classifier.predict(X_test_vector)
y_pred_nb = nb_classifier.predict(X_test_vector)
y_pred_xgb = xgb_classifier.predict(X_test_vector)

In [20]:
print(classification_report(y_test, y_pred_svc))

                      precision    recall  f1-score   support

       baby products       0.89      0.84      0.86       680
              beauty       0.84      0.85      0.85       714
grocery gourmet food       0.88      0.90      0.89       769
health personal care       0.69      0.79      0.73       727
        pet supplies       0.94      0.85      0.89       691
          toys games       0.88      0.86      0.87       737

            accuracy                           0.85      4318
           macro avg       0.85      0.85      0.85      4318
        weighted avg       0.85      0.85      0.85      4318



In [21]:
print(classification_report(y_test, y_pred_nb))

                      precision    recall  f1-score   support

       baby products       0.75      0.90      0.82       680
              beauty       0.81      0.88      0.85       714
grocery gourmet food       0.89      0.90      0.90       769
health personal care       0.84      0.62      0.71       727
        pet supplies       0.88      0.88      0.88       691
          toys games       0.86      0.85      0.86       737

            accuracy                           0.84      4318
           macro avg       0.84      0.84      0.84      4318
        weighted avg       0.84      0.84      0.84      4318



In [22]:
print(classification_report(y_test, y_pred_xgb))

                      precision    recall  f1-score   support

       baby products       0.82      0.74      0.78       680
              beauty       0.82      0.74      0.78       714
grocery gourmet food       0.84      0.76      0.80       769
health personal care       0.48      0.73      0.58       727
        pet supplies       0.91      0.77      0.84       691
          toys games       0.82      0.75      0.79       737

            accuracy                           0.75      4318
           macro avg       0.78      0.75      0.76      4318
        weighted avg       0.78      0.75      0.76      4318



In [23]:
confusion_matrix(y_test, y_pred_svc)

array([[570,  15,   5,  50,   7,  33],
       [  2, 608,  16,  77,   2,   9],
       [  5,  11, 690,  46,   7,  10],
       [ 17,  60,  53, 572,  10,  15],
       [  7,  22,  10,  46, 586,  20],
       [ 37,   7,   7,  42,  11, 633]])

In [24]:
confusion_matrix(y_test, y_pred_nb)

array([[615,  15,   2,   6,   8,  34],
       [ 22, 627,   7,  38,   9,  11],
       [ 12,  15, 693,  22,  13,  14],
       [ 74,  88,  66, 450,  29,  20],
       [ 29,  17,   8,  11, 605,  21],
       [ 70,   8,   2,   6,  21, 630]])

In [25]:
confusion_matrix(y_test, y_pred_xgb)

array([[506,  13,   9,  93,   8,  51],
       [ 10, 527,  15, 150,   3,   9],
       [  9,   8, 588, 134,  11,  19],
       [ 29,  67,  65, 529,  17,  20],
       [  9,  16,  13,  96, 533,  24],
       [ 51,  13,  10,  96,  11, 556]])

##Hyper Parameter tunning

In [28]:
# Hyper Parameter tunning of SVC
from sklearn.model_selection import GridSearchCV

hp = {'C':[1,10,100],
         'gamma':[0.001,0.01,0.1,1.0,10],
         'kernel':['rbf','linear']
        }

gd = GridSearchCV(estimator=svc_classifier,param_grid=hp,verbose=True, n_jobs=-1)
gd.fit(X_train_vector,y_train)
print(gd.best_score_)
print(gd.best_estimator_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 33.2min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 93.9min finished


0.83329473074696
SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1.0, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [31]:
tuned_svc_classifier = SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1.0, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
tuned_svc_classifier.fit(X_train_vector,y_train)
y_pred_tuned_svc = tuned_svc_classifier.predict(X_test_vector)

In [32]:
print(classification_report(y_test, y_pred_tuned_svc))

                      precision    recall  f1-score   support

       baby products       0.87      0.85      0.86       680
              beauty       0.84      0.86      0.85       714
grocery gourmet food       0.88      0.90      0.89       769
health personal care       0.72      0.77      0.75       727
        pet supplies       0.93      0.86      0.89       691
          toys games       0.89      0.86      0.87       737

            accuracy                           0.85      4318
           macro avg       0.85      0.85      0.85      4318
        weighted avg       0.85      0.85      0.85      4318



In [33]:
confusion_matrix(y_test, y_pred_tuned_svc)

array([[578,  13,   7,  43,   7,  32],
       [  6, 615,  15,  67,   2,   9],
       [  6,  11, 691,  42,  10,   9],
       [ 22,  66,  51, 561,  14,  13],
       [  8,  20,  12,  38, 594,  19],
       [ 44,   8,   9,  28,  13, 635]])