In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow import keras
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# --- Load, clean, encode, split, and fit the text vectorizer -----------------

SEED = 42  # fixed seed so the train/test split is identical on every run

# Category name -> numeric class id. Floats are kept so the label values match
# what the downstream cells and reports already expect (0.0 .. 3.0).
LABEL_TO_ID = {
    "Household": 0.0,
    "Books": 1.0,
    "Electronics": 2.0,
    "Clothing & Accessories": 3.0,
}

df = pd.read_csv("./ecommerceDataset.csv", header=None)
df.columns = ['label', 'text']

# Drop incomplete and duplicated rows, then encode the labels in one chained
# expression. `.assign(... .map(...))` replaces the chained
# `df["label"].loc[mask] = value` assignments, which trigger
# SettingWithCopyWarning and do not modify `df` under pandas Copy-on-Write.
# No fillna('') is needed: dropna() has already removed every missing value.
df = (
    df.dropna()
    .drop_duplicates()
    .assign(label=lambda frame: frame["label"].map(LABEL_TO_ID))
)

# Series.map yields NaN for any category missing from LABEL_TO_ID; fail loudly
# here instead of feeding NaN labels to the model later.
if df["label"].isna().any():
    raise ValueError("Found label values that are not in LABEL_TO_ID; extend the mapping.")

# Random ~80/20 split driven by a seeded generator (reproducible).
# Note: boolean masking keeps the rows in their original file order.
rng = np.random.default_rng(SEED)
is_train = rng.random(len(df)) < 0.8
train = df[is_train]
test = df[~is_train]
print("train data size: ", train.shape)
print("test data size: ", test.shape)

# Learn the vocabulary (up to 20000 tokens) from the TRAINING texts only, so no
# information from the held-out test set leaks into the preprocessing.
# Each text is later padded/truncated to 200 token ids.
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train["text"].to_numpy()).batch(128)
vectorizer.adapt(text_ds)

train data size:  (22241, 2)
test data size:  (5561, 2)


In [None]:
# Vocabulary learned by the vectorizer, and a lookup from each token to its
# position (integer id) in that vocabulary.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [None]:
from tensorflow.keras import layers

# Embedding hyperparameters.
EMBEDDING_DIM = 128          # size of each learned token vector
MAX_SEQUENCE_LENGTH = 200    # number of token ids per input sequence

# Trainable embedding table with one row per vocabulary entry plus one extra row.
embedding_layer = layers.Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
)

In [None]:
# 1D-convolutional text classifier: token ids -> embeddings -> three Conv1D
# stages -> dense head with a 4-way softmax output.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)

# Two identical Conv1D -> MaxPooling1D stages.
x = embedded_sequences
for _stage in range(2):
    x = layers.Conv1D(filters=128, kernel_size=5, activation="relu")(x)
    x = layers.MaxPooling1D(pool_size=5)(x)

# Last convolution, then collapse the sequence axis with a global max.
x = layers.Conv1D(filters=128, kernel_size=5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)

# Classification head with dropout for regularisation.
x = layers.Dense(units=128, activation="relu")(x)
x = layers.Dropout(rate=0.5)(x)
preds = layers.Dense(units=4, activation="softmax")(x)

model = keras.Model(inputs=int_sequences_input, outputs=preds)
model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_5 (Embedding)     (None, None, 128)         2560128   
                                                                 
 conv1d_15 (Conv1D)          (None, None, 128)         82048     
                                                                 
 max_pooling1d_10 (MaxPoolin  (None, None, 128)        0         
 g1D)                                                            
                                                                 
 conv1d_16 (Conv1D)          (None, None, 128)         82048     
                                                                 
 max_pooling1d_11 (MaxPoolin  (None, None, 128)        0         
 g1D)                                                      

In [None]:
# Convert each training text into a fixed-length sequence of integer token ids.
# The texts are passed as a column vector (one string per row).
x_train = vectorizer(np.array([[s] for s in train.text])).numpy()

# Numeric class ids for the training rows.
# `np.float` was only an alias for the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in 1.24 (AttributeError), so use `float` directly.
y_train = np.array(train.label, dtype=float)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_train = np.array(train.label, dtype=np.float)


In [None]:
# Configure training: integer class ids as targets, RMSprop, track accuracy.
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc"]
)

# Keras takes the `validation_split` fraction from the END of the arrays
# *before* shuffling. The training rows keep the order of the source file, so
# if that file is grouped by category the validation slice can swallow an
# entire class and the model never trains on it (the 0.00 precision/recall for
# one class in the evaluation report is consistent with this).
# Shuffle x and y together with a fixed seed so every class is represented in
# both the fitted portion and the validation portion.
shuffle_idx = np.random.default_rng(42).permutation(len(x_train))

model.fit(
    x_train[shuffle_idx],
    y_train[shuffle_idx],
    batch_size=128,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fdd66c551c0>

In [None]:
# Import only the metrics that are used; a wildcard import hides where names
# come from and can silently shadow existing notebook variables.
from sklearn.metrics import accuracy_score, classification_report

# Vectorize the held-out texts the same way as the training texts.
test_x = vectorizer(np.array([[s] for s in test.text])).numpy()

# Class probabilities per test row; the predicted class is the highest-scoring
# column (vectorized argmax instead of a per-row Python loop).
preds = model.predict(test_x)
pred_labels = np.argmax(preds, axis=1)

# `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
y_test = np.array(test.label, dtype=float)

print('accuracy score: ', accuracy_score(y_test, pred_labels))
print(classification_report(y_test, pred_labels))

accuracy score:  0.767128214349937
              precision    recall  f1-score   support

         0.0       0.69      0.97      0.81      2091
         1.0       0.83      0.93      0.88      1231
         2.0       0.00      0.00      0.00      1120
         3.0       0.88      0.97      0.92      1119

    accuracy                           0.77      5561
   macro avg       0.60      0.72      0.65      5561
weighted avg       0.62      0.77      0.68      5561



Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_test = np.array(test.label, dtype=np.float)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
