In [2]:
dataset='/kaggle/input/ecomercedataset/ecommerceDataset.csv'

In [3]:
import pandas as pd

data = pd.read_csv(dataset, header=None, names=['Label', 'Description'])

class_counts = data['Label'].value_counts()
print("Class counts:\n", class_counts)


Class counts:
 Label
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64


In [4]:
print (data.head(5))


       Label                                        Description
0  Household  Paper Plane Design Framed Wall Hanging Motivat...
1  Household  SAF 'Floral' Framed Painting (Wood, 30 inch x ...
2  Household  SAF 'UV Textured Modern Art Print Framed' Pain...
3  Household  SAF Flower Print Framed Painting (Synthetic, 1...
4  Household  Incredible Gifts India Wooden Happy Birthday U...


In [5]:
# Data Cleaning
import re
import nltk
from nltk.corpus import stopwords

In [6]:
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def clean_text(text):
    # Check if text is a string
    if not isinstance(text, str):
        return ""
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply the clean_text function
data['Description'] = data['Description'].apply(clean_text)

In [8]:
from sklearn.model_selection import train_test_split

# Perform a stratified split (70-30)
train_data, test_data = train_test_split(
    data, test_size=0.3, stratify=data['Label'], random_state=42
)

# Separate texts and labels
train_texts = train_data['Description'].values
train_labels = train_data['Label'].values
test_texts = test_data['Description'].values
test_labels = test_data['Label'].values

In [9]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

encoder = LabelEncoder()
train_labels = encoder.fit_transform(train_labels)
test_labels = encoder.transform(test_labels)

# Convert to categorical format
train_labels = to_categorical(train_labels)
test_labels = to_categorical(test_labels)

In [9]:
# 1 TextVectorization with one-gram multi_hot encoding

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.optimizers import Adam

vectorizer = TextVectorization(max_tokens=10000, output_mode='multi_hot', ngrams=1)
vectorizer.adapt(train_texts)

model = Sequential([
    vectorizer,
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(4, activation='softmax')
])

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model with 10% validation split
history = model.fit(train_texts, train_labels, epochs=5, validation_split=0.1)

Epoch 1/5
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.8692 - loss: 0.4647 - val_accuracy: 0.9578 - val_loss: 0.1696
Epoch 2/5
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9840 - loss: 0.0679 - val_accuracy: 0.9640 - val_loss: 0.1549
Epoch 3/5
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9938 - loss: 0.0277 - val_accuracy: 0.9649 - val_loss: 0.1713
Epoch 4/5
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9970 - loss: 0.0153 - val_accuracy: 0.9694 - val_loss: 0.1779
Epoch 5/5
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9977 - loss: 0.0107 - val_accuracy: 0.9640 - val_loss: 0.1999


In [10]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

val_predictions = model.predict(val_texts)

# Convert predictions to the class with the highest probability
val_predictions = val_predictions.argmax(axis=1)

# Get the true labels from the validation set (convert from one-hot encoding to class indices)
val_true_labels = val_labels.argmax(axis=1)

report = classification_report(val_true_labels, val_predictions, target_names=['Class 1', 'Class 2', 'Class 3', 'Class 4'], output_dict=True)


report_df = pd.DataFrame(report).transpose()

print(report_df)

[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
              precision    recall  f1-score      support
Class 1        0.987995  0.996368  0.992164   826.000000
Class 2        1.000000  0.993232  0.996604   591.000000
Class 3        0.993532  0.994819  0.994175   772.000000
Class 4        0.995512  0.992543  0.994025  1341.000000
accuracy       0.994051  0.994051  0.994051     0.994051
macro avg      0.994260  0.994240  0.994242  3530.000000
weighted avg   0.994072  0.994051  0.994054  3530.000000


In [11]:
# 2. TextVectorization with two-gram multi_hot encoding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.optimizers import Adam


vectorizer = TextVectorization(max_tokens=10000, output_mode='multi_hot', ngrams=2)
vectorizer.adapt(train_texts)

model = Sequential([
    vectorizer,
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(4, activation='softmax')
])


model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(train_texts, train_labels, epochs=6, validation_split=0.1)

Epoch 1/6
[1m894/894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.8871 - loss: 0.4882 - val_accuracy: 0.9669 - val_loss: 0.1300
Epoch 2/6
[1m894/894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9827 - loss: 0.0712 - val_accuracy: 0.9704 - val_loss: 0.1087
Epoch 3/6
[1m894/894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9922 - loss: 0.0308 - val_accuracy: 0.9748 - val_loss: 0.1156
Epoch 4/6
[1m894/894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9962 - loss: 0.0162 - val_accuracy: 0.9745 - val_loss: 0.1288
Epoch 5/6
[1m894/894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9962 - loss: 0.0130 - val_accuracy: 0.9751 - val_loss: 0.1431
Epoch 6/6
[1m894/894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9977 - loss: 0.0096 - val_accuracy: 0.9748 - val_loss: 0.1420


In [12]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

val_predictions = model.predict(val_texts)

# Convert predictions to the class with the highest probability
val_predictions = val_predictions.argmax(axis=1)

# Get the true labels from the validation set (convert from one-hot encoding to class indices)
val_true_labels = val_labels.argmax(axis=1)

report = classification_report(val_true_labels, val_predictions, target_names=['Class 1', 'Class 2', 'Class 3', 'Class 4'], output_dict=True)


report_df = pd.DataFrame(report).transpose()

print(report_df)

[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
              precision    recall  f1-score      support
Class 1        0.995896  0.994536  0.995215   732.000000
Class 2        0.998172  0.998172  0.998172   547.000000
Class 3        0.997006  0.995516  0.996260   669.000000
Class 4        0.996751  0.998373  0.997561  1229.000000
accuracy       0.996852  0.996852  0.996852     0.996852
macro avg      0.996956  0.996649  0.996802  3177.000000
weighted avg   0.996852  0.996852  0.996852  3177.000000


In [13]:
#3. with two-gram tf_idf encoding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.optimizers import Adam

vectorizer = TextVectorization(max_tokens=10000, output_mode='tf_idf', ngrams=2)
vectorizer.adapt(train_texts)


model = Sequential([
    vectorizer,
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(4, activation='softmax')
])

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])


history = model.fit(train_texts, train_labels, epochs=5, validation_split=0.1)

Epoch 1/5
[1m805/805[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - accuracy: 0.8515 - loss: 0.6661 - val_accuracy: 0.9619 - val_loss: 0.2129
Epoch 2/5
[1m805/805[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9800 - loss: 0.0951 - val_accuracy: 0.9671 - val_loss: 0.1884
Epoch 3/5
[1m805/805[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9890 - loss: 0.0526 - val_accuracy: 0.9689 - val_loss: 0.2421
Epoch 4/5
[1m805/805[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9932 - loss: 0.0312 - val_accuracy: 0.9678 - val_loss: 0.2091
Epoch 5/5
[1m805/805[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9936 - loss: 0.0251 - val_accuracy: 0.9717 - val_loss: 0.2211


In [14]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

val_predictions = model.predict(val_texts)

# Convert predictions to the class with the highest probability
val_predictions = val_predictions.argmax(axis=1)

# Get the true labels from the validation set (convert from one-hot encoding to class indices)
val_true_labels = val_labels.argmax(axis=1)

report = classification_report(val_true_labels, val_predictions, target_names=['Class 1', 'Class 2', 'Class 3', 'Class 4'], output_dict=True)


report_df = pd.DataFrame(report).transpose()

print(report_df)

[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
              precision    recall  f1-score      support
Class 1        0.993701  0.996840  0.995268   633.000000
Class 2        1.000000  1.000000  1.000000   463.000000
Class 3        0.992212  0.990669  0.991440   643.000000
Class 4        0.994638  0.993750  0.994194  1120.000000
accuracy       0.994753  0.994753  0.994753     0.994753
macro avg      0.995138  0.995315  0.995225  2859.000000
weighted avg   0.994753  0.994753  0.994753  2859.000000


In [15]:
# 4. with max length 200, max tokens = 10000, and output mode = 'int'

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, TextVectorization
from tensorflow.keras.optimizers import Adam

vectorizer = TextVectorization(
    max_tokens=10000,
    output_mode='int',
    output_sequence_length=200
)

vectorizer.adapt(train_texts)

model = Sequential([
    vectorizer,
    Embedding(input_dim=10000, output_dim=32),
    LSTM(32),
    Dense(16, activation='relu'),
    Dense(4, activation='softmax')
])


model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])


history = model.fit(train_texts, train_labels, epochs=8, validation_split=0.1)

Epoch 1/8
[1m724/724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.4243 - loss: 1.2872 - val_accuracy: 0.5136 - val_loss: 1.0915
Epoch 2/8
[1m724/724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.5324 - loss: 1.0832 - val_accuracy: 0.3986 - val_loss: 1.3154
Epoch 3/8
[1m724/724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.4070 - loss: 1.3045 - val_accuracy: 0.5451 - val_loss: 1.0417
Epoch 4/8
[1m724/724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.6208 - loss: 0.9182 - val_accuracy: 0.7339 - val_loss: 0.6597
Epoch 5/8
[1m724/724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.7986 - loss: 0.5724 - val_accuracy: 0.8815 - val_loss: 0.4167
Epoch 6/8
[1m724/724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.9233 - loss: 0.2961 - val_accuracy: 0.9324 - val_loss: 0.2827
Epoch 7/8
[1m724/724[0m 

In [16]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

val_predictions = model.predict(val_texts)

# Convert predictions to the class with the highest probability
val_predictions = val_predictions.argmax(axis=1)

# Get the true labels from the validation set (convert from one-hot encoding to class indices)
val_true_labels = val_labels.argmax(axis=1)

report = classification_report(val_true_labels, val_predictions, target_names=['Class 1', 'Class 2', 'Class 3', 'Class 4'], output_dict=True)


report_df = pd.DataFrame(report).transpose()

print(report_df)

[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
              precision    recall  f1-score      support
Class 1        0.968852  0.970443  0.969647   609.000000
Class 2        0.974239  0.951945  0.962963   437.000000
Class 3        0.879004  0.974359  0.924228   507.000000
Class 4        0.986667  0.942214  0.963928  1021.000000
accuracy       0.956876  0.956876  0.956876     0.956876
macro avg      0.952190  0.959740  0.955192  2574.000000
weighted avg   0.959136  0.956876  0.957298  2574.000000


In [18]:
# 5. using GloVe
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, TextVectorization
from tensorflow.keras.optimizers import Adam

vectorizer = TextVectorization(
    max_tokens=10000,
    output_mode='int',
    output_sequence_length=200
)

vectorizer.adapt(train_texts)

def load_glove_embeddings(glove_file, embedding_dim=100):
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_file = '/kaggle/input/glovedataset/glove.6B.100d.txt'
embedding_dim = 100

embeddings_index = load_glove_embeddings(glove_file, embedding_dim)

embedding_matrix = np.zeros((10000, embedding_dim))
for i in range(10000):
    word = vectorizer.get_vocabulary()[i]
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

model = Sequential([
    vectorizer,
    Embedding(input_dim=10000, output_dim=embedding_dim,
              weights=[embedding_matrix], input_length=200, trainable=False),
    LSTM(32),
    Dense(16, activation='relu'),
    Dense(4, activation='softmax')
])

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(train_texts, train_labels, epochs=5, validation_split=0.1)

Epoch 1/5




[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.5038 - loss: 1.1608 - val_accuracy: 0.5898 - val_loss: 0.9516
Epoch 2/5
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.5950 - loss: 0.9889 - val_accuracy: 0.6274 - val_loss: 0.8823
Epoch 3/5
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.6161 - loss: 0.9415 - val_accuracy: 0.3985 - val_loss: 1.3119
Epoch 4/5
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.4145 - loss: 1.2933 - val_accuracy: 0.4629 - val_loss: 1.2099
Epoch 5/5
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.4774 - loss: 1.1747 - val_accuracy: 0.4028 - val_loss: 1.2908


In [19]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

val_predictions = model.predict(val_texts)

# Convert predictions to the class with the highest probability
val_predictions = val_predictions.argmax(axis=1)

# Get the true labels from the validation set (convert from one-hot encoding to class indices)
val_true_labels = val_labels.argmax(axis=1)

report = classification_report(val_true_labels, val_predictions, target_names=['Class 1', 'Class 2', 'Class 3', 'Class 4'], output_dict=True)


report_df = pd.DataFrame(report).transpose()

print(report_df)

[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step
              precision    recall  f1-score      support
Class 1        0.754386  0.078755  0.142620   546.000000
Class 2        0.562500  0.042353  0.078775   425.000000
Class 3        0.596774  0.078390  0.138577   472.000000
Class 4        0.392610  0.973654  0.559579   873.000000
accuracy       0.409326  0.409326  0.409326     0.409326
macro avg      0.576567  0.293288  0.229888  2316.000000
weighted avg   0.550683  0.409326  0.287250  2316.000000


In [None]:
 #6 with FastText
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, TextVectorization
from tensorflow.keras.optimizers import Adam

vectorizer = TextVectorization(
    max_tokens=10000,
    output_mode='int',
    output_sequence_length=200
)

vectorizer.adapt(train_texts)

def load_fasttext_embeddings(fasttext_file, embedding_dim=100):
    embeddings_index = {}
    with open(fasttext_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

fasttext_file = '/kaggle/input/wikinews300d1msubwordvec/wiki-news-300d-1M-subword.vec'
embedding_dim = 300

embeddings_index = load_fasttext_embeddings(fasttext_file, embedding_dim)

embedding_matrix = np.zeros((10000, embedding_dim))
for i in range(10000):
    word = vectorizer.get_vocabulary()[i]
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

model = Sequential([
    vectorizer,
    Embedding(input_dim=10000, output_dim=embedding_dim,
              weights=[embedding_matrix], input_length=200, trainable=False),
    LSTM(32),
    Dense(16, activation='relu'),
    Dense(4, activation='softmax')
])

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(train_texts, train_labels, epochs=5, validation_split=0.1)


In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

val_predictions = model.predict(val_texts)

# Convert predictions to the class with the highest probability
val_predictions = val_predictions.argmax(axis=1)

# Get the true labels from the validation set (convert from one-hot encoding to class indices)
val_true_labels = val_labels.argmax(axis=1)

report = classification_report(val_true_labels, val_predictions, target_names=['Class 1', 'Class 2', 'Class 3', 'Class 4'], output_dict=True)


report_df = pd.DataFrame(report).transpose()

print(report_df)
