# README

The aim of this project is to classify e-commerce product descriptions into 4 distinct categories (Household, Books, Electronics, and Clothing & Accessories). The dataset required for this project can be accessed at https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification?select=ecommerceDataset.csv. Several text-representation approaches are compared in this project, each feeding the same SVM classifier, to demonstrate how the choice of features affects accuracy:
- Bag of words with SVM
- Bag of words and Lemmatization with SVM
- N-grams with SVM
- TF-IDF with SVM

TF-IDF produces the best results with 98% accuracy. Plain bag of words yields 96%. Adding lemmatization (keeping only nouns) lowers accuracy to 85%. N-grams do not improve on the bag-of-words result (95%), but take much longer to train.

# Analyze data

In [1]:
# general modules
import pandas as pd
import os

# sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn import svm

# spacy
import spacy
# Load the `en_core_web_sm` pipeline once; `nlp` is called on every description
# by the lemmatization function further down.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Read the CSV without treating its first row as a header, then give
# columns 0 and 1 readable names.
df = pd.read_csv('ecommerceDataset.csv', header=None).rename(
    columns={0: 'Label', 1: 'Text'}
)

In [3]:
# Count how many rows carry each label.
df.Label.value_counts()

Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: Label, dtype: int64

In [4]:
# Show the row labelled 0 as a quick sanity check of both columns.
first_label = df.loc[0, 'Label']
first_text = df.loc[0, 'Text']
print(f"Label: {first_label}")
print(f"Text: {first_text}")

Label: Household
Text: Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and

In [5]:
# Report missing descriptions before the clean-up ...
n_missing = df['Text'].isna().sum()
print(f"Number of Null values in the text is {n_missing}")

# ... remove every row that has a missing value ...
df.dropna(inplace=True)

# ... and confirm that none are left.
n_missing = df['Text'].isna().sum()
print(f"Number of Null values in the text is {n_missing}")

Number of Null values in the text is 1
Number of Null values in the text is 0


In [8]:
# Turn the string labels into integer class ids; `LE` is kept so the ids
# can be mapped back to label names later.
LE = LabelEncoder().fit(df['Label'])
y = LE.transform(df['Label'])
print(y.shape, y[0], sep="\n")

(50424,)
3


# Bag of words with SVM

In [7]:
%%time
# Bag-of-words counts on the raw descriptions: lower-cased, English stop words
# removed, and terms occurring in more than half of the documents ignored.
vectorizer = CountVectorizer(max_df=0.5, stop_words='english', lowercase=True)
X = vectorizer.fit_transform(df['Text'])

# The vocabulary size must equal the number of columns of X.
names = vectorizer.get_feature_names_out()
print(X.shape, len(names), sep="\n")

(50424, 78571)
78571
CPU times: user 3.66 s, sys: 47 ms, total: 3.71 s
Wall time: 3.8 s


In [8]:
%%time
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

# Fit a support vector classifier with its default settings on the training part.
clf = svm.SVC()
clf.fit(X_train, y_train);

(40339, 78571)
(40339,)
(10085, 78571)
(10085,)
CPU times: user 9min 5s, sys: 1.31 s, total: 9min 6s
Wall time: 9min 7s


SVC()

In [9]:
# Score the held-out rows: per-class metrics first, then overall accuracy.
y_pred = clf.predict(X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      2327
           1       0.97      0.97      0.97      1702
           2       0.98      0.93      0.95      2119
           3       0.96      0.96      0.96      3937

    accuracy                           0.96     10085
   macro avg       0.96      0.96      0.96     10085
weighted avg       0.96      0.96      0.96     10085

Accuracy: 0.9565691621219633


# Bag of words + Lemmatization with SVM 

In [10]:
# Lemmatization helper: keeps only the nouns that are not stop words.
def lemmatize_stop_punct(text):
    """Return the lower-cased lemmas of the non-stop-word nouns in `text`.

    `text` is run through the module-level `nlp` pipeline. Every kept lemma is
    followed by a single space, so a non-empty result ends with a trailing
    space and a text without any qualifying token yields "".
    """
    kept_lemmas = [
        token.lemma_
        for token in nlp(text)
        if token.pos_ == 'NOUN' and not token.is_stop
    ]
    return "".join(lemma + " " for lemma in kept_lemmas).lower()

In [11]:
%%time
# Lemmatizing the whole corpus is slow, so the result is cached on disk and
# reloaded on later runs instead of being recomputed.
filename = 'lemmatized_data.csv'

if not os.path.isfile(filename):
    df['Text_lemma'] = df['Text'].apply(lemmatize_stop_punct)
    df['Text_lemma'].to_csv(filename)
else:
    # index_col=0 restores the saved row labels so the assignment aligns with df.
    # keep_default_na=False keeps descriptions that lemmatized to "" as empty
    # strings instead of NaN, which the vectorizer cannot process.
    cached = pd.read_csv(
        filename,
        index_col=0,
        keep_default_na=False,
        dtype={'Text_lemma': str},
    )
    df['Text_lemma'] = cached['Text_lemma']

# show table
df['Text_lemma']

CPU times: user 14min 49s, sys: 1.05 s, total: 14min 50s
Wall time: 14min 52s


0        x inch painting frame uv print effect attract ...
1        inch inch uv painting frame uv print effect at...
2        uv painting cm cm cm color | painting action s...
3        inch inch uv | cm painting action skill paint ...
4        x size size picture text rest life size plaque...
                               ...                        
50420    mobile life gadget picture music video lot spa...
50421                                                     
50422    w4 power phone pocket mb ram phone operating s...
50423    sm phone pocket price feature need trust glitc...
50424                                                 win 
Name: Text_lemma, Length: 50424, dtype: object

In [12]:
%%time
# Bag-of-words counts on the lemmatized descriptions, with the same
# vectorizer settings as for the raw text.
vectorizer = CountVectorizer(max_df=0.5, stop_words='english', lowercase=True)
X = vectorizer.fit_transform(df['Text_lemma'])

# The vocabulary size must equal the number of columns of X.
names = vectorizer.get_feature_names_out()
print(X.shape, len(names), sep="\n")

(50424, 24070)
24070
CPU times: user 1.32 s, sys: 0 ns, total: 1.32 s
Wall time: 1.35 s


In [13]:
%%time
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

# Fit a support vector classifier with its default settings on the training part.
clf = svm.SVC()
clf.fit(X_train, y_train);

(40339, 24070)
(40339,)
(10085, 24070)
(10085,)
CPU times: user 3min 30s, sys: 622 ms, total: 3min 30s
Wall time: 3min 31s


SVC()

In [14]:
# Score the held-out rows: per-class metrics first, then overall accuracy.
y_pred = clf.predict(X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.68      0.95      0.80      2327
           1       0.96      0.78      0.86      1702
           2       0.96      0.79      0.87      2119
           3       0.91      0.86      0.88      3937

    accuracy                           0.85     10085
   macro avg       0.88      0.84      0.85     10085
weighted avg       0.88      0.85      0.86     10085

Accuracy: 0.852156668319286


# N-grams with SVM

In [18]:
%%time
# Count unigrams, bigrams and trigrams of the raw descriptions; the other
# vectorizer settings match the plain bag-of-words run.
vectorizer = CountVectorizer(
    max_df=0.5,
    stop_words='english',
    lowercase=True,
    ngram_range=(1, 3),
)
X = vectorizer.fit_transform(df['Text'])

# The vocabulary size must equal the number of columns of X.
names = vectorizer.get_feature_names_out()
print(X.shape, len(names), sep="\n")

(50424, 2345589)
2345589
CPU times: user 17.5 s, sys: 492 ms, total: 18 s
Wall time: 17.9 s


In [19]:
%%time
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

# Fit a support vector classifier with its default settings on the training part.
clf = svm.SVC()
clf.fit(X_train, y_train);

(40339, 2345589)
(40339,)
(10085, 2345589)
(10085,)
CPU times: user 28min 26s, sys: 3.56 s, total: 28min 30s
Wall time: 28min 34s


SVC()

In [20]:
# Score the held-out rows: per-class metrics first, then overall accuracy.
y_pred = clf.predict(X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.91      0.96      0.93      2327
           1       0.98      0.96      0.97      1702
           2       0.98      0.93      0.95      2119
           3       0.96      0.96      0.96      3937

    accuracy                           0.95     10085
   macro avg       0.96      0.95      0.95     10085
weighted avg       0.96      0.95      0.95     10085

Accuracy: 0.954486861675756


# TF-IDF with SVM

In [11]:
%%time
# Create a TF-IDF vectorizer. The vocabulary is capped at 30,000 terms so that
# this cell reproduces the recorded (50424, 30000) matrix that the reported
# TF-IDF accuracy was measured on.
tfidf = TfidfVectorizer(max_features=30000, lowercase=True)

X = tfidf.fit_transform(df['Text'])
print(X.shape)

(50424, 30000)
CPU times: total: 406 ms
Wall time: 3.15 s


In [12]:
%%time
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

# Fit a support vector classifier with its default settings on the training part.
clf = svm.SVC()
clf.fit(X_train, y_train);

(40339, 30000)
(40339,)
(10085, 30000)
(10085,)
CPU times: total: 44.6 s
Wall time: 12min 4s


In [14]:
# Score the held-out rows: per-class metrics first, then overall accuracy.
y_pred = clf.predict(X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      2327
           1       0.98      0.98      0.98      1702
           2       0.98      0.96      0.97      2119
           3       0.97      0.99      0.98      3937

    accuracy                           0.98     10085
   macro avg       0.98      0.98      0.98     10085
weighted avg       0.98      0.98      0.98     10085

Accuracy: 0.9768963807635102
