## E-Commerce website - product categorization

In [2]:
import pandas as pd #for data manipulation

df = pd.read_csv("/content/drive/MyDrive/4.Data Science/Extra Learning/Data Science - Codebasics/9. Natural Language Processing (NLP)/18. Text Representation Using TF-IDF/Ecommerce product sale/Ecommerce_data.csv")
print(df.shape)
df.head(5)

(24000, 2)


Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [3]:
df.label.value_counts() #to check whether the dataset is balanced

Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: label, dtype: int64

Therefore, this is a balanced dataset.

In [5]:
df['label_num'] = df.label.map({
    'Household' : 0,
    'Books': 1,
    'Electronics': 2,
    'Clothing & Accessories': 3
}) # mapping labels with numbers. Because ML understands only numbers

df.head()

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3


In [9]:
#splitting train and test datasets

from sklearn.model_selection import train_test_split

X = df.Text
y = df.label_num

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2024,stratify=df.label_num)


In [11]:
print(X_train.shape)
print(X_test.shape)

(19200,)
(4800,)


In [12]:
y_train.value_counts()

3    4800
2    4800
1    4800
0    4800
Name: label_num, dtype: int64

In [13]:
y_test.value_counts()

1    1200
0    1200
2    1200
3    1200
Name: label_num, dtype: int64

In [14]:
# using KNN algorithm to train a ML model

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

clf = Pipeline([
    ('vectorizer_tfidf',TfidfVectorizer()),
    ('KNN',KNeighborsClassifier())
])

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))



              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1200
           1       0.97      0.95      0.96      1200
           2       0.96      0.97      0.96      1200
           3       0.98      0.98      0.98      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [15]:
X_test[:5]

1814                                 Microsoft Excel 2010 
9790     Decals Design 'Tree with Birds and Cages' Wall...
1367     Tizum TMGOG Portable EVA Universal Electronic ...
16717        Allen Solly Men's Polo Allen Solly Men's Polo
8120     The Heartfulness Way: Heart-Based Meditations ...
Name: Text, dtype: object

In [16]:
y_test[:5]

1814     1
9790     0
1367     2
16717    3
8120     1
Name: label_num, dtype: int64

In [17]:
y_pred[:5]

array([1, 0, 2, 3, 1])

In [18]:
# using naive bayes algorithm to train a ML model

from sklearn.naive_bayes import MultinomialNB

clf = Pipeline([
    ('vectorizer_tfidf',TfidfVectorizer()),
    ('Multi_nb',MultinomialNB())
])

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95      1200
           1       0.98      0.92      0.95      1200
           2       0.96      0.97      0.97      1200
           3       0.98      0.98      0.98      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [19]:
# using random forest algorithm to train a ML model

from sklearn.ensemble import RandomForestClassifier

clf = Pipeline([
    ('vectorizer_tfidf',TfidfVectorizer()),
    ('random_forest',RandomForestClassifier())
])

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1200
           1       0.97      0.97      0.97      1200
           2       0.99      0.96      0.97      1200
           3       0.98      0.98      0.98      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



We could identify that Random forest performs better than other 2 algorithms.

In [20]:
#pre processing of dataset

import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
  #removing stop words and lemmatization
  doc = nlp(text)
  filtered_tokens=[]
  for token in doc:
    if token.is_stop or token.is_punct:
      continue
    filtered_tokens.append(token.lemma_)
  return " ".join(filtered_tokens)


In [22]:
df['preprocessed_text'] = df['Text'].apply(preprocess)

In [23]:
df.head()

Unnamed: 0,Text,label,label_num,preprocessed_text
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0,Urban Ladder Eisner low Study Office Computer ...
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0,contrast live Wooden Decorative Box Painted Bo...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2,IO Crest SY PCI40010 PCI raid Host Controller ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3,ISAKAA Baby Socks bear 8 Years- Pack 4 6 8 12 ...
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3,Indira Designer Women Art Mysore Silk Saree Bl...


In [24]:
#splitting train and test datasets

from sklearn.model_selection import train_test_split

X = df.preprocessed_text
y = df.label_num

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2024,stratify=df.label_num)

In [25]:
# using random forest algorithm to train a ML model

from sklearn.ensemble import RandomForestClassifier

clf = Pipeline([
    ('vectorizer_tfidf',TfidfVectorizer()),
    ('random_forest',RandomForestClassifier())
])

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      1200
           1       0.97      0.97      0.97      1200
           2       0.98      0.97      0.98      1200
           3       0.98      0.98      0.98      1200

    accuracy                           0.97      4800
   macro avg       0.98      0.97      0.98      4800
weighted avg       0.98      0.97      0.98      4800



Now the accuracy is slightly better.