In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus for the TF-IDF demo: eight short sentences, five of which share the
# "<company> is announcing new <product> tomorrow" template.
corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new echo-dot tomorrow",
    "I am eating biriyani and you are eating grapes",
    "something is amazing",
]

In [2]:
# Learn the vocabulary and idf weights from the toy corpus, then encode the same
# corpus as a sparse TF-IDF matrix (one row per sentence).
v = TfidfVectorizer().fit(corpus)
transformed_output = v.transform(corpus)

# term -> column index mapping learned during fitting
print(v.vocabulary_)

{'thor': 27, 'eating': 11, 'pizza': 23, 'loki': 18, 'is': 17, 'ironman': 16, 'ate': 8, 'already': 0, 'apple': 6, 'announcing': 5, 'new': 21, 'iphone': 15, 'tomorrow': 28, 'tesla': 26, 'model': 20, 'google': 13, 'pixel': 22, 'microsoft': 19, 'surface': 25, 'amazon': 3, 'echo': 12, 'dot': 10, 'am': 1, 'biriyani': 9, 'and': 4, 'you': 29, 'are': 7, 'grapes': 14, 'something': 24, 'amazing': 2}


In [5]:
# get_feature_names_out() lists terms in column order, which is the same order
# as v.idf_, so the two can be walked in lockstep.
all_feature_names = v.get_feature_names_out()

for word, idf_score in zip(all_feature_names, v.idf_):
    print(f"{word} {idf_score}")

already 2.504077396776274
am 2.504077396776274
amazing 2.504077396776274
amazon 2.504077396776274
and 2.504077396776274
announcing 1.4054651081081644
apple 2.504077396776274
are 2.504077396776274
ate 2.504077396776274
biriyani 2.504077396776274
dot 2.504077396776274
eating 2.09861228866811
echo 2.504077396776274
google 2.504077396776274
grapes 2.504077396776274
iphone 2.504077396776274
ironman 2.504077396776274
is 1.1177830356563834
loki 2.504077396776274
microsoft 2.504077396776274
model 2.504077396776274
new 1.4054651081081644
pixel 2.504077396776274
pizza 2.504077396776274
something 2.504077396776274
surface 2.504077396776274
tesla 2.504077396776274
thor 2.504077396776274
tomorrow 1.4054651081081644
you 2.504077396776274


In [6]:
# First two sentences, for comparison with their TF-IDF rows.
corpus[0:2]

['Thor eating pizza, Loki is eating pizza, Ironman ate pizza already',
 'Apple is announcing new iphone tomorrow']

In [7]:
# Dense view of the TF-IDF rows for the first two sentences
# (slice the sparse matrix first, then densify only those rows).
transformed_output[:2].toarray()

array([[0.24247317, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.24247317, 0.        ,
        0.        , 0.40642288, 0.        , 0.        , 0.        ,
        0.        , 0.24247317, 0.10823643, 0.24247317, 0.        ,
        0.        , 0.        , 0.        , 0.7274195 , 0.        ,
        0.        , 0.        , 0.24247317, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.31652498, 0.5639436 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5639436 , 0.        , 0.25173606, 0.        , 0.        ,
        0.        , 0.31652498, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.31652498, 0.        ]])

In [34]:
import pandas as pd
import numpy as np

# Column names are supplied explicitly, so the first line of the file is read as data.
df = pd.read_csv(
    "ecommerceDataset.csv",
    header=None,
    names=["label", "text"],
)
df.head()

Unnamed: 0,label,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [35]:
# Discard rows that have a missing value in any column.
df = df.dropna()

In [36]:
# Rows per category, to see how imbalanced the classes are.
df["label"].value_counts()

Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8670
Name: label, dtype: int64

In [37]:
# Undersample every category down to the size of the rarest one so that all
# classes end up equally represented.
# The target size is derived from the data rather than hard-coded: a literal
# would make .sample() raise as soon as any class has fewer rows than it.
min_samples = int(df["label"].value_counts().min())

# Same seed for each draw so the balanced subset is reproducible.
df_household = df[df["label"] == "Household"].sample(n=min_samples, random_state=42)
df_books = df[df["label"] == "Books"].sample(n=min_samples, random_state=42)
df_electronics = df[df["label"] == "Electronics"].sample(n=min_samples, random_state=42)
df_clothing = df[df["label"] == "Clothing & Accessories"].sample(n=min_samples, random_state=42)

In [38]:
# Stack the four equally sized per-category samples row-wise into one frame,
# then confirm every label now has the same count.
df_balanced = pd.concat(
    (df_household, df_books, df_electronics, df_clothing),
    axis=0,
)
df_balanced["label"].value_counts()

Household                 8670
Books                     8670
Electronics               8670
Clothing & Accessories    8670
Name: label, dtype: int64

In [39]:
# Encode each category name as an integer class id for the classifiers.
df_balanced["label_num"] = df_balanced["label"].map(
    {
        "Household": 0,
        "Books": 1,
        "Electronics": 2,
        "Clothing & Accessories": 3,
    }
)

In [40]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the raw text, keeping class proportions equal in
# both parts. A fixed random_state makes the partition (and therefore every
# score computed from it) reproducible across kernel restarts; without it each
# run shuffles the rows differently.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["text"],
    df_balanced["label_num"],
    test_size=0.2,
    random_state=42,
    stratify=df_balanced["label_num"],
)

In [41]:
# Shapes of the four split parts, in order: X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((27744,), (6936,), (27744,), (6936,))

In [42]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [50]:
# TF-IDF features fed into a k-nearest-neighbours classifier (both with default settings).
clf = Pipeline(
    steps=[
        ('vectorizer_tfidf', TfidfVectorizer()),
        ('KNN', KNeighborsClassifier()),
    ]
)

# Fit on the training split, then score the held-out split.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1734
           1       0.97      0.96      0.96      1734
           2       0.97      0.95      0.96      1734
           3       0.97      0.97      0.97      1734

    accuracy                           0.96      6936
   macro avg       0.96      0.96      0.96      6936
weighted avg       0.96      0.96      0.96      6936



In [51]:
# First five held-out texts.
X_test.head(5)

33318    QRAFTINK Socks for Women, Girls | Premium Merc...
10353    SRBI 12 Inch Stainless Steel Homebrew Thermome...
49547    MOUNT TRACK 9106 Nylon 80L Backpack with Rain ...
19080    Black & Decker G720R 4-Inch/100mm 820-Watt Ang...
36765    Shocknshop Girl's Cotton Bralett (AMZN-25__Bla...
Name: text, dtype: object

In [52]:
# True class ids for the first five held-out texts.
y_test.head(5)

33318    3
10353    0
49547    2
19080    0
36765    3
Name: label_num, dtype: int64

In [53]:
# Predicted class ids for the first five held-out texts.
y_pred[0:5]

array([3, 0, 2, 0, 3], dtype=int64)

In [54]:
from sklearn.naive_bayes import MultinomialNB

# TF-IDF features fed into a multinomial Naive Bayes classifier (default settings).
clf = Pipeline(
    steps=[
        ('vectorizer_tfidf', TfidfVectorizer()),
        ('multi nb', MultinomialNB()),
    ]
).fit(X_train, y_train)

# Score the held-out split.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.95      0.93      1734
           1       0.97      0.93      0.95      1734
           2       0.95      0.94      0.95      1734
           3       0.98      0.98      0.98      1734

    accuracy                           0.95      6936
   macro avg       0.95      0.95      0.95      6936
weighted avg       0.95      0.95      0.95      6936



In [55]:
from sklearn.ensemble import RandomForestClassifier

# TF-IDF features fed into a random forest.
# The forest is a randomized model, so it is seeded: without random_state the
# fitted trees, and hence the reported scores, differ from one run to the next.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('random forest', RandomForestClassifier(random_state=42))
])
clf.fit(X_train, y_train)

# Score the held-out split.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1734
           1       0.96      0.97      0.97      1734
           2       0.97      0.95      0.96      1734
           3       0.98      0.98      0.98      1734

    accuracy                           0.96      6936
   macro avg       0.96      0.96      0.96      6936
weighted avg       0.96      0.96      0.96      6936



In [56]:
import spacy

# Load the spaCy pipeline named "en_core_web_sm"; the resulting `nlp` callable is
# what preprocess_text uses to tokenize and lemmatize each document.
nlp = spacy.load("en_core_web_sm")

In [57]:
def preprocess_text(text):
    """Reduce `text` to the lemmas of its non-stop-word, non-punctuation tokens.

    The text is run through the notebook-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are dropped, and the lemmas of the
    remaining tokens are joined with single spaces.

    Args:
        text: The raw string to process.

    Returns:
        A single space-separated string of lemmas (empty if nothing is kept).
    """
    return " ".join(
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    )

In [58]:
# Run every document through preprocess_text, one row at a time, and store the
# result in a new column next to the raw text.
df_balanced["preprocessed_text"] = df_balanced["text"].map(preprocess_text)

In [59]:
# Compare raw and preprocessed text side by side for the first five rows.
df_balanced.head(5)

Unnamed: 0,label,text,label_num,preprocessed_text
12340,Household,"Riedel VINUM Cognac Glasses, Set of 2 Size:Set...",0,Riedel VINUM Cognac Glasses set 2 size set 2 ...
7144,Household,ANSIO Polyester Shower Curtain for Bathroom - ...,0,ANSIO Polyester Shower Curtain Bathroom Solid ...
4400,Household,Little India Meenakari Work Lord Ganesh Marble...,0,little India Meenakari Work Lord Ganesh Marble...
9992,Household,PETRICE Triple Paper Dispenser | 4 in 1 Foil C...,0,petrice Triple Paper Dispenser | 4 1 foil clin...
16601,Household,KC Cab Copper PVC Insulated Wire 90 m Coil(Red...,0,KC Cab Copper PVC Insulated Wire 90 m Coil(Red...


In [60]:
# Stratified 80/20 split of the preprocessed text; this rebinds X_train/X_test/
# y_train/y_test. A fixed random_state makes the partition reproducible across
# kernel restarts, so the scores computed from it are stable rather than
# varying with each run's shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["preprocessed_text"],
    df_balanced["label_num"],
    test_size=0.2,
    random_state=42,
    stratify=df_balanced["label_num"],
)

In [61]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on TF-IDF features, fitted on whichever
# X_train/y_train split is currently bound in the notebook.
clf = Pipeline(
    steps=[
        ('vectorizer_tfidf', TfidfVectorizer()),
        ('multi nb', MultinomialNB()),
    ]
)

# Fit, predict on the held-out split, and report per-class metrics.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93      1734
           1       0.97      0.93      0.95      1734
           2       0.94      0.95      0.95      1734
           3       0.97      0.98      0.97      1734

    accuracy                           0.95      6936
   macro avg       0.95      0.95      0.95      6936
weighted avg       0.95      0.95      0.95      6936



In [62]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on TF-IDF features, fitted on whichever X_train/y_train split
# is currently bound in the notebook.
# The forest is seeded so its scores are repeatable; an unseeded forest gives
# slightly different numbers every run, which muddies any comparison.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('random forest', RandomForestClassifier(random_state=42))
])
clf.fit(X_train, y_train)

# Score the held-out split.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      1734
           1       0.98      0.97      0.98      1734
           2       0.96      0.96      0.96      1734
           3       0.98      0.98      0.98      1734

    accuracy                           0.97      6936
   macro avg       0.97      0.97      0.97      6936
weighted avg       0.97      0.97      0.97      6936

