## TF -IDF (Term Frequency - Inverse Document Frequency)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
    "Somthing is amazing?",
    "amazon is a big company and it is a good company",
    "google is announcing a new product google pixel 9"
]


In [6]:
v = TfidfVectorizer()
transformed_output = v.fit_transform(corpus)
print(v.vocabulary_)


{'this': 20, 'is': 10, 'the': 18, 'first': 7, 'document': 6, 'second': 16, 'and': 2, 'third': 19, 'one': 13, 'somthing': 17, 'amazing': 0, 'amazon': 1, 'big': 4, 'company': 5, 'it': 11, 'good': 8, 'google': 9, 'announcing': 3, 'new': 12, 'product': 15, 'pixel': 14}


In [7]:
v.get_feature_names_out()

array(['amazing', 'amazon', 'and', 'announcing', 'big', 'company',
       'document', 'first', 'good', 'google', 'is', 'it', 'new', 'one',
       'pixel', 'product', 'second', 'somthing', 'the', 'third', 'this'],
      dtype=object)

In [9]:
all_feature_names = v.get_feature_names_out()

for word in all_feature_names:
    indx = v.vocabulary_.get(word)
    print(f"{word} {v.idf_[indx]}")

amazing 2.386294361119891
amazon 2.386294361119891
and 1.9808292530117262
announcing 2.386294361119891
big 2.386294361119891
company 2.386294361119891
document 1.6931471805599454
first 1.9808292530117262
good 2.386294361119891
google 2.386294361119891
is 1.0
it 2.386294361119891
new 2.386294361119891
one 2.386294361119891
pixel 2.386294361119891
product 2.386294361119891
second 2.386294361119891
somthing 2.386294361119891
the 1.4700036292457357
third 2.386294361119891
this 1.4700036292457357


In [10]:
corpus[:2]

['This is the first document.', 'This document is the second document.']

In [11]:
transformed_output.toarray()[:2]

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.48649932, 0.56916026, 0.        , 0.        ,
        0.28733434, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.42238252, 0.        ,
        0.42238252],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.71416002, 0.        , 0.        , 0.        ,
        0.21089721, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.50326281, 0.        , 0.31001966, 0.        ,
        0.31001966]])

In [12]:
import pandas as pd 

df = pd.read_csv("Ecommerce_data.csv")

print(df.shape)
df.head()

(24000, 2)


Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [13]:
df.label.value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

In [15]:
df['label_num'] = df.label.map({
    'Household': 0,
    'Books': 1,
    'Electronics': 2,
    'Clothing & Accessories': 3
})

df.head()

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.Text,
    df.label_num,
    test_size=0.2,
    random_state= 2022,
    stratify=df.label_num
)

In [18]:
print("shape of X_train: ", X_train.shape)
print("shape of X_test: ", X_test.shape)

shape of X_train:  (19200,)
shape of X_test:  (4800,)


In [19]:
y_train.value_counts()

label_num
0    4800
2    4800
3    4800
1    4800
Name: count, dtype: int64

In [20]:
y_test.value_counts()

label_num
0    1200
2    1200
3    1200
1    1200
Name: count, dtype: int64

In [22]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ('Vectorizer', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier())
])

clf.fit(X_train, y_train)

pred = clf.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1200
           1       0.97      0.95      0.96      1200
           2       0.97      0.97      0.97      1200
           3       0.97      0.98      0.97      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [26]:
X_test[:5]

20706    Lal Haveli Designer Handmade Patchwork Decorat...
19166    GOTOTOP Classical Retro Cotton & PU Leather Ne...
15209    FabSeasons Camouflage Polyester Multi Function...
2462     Indian Superfoods: Change the Way You Eat Revi...
6621     Milton Marvel Insulated Steel Casseroles, Juni...
Name: Text, dtype: object

In [23]:
y_test[:5]

20706    0
19166    2
15209    3
2462     1
6621     3
Name: label_num, dtype: int64

In [25]:
pred[:5]

array([0, 2, 3, 1, 0])

In [27]:
from sklearn.naive_bayes import MultinomialNB

clf = Pipeline([
    ('Vectorizer', TfidfVectorizer()),
    ('NB', MultinomialNB())
])

clf.fit(X_train, y_train)

pred = clf.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      1200
           1       0.98      0.92      0.95      1200
           2       0.97      0.97      0.97      1200
           3       0.97      0.99      0.98      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [28]:
from sklearn.ensemble import RandomForestClassifier

clf = Pipeline([
    ('Vectorizer', TfidfVectorizer()),
    ('RF', RandomForestClassifier())
])

clf.fit(X_train, y_train)

pred = clf.predict(X_test)

print(classification_report(y_test, pred))


              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1200
           1       0.98      0.98      0.98      1200
           2       0.98      0.97      0.97      1200
           3       0.98      0.99      0.98      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [30]:
### utlity function for pre-processing the text
from pydoc import doc
from matplotlib import text
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

In [31]:
df['preproccessed_txt'] = df['Text'].apply(preprocess)


In [32]:
df.head()

Unnamed: 0,Text,label,label_num,preproccessed_txt
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0,Urban Ladder Eisner Low Study Office Computer ...
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0,contrast live Wooden Decorative Box Painted Bo...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2,IO Crest SY PCI40010 PCI RAID Host Controller ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3,ISAKAA Baby Socks bear 8 Years- Pack 4 6 8 12 ...
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3,Indira Designer woman Art Mysore Silk Saree Bl...


In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.Text,
    df.label_num,
    test_size=0.2,
    random_state= 2022,
    stratify=df.label_num
)

In [34]:
from sklearn.ensemble import RandomForestClassifier

clf = Pipeline([
    ('Vectorizer', TfidfVectorizer()),
    ('RF', RandomForestClassifier())
])

clf.fit(X_train, y_train)

pred = clf.predict(X_test)

print(classification_report(y_test, pred))


              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1200
           1       0.98      0.98      0.98      1200
           2       0.98      0.97      0.97      1200
           3       0.98      0.99      0.98      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800

