In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
#Toy corpus of seven short sentences used to demonstrate TF-IDF weighting.
#NOTE(review): "tommorrow" in the Tesla sentence is spelled differently from
#"tomorrow" in the other sentences, so it ends up as its own vocabulary term -
#confirm whether that is intentional.
corpus=["Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
        "Apple is announcing new iphone tomorrow",
        "Tesla is announcing new model-3 tommorrow",
        "Google is announcing new pixel-6 tomorrow",
        "Microsoft is announcing new surface tomorrow",
        "Amazon is announcing new eco-dot tomorrow",
        "I am eating biryani and you are eating grapes"
       ]

In [3]:
#Create the TF-IDF vectorizer, then learn the vocabulary / IDF weights and
#transform the corpus in one call. fit_transform() is equivalent to calling
#fit() followed by transform() on the same documents.
v = TfidfVectorizer()
transform_output = v.fit_transform(corpus)

In [4]:
#Print the learned vocabulary: a dict mapping each term to its feature (column) index
print(v.vocabulary_)

{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 17, 'is': 16, 'ironman': 15, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 20, 'iphone': 14, 'tomorrow': 27, 'tesla': 24, 'model': 19, 'tommorrow': 26, 'google': 12, 'pixel': 21, 'microsoft': 18, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 28, 'are': 6, 'grapes': 13}


In [5]:
#Print the IDF weight learned for every term in the vocabulary.
#get_feature_names_out() lists the terms in feature-index order, so the
#position of a term in that array is also its index into v.idf_.
all_feature_names = v.get_feature_names_out()
for indx, word in enumerate(all_feature_names):
    idf_score = v.idf_[indx]
    print(f"{word} : {idf_score}")

already : 2.386294361119891
am : 2.386294361119891
amazon : 2.386294361119891
and : 2.386294361119891
announcing : 1.2876820724517808
apple : 2.386294361119891
are : 2.386294361119891
ate : 2.386294361119891
biryani : 2.386294361119891
dot : 2.386294361119891
eating : 1.9808292530117262
eco : 2.386294361119891
google : 2.386294361119891
grapes : 2.386294361119891
iphone : 2.386294361119891
ironman : 2.386294361119891
is : 1.1335313926245225
loki : 2.386294361119891
microsoft : 2.386294361119891
model : 2.386294361119891
new : 1.2876820724517808
pixel : 2.386294361119891
pizza : 2.386294361119891
surface : 2.386294361119891
tesla : 2.386294361119891
thor : 2.386294361119891
tommorrow : 2.386294361119891
tomorrow : 1.4700036292457357
you : 2.386294361119891


# Get the dataset from Kaggle

In [6]:
import pandas as pd
#Location of the e-commerce CSV. The original local path remains the default;
#set the ECOMMERCE_DATA_CSV environment variable to load the file from
#somewhere else without editing the notebook.
import os
DATA_CSV_PATH = os.environ.get(
    "ECOMMERCE_DATA_CSV",
    "C:/8-text minning/text_mining/Ecommerce_data.csv",
)

#read the data into a pandas dataframe
df = pd.read_csv(DATA_CSV_PATH)
print(df.shape)
df.head(5)

(24000, 2)


Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [7]:
#Check the distribution of labels: value_counts() gives the number of rows per
#category, which shows whether the dataset is balanced
df['label'].value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

In [8]:
#every label occurs the same number of times (6000 each), so the dataset is balanced

In [9]:
#Encode each category name as an integer class id for the classifier.
LABEL_TO_NUM = {'Household' : 0,
                'Books' : 1,
                'Electronics' : 2,
                'Clothing & Accessories' : 3}
df['label_num'] = df['label'].map(LABEL_TO_NUM)

#Series.map() silently turns any label missing from the mapping into NaN,
#so fail loudly here instead of passing NaN targets to later cells.
unmapped_labels = df.loc[df['label_num'].isna(), 'label'].unique()
assert len(unmapped_labels) == 0, f"labels without a numeric code: {list(unmapped_labels)}"

#checking the results
df.head(5)

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3


In [11]:
from sklearn.model_selection import train_test_split
#Hold out 20% of the rows as the test set. Stratifying on label_num keeps the
#class proportions the same in both splits; the fixed random_state makes the
#split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'], df['label_num'],
    test_size=0.2,
    random_state=2022,
    stratify=df['label_num'],
)

In [12]:
#Report how many samples ended up in each split
for caption, split in (("Shape of X_train:", X_train),
                       ("Shape of X_test:", X_test)):
    print(caption, split.shape)

Shape of X_train: (19200,)
Shape of X_test: (4800,)


In [13]:
#Preview the first few training texts
X_train.head()

15820    IRIS Furniture Children Deluxe Spiderman Toddl...
23224    Godox CB-09 Hard Carrying Storage Suitcase Car...
4638     Ugreen All in 1 USB 3.0 Card Reader USB Memory...
15245    Spread Spain Metallic Gold Bar Trolley/Kitchen...
5378     Chromozome Men's Calf Socks (Pack of 3) (SX-3 ...
Name: Text, dtype: object

In [14]:
#Count the training rows per class to check how the stratified split distributed the labels
y_train.value_counts()

label_num
0    4800
2    4800
3    4800
1    4800
Name: count, dtype: int64

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
#1. build the pipeline: TF-IDF features feeding a k-nearest-neighbours classifier
clf = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('KNF', KNeighborsClassifier()),
])

#2. fit the whole pipeline on the training split
clf.fit(X_train, y_train)

#3. predict class ids for the held-out test texts and keep them in y_pred
y_pred = clf.predict(X_test)

#4. print per-class precision / recall / f1 against the true test labels
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1200
           1       0.97      0.95      0.96      1200
           2       0.97      0.97      0.97      1200
           3       0.97      0.98      0.97      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [17]:
#First five test texts
X_test[:5]

20706    Lal Haveli Designer Handmade Patchwork Decorat...
19166    GOTOTOP Classical Retro Cotton & PU Leather Ne...
15209    FabSeasons Camouflage Polyester Multi Function...
2462     Indian Superfoods: Change the Way You Eat Revi...
6621     Milton Marvel Insulated Steel Casseroles, Juni...
Name: Text, dtype: object

In [18]:
#True class ids of the first five test rows
y_test[:5]

20706    0
19166    2
15209    3
2462     1
6621     3
Name: label_num, dtype: int64

In [19]:
#Predicted class ids for the first five test rows (compare with y_test[:5])
y_pred[:5]

array([0, 2, 3, 1, 0], dtype=int64)