In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [2]:
oversampled_df = pd.read_csv('../input/dataset/oversampled_preprocessed.csv')

# On oversampled data

In [3]:
oversampled_df.head()

Unnamed: 0.1,Unnamed: 0,product_name,product_category_tree,description,brand,classification_category,main_category,cleaned_description
0,1523,Creative Spinach Blue Lion Fish Aquarium Toy S...,"[""Pet Supplies >> Toys >> Comfort Toys""]",Buy Creative Spinach Blue Lion Fish Aquarium T...,Creative Spinach,petsupplies,1,buy creative spinach blue lion fish aquarium t...
1,16016,Pawzone Round Stainless Steel Pet Bowl,"[""Pet Supplies >> Storage & Feeding Utensils >...",Specifications of Pawzone Round Stainless Stee...,Pawzone,petsupplies,1,specification pawzone round stainless steel pe...
2,16038,Pawzone Round Stainless Steel Pet Bowl,"[""Pet Supplies >> Storage & Feeding Utensils >...",Key Features of Pawzone Round Stainless Steel ...,Pawzone,petsupplies,1,key feature pawzone round stainless steel pet ...
3,15619,Royal Canin Maxi Starter 1kg Vegetable Dog Food,"[""Pet Supplies >> Food & Health Supplies >> Fo...",Specifications of Royal Canin Maxi Starter 1kg...,Royal Canin,petsupplies,1,specification royal canin maxi starter 1kg veg...
4,10214,Ocean Free Sponge Aquarium Filter,"[""Pet Supplies >> Habitat >> Habitat Essential...",Buy Ocean Free Sponge Aquarium Filter for Rs.1...,Ocean Free,petsupplies,1,buy ocean sponge aquarium filter rs 199 online...


In [4]:
oversampled_df['classification_category'].value_counts()

petsupplies            3000
jewellery              3000
footwear               3000
ebooks                 3000
housefurnishing        3000
babycare               3000
toys&schoolsupplies    3000
automotive             3000
clothing               3000
tools&hardware         3000
sports&fitness         3000
personalaccessories    3000
electronics            3000
Name: classification_category, dtype: int64

In [5]:
oversampled_df['corpus'] = oversampled_df['product_name'] + ' ' + oversampled_df['cleaned_description']

In [6]:
oversampled_df = oversampled_df.sample(frac = 1).reindex()

In [7]:
oversampled_df["classification_category"] = oversampled_df["classification_category"].astype('category')
oversampled_df['labels'] = oversampled_df.classification_category.cat.codes

In [8]:
oversampled_df.head(1)

Unnamed: 0.1,Unnamed: 0,product_name,product_category_tree,description,brand,classification_category,main_category,cleaned_description,corpus,labels
18397,17012,Sentinel Mak Furige Model 07,"[""Toys & School Supplies >> Action Figures >> ...",Sentinel Mak Furige Model 07 (Multicolor) Pric...,Sentinel,toys&schoolsupplies,1,sentinel mak furige model 07 multicolor price ...,Sentinel Mak Furige Model 07 sentinel mak furi...,12


In [9]:
# splitting train and validation data
from sklearn.model_selection import train_test_split
over_train, over_valid = train_test_split(oversampled_df, train_size = 0.8, test_size = 0.2, random_state = 0)

In [10]:
print(over_train.shape, over_valid.shape)

(31200, 10) (7800, 10)


In [11]:
over_train_X = over_train.corpus
over_train_y = over_train.labels
over_valid_X = over_valid.corpus
over_valid_y = over_valid.labels

In [12]:
model=make_pipeline(TfidfVectorizer(),MultinomialNB())
model.fit(over_train_X,over_train_y)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [13]:
predictions = model.predict(over_valid_X)

In [14]:
from sklearn.metrics import accuracy_score
print(accuracy_score(over_valid_y, predictions))

0.9806410256410256


# On unbalanced data

In [15]:
df = pd.read_csv('../input/dataset/unbalanced_preprocessed.csv')

In [16]:
df = df[df['main_category'] == 1]
df['classification_category'].value_counts()

clothing               5503
jewellery              2946
housefurnishing        2307
personalaccessories    1535
electronics            1292
footwear               1123
automotive             1009
toys&schoolsupplies     626
tools&hardware          333
babycare                324
sports&fitness          166
petsupplies              30
ebooks                   15
Name: classification_category, dtype: int64

In [17]:
df['corpus'] = df['product_name'] + ' ' + df['cleaned_description']

In [18]:
df = df.sample(frac = 1).reindex()

In [19]:
df["classification_category"] = df["classification_category"].astype('category')
df['labels'] = df.classification_category.cat.codes

In [20]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,product_name,product_category_tree,description,brand,classification_category,main_category,cleaned_description,corpus,labels
1446,1446,Parv Collections Showpiece - 12 cm,"[""Home Decor & Festive Needs >> Showpieces >> ...",Parv Collections Showpiece - 12 cm (Polyresi...,Parv Collections,housefurnishing,1,parv collection showpiece 12 cm polyresin mult...,Parv Collections Showpiece - 12 cm parv coll...,6


In [21]:
# splitting train and validation data
from sklearn.model_selection import train_test_split
train, valid = train_test_split(df, train_size = 0.8, test_size = 0.2, random_state = 0)

In [22]:
train_X = train.corpus
train_y = train.labels
valid_X = valid.corpus
valid_y = valid.labels

In [23]:
model1=make_pipeline(TfidfVectorizer(),MultinomialNB())
model1.fit(train_X,train_y)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [24]:
pred = model.predict(valid_X)

In [25]:
print(accuracy_score(valid_y, pred))

0.962521789657176
