In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2


In [9]:
oversampled_df = pd.read_csv('../input/dataset/oversampled_preprocessed.csv')

# On oversampled data

In [10]:
oversampled_df.head()

Unnamed: 0.1,Unnamed: 0,product_name,product_category_tree,description,brand,classification_category,main_category,cleaned_description
0,1523,Creative Spinach Blue Lion Fish Aquarium Toy S...,"[""Pet Supplies >> Toys >> Comfort Toys""]",Buy Creative Spinach Blue Lion Fish Aquarium T...,Creative Spinach,petsupplies,1,buy creative spinach blue lion fish aquarium t...
1,16016,Pawzone Round Stainless Steel Pet Bowl,"[""Pet Supplies >> Storage & Feeding Utensils >...",Specifications of Pawzone Round Stainless Stee...,Pawzone,petsupplies,1,specification pawzone round stainless steel pe...
2,16038,Pawzone Round Stainless Steel Pet Bowl,"[""Pet Supplies >> Storage & Feeding Utensils >...",Key Features of Pawzone Round Stainless Steel ...,Pawzone,petsupplies,1,key feature pawzone round stainless steel pet ...
3,15619,Royal Canin Maxi Starter 1kg Vegetable Dog Food,"[""Pet Supplies >> Food & Health Supplies >> Fo...",Specifications of Royal Canin Maxi Starter 1kg...,Royal Canin,petsupplies,1,specification royal canin maxi starter 1kg veg...
4,10214,Ocean Free Sponge Aquarium Filter,"[""Pet Supplies >> Habitat >> Habitat Essential...",Buy Ocean Free Sponge Aquarium Filter for Rs.1...,Ocean Free,petsupplies,1,buy ocean sponge aquarium filter rs 199 online...


In [11]:
oversampled_df['classification_category'].value_counts()

petsupplies            3000
jewellery              3000
footwear               3000
ebooks                 3000
housefurnishing        3000
babycare               3000
toys&schoolsupplies    3000
automotive             3000
clothing               3000
tools&hardware         3000
sports&fitness         3000
personalaccessories    3000
electronics            3000
Name: classification_category, dtype: int64

In [12]:
oversampled_df['corpus'] = oversampled_df['product_name'] + ' ' + oversampled_df['cleaned_description']

In [13]:
oversampled_df = oversampled_df.sample(frac = 1).reindex()

In [14]:
oversampled_df["classification_category"] = oversampled_df["classification_category"].astype('category')
oversampled_df['labels'] = oversampled_df.classification_category.cat.codes

In [15]:
oversampled_df.head(1)

Unnamed: 0.1,Unnamed: 0,product_name,product_category_tree,description,brand,classification_category,main_category,cleaned_description,corpus,labels
7663,13375,Catwalk Women Wedges,"[""Footwear >> Women's Footwear >> Wedges""]",Catwalk Women Wedges\n ...,,footwear,1,catwalk woman wedge price 2 995 feel energy wa...,Catwalk Women Wedges catwalk woman wedge price...,5


In [16]:
# splitting train and validation data
from sklearn.model_selection import train_test_split
over_train, over_valid = train_test_split(oversampled_df, train_size = 0.8, test_size = 0.2, random_state = 0)

In [17]:
print(over_train.shape, over_valid.shape)

(31200, 10) (7800, 10)


In [18]:
over_train_X = over_train.corpus
over_train_y = over_train.labels
over_valid_X = over_valid.corpus
over_valid_y = over_valid.labels

In [19]:


pipeline = Pipeline([('tfidf',TfidfVectorizer()),
                     ('chi',  SelectKBest(chi2, k=100)),
                     ('clf', RandomForestClassifier())])



In [22]:
model=pipeline.fit(over_train_X,over_train_y)

In [23]:
predictions=model.predict(over_valid_X)

In [24]:
from sklearn.metrics import accuracy_score
print(accuracy_score(over_valid_y, predictions))

0.911025641025641


# On unbalanced data

In [25]:
df = pd.read_csv('../input/dataset/unbalanced_preprocessed.csv')

In [26]:
df = df[df['main_category'] == 1]
df['classification_category'].value_counts()

clothing               5503
jewellery              2946
housefurnishing        2307
personalaccessories    1535
electronics            1292
footwear               1123
automotive             1009
toys&schoolsupplies     626
tools&hardware          333
babycare                324
sports&fitness          166
petsupplies              30
ebooks                   15
Name: classification_category, dtype: int64

In [27]:
df['corpus'] = df['product_name'] + ' ' + df['cleaned_description']

In [28]:
df = df.sample(frac = 1).reindex()

In [29]:
df["classification_category"] = df["classification_category"].astype('category')
df['labels'] = df.classification_category.cat.codes

In [30]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,product_name,product_category_tree,description,brand,classification_category,main_category,cleaned_description,corpus,labels
1944,1944,Vivity Comfortable Women's Plunge Bra,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",Vivity Comfortable Women's Plunge Bra - Buy Or...,Orange,clothing,1,vivity comfortable woman s plunge bra buy oran...,Vivity Comfortable Women's Plunge Bra vivity c...,2


In [31]:
# splitting train and validation data
from sklearn.model_selection import train_test_split
train, valid = train_test_split(df, train_size = 0.8, test_size = 0.2, random_state = 0)

In [32]:
train_X = train.corpus
train_y = train.labels
valid_X = valid.corpus
valid_y = valid.labels

In [33]:
pipeline1 = Pipeline([('tfidf',TfidfVectorizer()),
                     ('chi',  SelectKBest(chi2, k=100)),
                     ('clf', RandomForestClassifier())])
model1=pipeline1.fit(train_X,train_y)

In [34]:
pred = model1.predict(valid_X)

In [35]:
print(accuracy_score(valid_y, pred))

0.868390470656595
