In [1]:
try:
    import json
    import os
    
    import pandas as  pd
    import spacy
    
    import seaborn as sns
    import string

    from tqdm import tqdm
    from textblob import TextBlob
    
    from nltk.corpus import stopwords
    import nltk
    from nltk.stem import WordNetLemmatizer
    from nltk import word_tokenize
    import re
    
    
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import Pipeline
    
    
    from sklearn.preprocessing import FunctionTransformer
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.pipeline import FeatureUnion
    from sklearn.feature_extraction import DictVectorizer
    
    import swifter
    
    tqdm.pandas()
except Exception as e:
    print("Error : {} ".format(e))

In [3]:
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("avaerage_perception_tagger")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maashree\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maashree\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Error loading avaerage_perception_tagger: Package
[nltk_data]     'avaerage_perception_tagger' not found in index
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maashree\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

# Data Exploration

In [4]:
import pandas as pd
train_df = pd.read_csv("training_data.csv")
train_df.head()

Unnamed: 0,title,category
0,The Three Amigos,
1,Home Essentials Blue Floral Glass Vintage Styl...,Home & Kitchen
2,Cooper Wiring Quiet Toggle Switch Single Pole ...,Tools & Home Improvement
3,Baseboarders&reg; Wall Brackets,Tools & Home Improvement
4,The Great Wave Off Kanagawa Custom Rectangle M...,Office Products


In [5]:
train_df.isna().sum()

title       0
category    0
dtype: int64

In [6]:
train_df["category"].unique()

array(['None', 'Home & Kitchen', 'Tools & Home Improvement',
       'Office Products', 'Grocery & Gourmet Food', 'Electronics',
       'Industrial & Scientific'], dtype=object)

In [15]:
train_df["category"].value_counts()

None                        10123
Home & Kitchen               4960
Tools & Home Improvement     2080
Office Products              1144
Grocery & Gourmet Food       1102
Industrial & Scientific       588
Electronics                   191
Name: category, dtype: int64

In [16]:
train_df.describe()

Unnamed: 0,title,category
count,20188,20188.0
unique,20188,7.0
top,The Three Amigos,
freq,1,10123.0


In [None]:
pip 

In [17]:
## Preprocessing

In [18]:
#Remove puntuation
#lemmitization
#making lowercase

In [30]:
stop_words = set(stopwords.words("english"))
wnlem = WordNetLemmatizer()
my_words = []
  
def clean_txt(text):
    clean_text1 = []
    clean_text2 = []
    text = re.sub("'", "",text)
    text=re.sub("(\\d|\\W)+"," ",text)    
    clean_text1 = [ wnlem.lemmatize(word, pos="v") for word in word_tokenize(text.lower()) if black_txt(word)]
    clean_text2 = [word for word in clean_text1 if black_txt(word)]
    return " ".join(clean_text2)

def black_txt(token):
    return  token not in stop_words and token not in list(string.punctuation)  and len(token)>2 and token not in my_words


In [31]:
def subj_txt(text):
    return  TextBlob(text).sentiment[1]

def polarity_txt(text):
    return TextBlob(text).sentiment[0]

def len_txt(text):
    if len(text.split())>0:
         return len(set(clean_txt(text).split()))/ len(text.split())
    else:
         return 0

In [32]:
train_df["text"] = train_df["title"]
train_df["text"] = train_df["text"].swifter.apply(clean_txt)
train_df["polarity"] = train_df["text"].swifter.apply(polarity_txt)
train_df["subjectivity"] = train_df["text"].swifter.apply(subj_txt)
train_df["len"] = train_df["text"].swifter.apply(lambda x: len(x))

Pandas Apply:   0%|          | 0/20188 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/20188 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/20188 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/20188 [00:00<?, ?it/s]

In [34]:
#Label encoding

X = train_df[["text", "polarity", "subjectivity","len"]]
y =train_df["category"]

encoder = LabelEncoder()
y = encoder.fit_transform(y)

y

array([4, 2, 6, ..., 2, 4, 6])

In [35]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [36]:
v = dict(zip(list(y), train_df["category"].to_list()))
v

{4: 'None',
 2: 'Home & Kitchen',
 6: 'Tools & Home Improvement',
 5: 'Office Products',
 1: 'Grocery & Gourmet Food',
 0: 'Electronics',
 3: 'Industrial & Scientific'}

In [40]:
text_clf = Pipeline([
    ("vector",CountVectorizer(analyzer="word",stop_words="english")),
    ("tfidf",TfidfTransformer(use_idf=True)),
    ("mnb",MultinomialNB(alpha=.01)),
])

In [41]:
text_clf.fit(x_train["text"].to_list(),list(y_train))

Pipeline(steps=[('vector', CountVectorizer(stop_words='english')),
                ('tfidf', TfidfTransformer()),
                ('mnb', MultinomialNB(alpha=0.01))])

In [42]:
import numpy as np

X_TEST = x_test['text'].to_list()
Y_TEST = list(y_test)

In [43]:
predicted = text_clf.predict(X_TEST)

In [44]:
c = 0

for doc, category in zip(X_TEST, predicted):
    
    if c == 2:break
    
    print("-"*55)
    print(doc)
    print(v[category])
    print("-"*55)

    c = c + 1 

-------------------------------------------------------
designer stencil mini jack lantern face cookie stencil beige semi transparent
None
-------------------------------------------------------
-------------------------------------------------------
zartsy hand paint abstract landscape red black silver passion color artwork home wall decor art oil paint canvas stretch wood frame large oversized
None
-------------------------------------------------------


In [45]:
# Accuracy

np.mean(predicted == Y_TEST)

0.4725111441307578

In [79]:
title = ["Love sign cookie cutter language hand"]

In [80]:
predicted = text_clf.predict(title)

In [81]:
v[predicted[0]]

'Home & Kitchen'

In [82]:
test_df=pd.read_csv("testing_data.csv")
test_df.head()

Unnamed: 0,title,category
0,Pom Pom Hair Band Rabbit Light Grey Decorative...,
1,Mariposa Golf Ball Napkin Weight,
2,Mediterranean Snack Food Roasted Garlic Hummuz...,
3,John Deere 0071750GX22269 Genuine Original Equ...,
4,"Protech Wood Cleanr Gl,PERFORMANCE COATINGS IN...",


In [83]:
title = test_df["title"]

0        Pom Pom Hair Band Rabbit Light Grey Decorative...
1                         Mariposa Golf Ball Napkin Weight
2        Mediterranean Snack Food Roasted Garlic Hummuz...
3        John Deere 0071750GX22269 Genuine Original Equ...
4        Protech Wood Cleanr Gl,PERFORMANCE COATINGS IN...
                               ...                        
10089    Flexible Coupling - 3.0&quot; x 3.0&quot; - White
10090    Mr. Christmas 4 1/2-Inch Mini Porcelain Music ...
10091    Jewish New Year - Shanah Tovah - Rosh Hashana ...
10092    Hotel Collection Gridwork Queen Bedskirt Graphite
10093    2-LBS SUPER FRESH ROASTED &ndash; YIRGACHEFFE ...
Name: title, Length: 10094, dtype: object

In [98]:
predicted = text_clf.predict(title)
predicted

array([2, 4, 4, ..., 2, 2, 4])

In [95]:
v[predicted[500]]

'None'

In [119]:
zipvar=pd.DataFrame(zip(title,predicted))
zipvar.columns=["title","category"]
zipvar

Unnamed: 0,title,category
0,Pom Pom Hair Band Rabbit Light Grey Decorative...,2
1,Mariposa Golf Ball Napkin Weight,4
2,Mediterranean Snack Food Roasted Garlic Hummuz...,4
3,John Deere 0071750GX22269 Genuine Original Equ...,4
4,"Protech Wood Cleanr Gl,PERFORMANCE COATINGS IN...",4
...,...,...
10089,Flexible Coupling - 3.0&quot; x 3.0&quot; - White,4
10090,Mr. Christmas 4 1/2-Inch Mini Porcelain Music ...,4
10091,Jewish New Year - Shanah Tovah - Rosh Hashana ...,2
10092,Hotel Collection Gridwork Queen Bedskirt Graphite,2


In [122]:
num_to_cat={4: 'None',
 2: 'Home & Kitchen',
 6: 'Tools & Home Improvement',
 5: 'Office Products',
 1: 'Grocery & Gourmet Food',
 0: 'Electronics',
 3: 'Industrial & Scientific'}

In [123]:
zipvar.category=zipvar.category.replace(num_to_cat)

In [127]:
zipvar

Unnamed: 0,title,category
0,Pom Pom Hair Band Rabbit Light Grey Decorative...,Home & Kitchen
1,Mariposa Golf Ball Napkin Weight,
2,Mediterranean Snack Food Roasted Garlic Hummuz...,
3,John Deere 0071750GX22269 Genuine Original Equ...,
4,"Protech Wood Cleanr Gl,PERFORMANCE COATINGS IN...",
...,...,...
10089,Flexible Coupling - 3.0&quot; x 3.0&quot; - White,
10090,Mr. Christmas 4 1/2-Inch Mini Porcelain Music ...,
10091,Jewish New Year - Shanah Tovah - Rosh Hashana ...,Home & Kitchen
10092,Hotel Collection Gridwork Queen Bedskirt Graphite,Home & Kitchen


In [128]:
zipvar.to_csv("submission_test_data.csv")