In [None]:
import pandas as pd
import csv


In [None]:
# Load the training CSV with quoting disabled; a backslash acts as the escape character.
train = pd.read_csv(
    'train.csv',
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
)

In [None]:
# Preview the first three rows of the freshly loaded training frame.
train.head(3)

In [None]:
# Print the DataFrame.info() summary of the training frame.
train.info()

In [None]:
# Count the missing values in each column.
train.isna().sum()

In [None]:
# Discard every row that has a missing value in any column, then preview the result.
train = train.dropna(axis=0, how='any')
train.head(n=3)

In [None]:
# Number of distinct BROWSE_NODE_ID values left after dropping incomplete rows.
browse_node_frequencies = train['BROWSE_NODE_ID'].value_counts()
len(browse_node_frequencies)

In [None]:
# Remove fully duplicated rows; rebinding (rather than inplace=True) keeps the cell re-runnable.
train = train.drop_duplicates()

In [None]:
import re
import nltk
# Download the NLTK 'stopwords' resource requested by name here.
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Punctuation that clean_text replaces with a space.
# Raw string: '\[', '\]' and '\|' are invalid escapes in a normal string literal
# (SyntaxWarning on Python 3.12+); the compiled pattern itself is unchanged.
remove_special_char = re.compile(r'[/(){}\[\]\|@,;]')
# Every character that is not a digit, lowercase letter, space, '#', '+' or '_'
# is deleted outright by clean_text.
remove_extra_symbols = re.compile(r'[^0-9a-z #+_]')
# Set of NLTK English stopwords, used for fast membership tests while cleaning text.
STOPWORDS = {*stopwords.words('english')}

In [None]:
def clean_text(text):
    """Normalise one free-text cell into a space-separated string of tokens.

    Steps: lowercase, replace the characters matched by ``remove_special_char``
    with spaces, delete the characters matched by ``remove_extra_symbols``,
    then drop every token found in ``STOPWORDS``.

    Parameters
    ----------
    text : str, or a missing value (``None`` / float NaN), or any other scalar
        Raw cell value. Missing values yield an empty string; other non-string
        values are converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    # Missing cells show up as None or float NaN (NaN is the only float that
    # is not equal to itself); treat them as empty text instead of crashing
    # on ``.lower()``.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = remove_special_char.sub(' ', text)
    text = remove_extra_symbols.sub('', text)
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

In [None]:
# Run every free-text column of the training frame through clean_text.
for column in ('TITLE', 'BULLET_POINTS', 'DESCRIPTION', 'BRAND'):
    train[column] = train[column].apply(clean_text)

In [None]:
# Print the frame summary again, this time asking info() for 'deep' memory usage.
train.info(memory_usage = 'deep')

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# One document per product: the four cleaned text fields joined with a space.
# Without the separator the last word of one field fuses with the first word
# of the next (e.g. "...cotton" + "shirt..." -> "cottonshirt"), producing
# tokens that never occur in real text.
X = (
    train['DESCRIPTION'] + ' ' + train['BULLET_POINTS'] + ' '
    + train['TITLE'] + ' ' + train['BRAND']
)

# astype(np.uint16) silently wraps values outside 0..65535, which would
# corrupt the labels; fail loudly instead if the ids do not fit.
node_ids = train['BROWSE_NODE_ID']
uint16_max = np.iinfo(np.uint16).max
assert node_ids.min() >= 0 and node_ids.max() <= uint16_max, \
    "BROWSE_NODE_ID values do not fit in uint16"
y = node_ids.astype(np.uint16)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [None]:
# TF-IDF features over unigrams and bigrams, feeding a multinomial naive Bayes classifier.
tfidf_step = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)
classifier_step = MultinomialNB(alpha=.01)

nb = Pipeline(steps=[
    ('tfid', tfidf_step),
    ('clf', classifier_step),
])
nb.fit(X_train, y_train)

In [None]:

# Predict a browse node for every held-out document.
y_pred = nb.predict(X_test)

In [None]:
# accuracy_score expects (y_true, y_pred); pass the ground truth first so the
# call matches the documented signature.
print(accuracy_score(y_test, y_pred))

In [None]:
# Spot-check the fitted pipeline on a single hand-written product title.
sample_titles = ['MenS Full Sleeve shirt']
print(nb.predict(sample_titles))

In [None]:
# Load the test CSV with the same parser settings as the training file.
test = pd.read_csv(
    'test.csv',
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
)

In [None]:
# Prepare the test text exactly like the training text: the model was fitted
# on clean_text output, so feeding it raw strings changes the tokenisation.
# Missing cells become empty strings; str(NaN) would otherwise inject the
# literal token "nan" into the document.
TEXT_COLUMNS = ['DESCRIPTION', 'BULLET_POINTS', 'TITLE', 'BRAND']
for column in TEXT_COLUMNS:
    test[column] = test[column].apply(
        lambda value: '' if pd.isna(value) else clean_text(str(value))
    )

# Same field order as the training documents, joined with spaces so words
# from adjacent fields do not fuse into one token.
x_testing = (
    test['DESCRIPTION'] + ' ' + test['BULLET_POINTS'] + ' '
    + test['TITLE'] + ' ' + test['BRAND']
)
y_pred_test = nb.predict(x_testing)
print(y_pred_test)

In [None]:
# Pair each test PRODUCT_ID with the browse node predicted for it.
submission_columns = {
    "PRODUCT_ID": test['PRODUCT_ID'],
    "BROWSE_NODE_ID": y_pred_test,
}
results = pd.DataFrame(data=submission_columns)

In [None]:
# Write the predictions to disk without the index column.
results.to_csv(path_or_buf='Submission.csv', index=False)