In [110]:
# importing all the required libraries
import numpy as np
import pandas as pd
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn import metrics
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier as KNN

In [111]:
# reading the train.csv file
df=pd.read_csv("train.csv",quoting=csv.QUOTE_NONE, escapechar='\\')

In [112]:
df.shape

(2903024, 5)

In [113]:
df.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
2,The Ultimate Self-Sufficiency Handbook: A Comp...,,Skyhorse Publishing,imusti,2
3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3
4,Teacher Planner Company A4 6 Lesson Academic T...,,,,4


In [114]:
# any null value present in dataset
df.isnull().values.any()

True

In [115]:
# check whether all the values are null, i.e., datadet is empty or not 
df.isnull().values.all()

False

In [116]:
# number of null vales in each column
df.isnull().sum()

TITLE                 71
DESCRIPTION       723664
BULLET_POINTS     166263
BRAND              56737
BROWSE_NODE_ID         0
dtype: int64

In [117]:
# total null values
df.isnull().sum().sum()

946735

In [118]:
df['TITLE'].fillna('NA', inplace=True)
df['DESCRIPTION'].fillna('NA', inplace=True)
df['BULLET_POINTS'].fillna('NA', inplace=True)
df['BRAND'].fillna('NA', inplace=True)

In [119]:
df.isnull().sum()

TITLE             0
DESCRIPTION       0
BULLET_POINTS     0
BRAND             0
BROWSE_NODE_ID    0
dtype: int64

In [120]:
# dropping all the duplicate rows
df.drop_duplicates()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
2,The Ultimate Self-Sufficiency Handbook: A Comp...,,Skyhorse Publishing,imusti,2
3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3
4,Teacher Planner Company A4 6 Lesson Academic T...,,,,4
...,...,...,...,...,...
2903019,Premium Aviator Sunglasses - HD Polarized (Bri...,These premium Aviator Sunglasses with 5 color ...,"[Frame size: Lens height - 56mm, Lens width - ...",Generic,1040
2903020,Social Distance Stickers - Set of 5 Sticker Sl...,set of 5 prints social distancing sticker self...,[covid19 safety sticker - set of 5 to maintain...,Generic,15199
2903021,Torr-to Face Shield PACK OF 5 with Adjustable ...,* COMPLETE FACE PROTECTION: Torr-to Face Shiel...,"[350 MICRONS PACK OF 5 PCS,COMPLETE FACE PROTE...",TORR-TO,1044933
2903022,Type-C to 3.5 MM for Oppo R17 Pro Type-C to 3....,Still want to use your favorite earphones/head...,"[Indian Connectors: Made for Indian sockets, t...",SHOPBELL,14790


In [121]:
df.shape

(2903024, 5)

In [122]:
# label_encoder object knows how to understand word labels
label_encoder = preprocessing.LabelEncoder()

# Encode labels in columns 
df['TITLE']= label_encoder.fit_transform(df['TITLE']) 
df['DESCRIPTION']= label_encoder.fit_transform(df['DESCRIPTION']) 
df['BULLET_POINTS']= label_encoder.fit_transform(df['BULLET_POINTS']) 
df['BRAND']= label_encoder.fit_transform(df['BRAND']) 
print(df.head())

     TITLE  DESCRIPTION  BULLET_POINTS   BRAND  BROWSE_NODE_ID
0  1806400       836541        1160339  138446               0
1  2367054      1031328         332780  215169               1
2  2369929       772454          40270  249855               2
3   208357       772454        1074717   14699               3
4  2345371       772454          30183  144536               4


In [123]:
X = df.drop(columns = ['BROWSE_NODE_ID'])
y = df['BROWSE_NODE_ID']

# separating the the training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.00001, random_state = 2018)

# knn is the model
knn = KNN(n_neighbors = 3)
 
# train the model 'knn'
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [124]:
joblib.dump(knn, 'trained_model.joblib')

['trained_model.joblib']

In [125]:
# check whether the model is able to predict or not
print(joblib.load('trained_model.joblib').predict([[196577, 772454, 889445, 13827]]))

[1024]


In [126]:
# reading the test.csv file
testing =pd.read_csv("test.csv",quoting=csv.QUOTE_NONE, escapechar='\\')
testing.shape

(110775, 5)

In [127]:
# checking whether there is any null value or not 
testing.isnull().any()

PRODUCT_ID       False
TITLE             True
DESCRIPTION       True
BULLET_POINTS     True
BRAND             True
dtype: bool

In [128]:
# replacing all the null values by 'NA'
testing['TITLE'].fillna('NA', inplace=True)
testing['DESCRIPTION'].fillna('NA', inplace=True)
testing['BULLET_POINTS'].fillna('NA', inplace=True)
testing['BRAND'].fillna('NA', inplace=True)

# label_encoder object knows how to understand word labels
label_encoders = preprocessing.LabelEncoder()

# Encode labels in columns so that the format of training and testing data match
testing['TITLE']= label_encoders.fit_transform(testing['TITLE']) 
testing['DESCRIPTION']= label_encoders.fit_transform(testing['DESCRIPTION']) 
testing['BULLET_POINTS']= label_encoders.fit_transform(testing['BULLET_POINTS']) 
testing['BRAND']= label_encoders.fit_transform(testing['BRAND']) 
tests = testing.drop(columns = ['PRODUCT_ID'])

In [129]:
# check the accuracy of model
preds = knn.predict(X_test)
accuracy = metrics.accuracy_score(y_test, preds)
print(accuracy)

0.6


In [130]:
# predictions is an array, which will store the output('BROWSE_NODE_ID') of the given test.csv file
predictions = knn.predict(tests)
print(predictions)

[ 165 3821  466 ...  413  896  379]


In [131]:
# Format predictions in DataFrame: prediction_df
prediction_df = pd.DataFrame(data=predictions)

# Save prediction_df to csv file
prediction_df.to_csv('submission_0.csv')