# Item Classification Problem

In [1]:
import pandas as pd
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
import cPickle
from sklearn.svm import LinearSVC
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

In [2]:
# Read the labelled training set into a DataFrame.
TRAINING_CSV = "bmv_training_set.csv"
data = pd.read_csv(TRAINING_CSV)

In [3]:
# Derive the attribute keys of every row from "additionalAttributes".
# Each value is a ";"-separated list of pairs; the key of a pair is the text
# that precedes the first "=" or ":".
raw_pairs = data['additionalAttributes'].str.split(";")
data['attribute_list'] = [
    [re.split("[=:=]", str(pair))[0] for pair in pairs]
    for pairs in raw_pairs
]

# A key is discarded when its lower-cased form contains any of these fragments.
EXCLUDED_KEY_FRAGMENTS = (
    "{", "}", "rank", "shipping", "dimensions", "weight", "review",
    "sold", "warranty", "gender", "no.", "etc.", "misc.",
)


def _is_feature_key(code):
    """Return True when `code` holds no digit and no excluded fragment."""
    if any(ch.isdigit() for ch in code):
        return False
    lowered = str(code).lower()
    return not any(fragment in lowered for fragment in EXCLUDED_KEY_FRAGMENTS)


def _join_feature_keys(row):
    """Join the distinct, lower-cased usable keys of one row with "|"."""
    return "|".join(
        str(code).lower()
        for code in set(row['attribute_list'])
        if _is_feature_key(code)
    )


# "|" is the default separator of Series.str.get_dummies, used downstream.
data['attributes_new'] = data.apply(_join_feature_keys, axis = 1)

In [4]:
# One-hot encode the attribute keys of the three main categories
# (books, music, videos): one indicator column per distinct key.
MAIN_LABELS = ['books','music','videos']
in_main_category = data['label'].isin(MAIN_LABELS)
df1 = data.loc[in_main_category, 'attributes_new'].to_frame()
main_categories = df1['attributes_new'].str.get_dummies()

In [5]:
# For the "rest" category keep only the keys that also occur in the main
# categories (books, music, videos). This also keeps the feature count down.

# Column labels of `main_categories`, materialised once as a set so that each
# membership test is O(1) instead of rebuilding a list for every token.
main_category_keys = set(main_categories)


def get_list(row):
    """Return the row's "|"-joined keys restricted to main-category keys.

    row: a row exposing 'attributes_new', a "|"-separated string of keys.
    Returns a "|"-separated string (empty when no key survives).
    """
    return "|".join(str(i) for i in row['attributes_new'].split("|") if i in main_category_keys)


df_rest = pd.DataFrame(data.loc[data['label'].isin(['rest']),'attributes_new'])
df_rest["attributes_new"] = df_rest.apply(get_list, axis = 1)
df_rest = df_rest['attributes_new'].str.get_dummies()

In [6]:
# Build the training feature matrix and the matching label frame.
train = (
    pd.concat([main_categories, df_rest])
    .sort_index()  # both parts keep the index of `data`; sorting interleaves them back
    .fillna(0)     # a key column missing from one part means "attribute absent"
)
# Select the label by name rather than by position: a positional lookup
# silently picks a different column as soon as columns are added to `data`.
output_labels = data[['label']]

In [7]:
# Persist the feature matrix and the labels to CSV for reference
# (the row index is not written).
for frame, csv_name in ((train, "train.csv"), (output_labels, "output_labels.csv")):
    frame.to_csv(csv_name, index = False)

In [8]:
# Tune the LinearSVC regularisation strength with a cross-validated grid search.

def get_best_param(train,output_labels):
    """Grid-search C for a class-balanced LinearSVC and report the winner.

    train: feature matrix.
    output_labels: frame whose 'label' column holds the target classes.

    Candidate values of C are 0.01, 0.1 and 1.0. Prints the best estimator
    and also returns it so the caller can reuse the tuned model.
    """
    C_range = 10.0 ** np.arange(-2, 1)
    parameters = {'C':C_range, 'class_weight':['balanced']}
    cv = StratifiedKFold(n_splits=5)
    # Hand the splitter to the search; without `cv=` it is silently ignored
    # and GridSearchCV falls back to its own default splitting.
    grid = GridSearchCV(LinearSVC(), parameters, cv=cv)
    grid.fit(train, output_labels['label'])
    print("The best classifier is: ", grid.best_estimator_)
    return grid.best_estimator_

# Assigned so the cell does not additionally echo the estimator's repr.
best_classifier = get_best_param(train, output_labels)

('The best classifier is: ', LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))


In [9]:
# Fit the final classifier on the full training set, then pickle it to disk.

svc_settings = {'random_state': 0, 'class_weight': 'balanced', 'C': 1.0}
clf = LinearSVC(**svc_settings)
clf.fit(train, output_labels['label'])
with open('classifier_rf.pkl', 'wb') as model_file:
    cPickle.dump(clf, model_file)

In [10]:
# Confusion matrix on the training data, with rows and columns ordered by the
# sorted label names printed just above it.

labels = sorted(set(output_labels['label']))
y_pred = clf.predict(train)
# `labels` is passed by keyword (positional use is rejected by newer
# scikit-learn) and y_true is the 1-D label column, not a one-column frame.
cm = metrics.confusion_matrix(output_labels['label'], y_pred, labels=labels)
# Single-argument print(...) produces the same output on Python 2 and 3.
print(labels)
print(cm)

['books', 'music', 'rest', 'videos']
[[ 23023   4752    144      5]
 [     2  22996    122    204]
 [    64     58 225244     10]
 [     2   4552    347  18475]]


In [11]:
# In-sample accuracy: share of training rows whose prediction equals the label.
accuracy_score(y_true=output_labels, y_pred=y_pred)

0.9657933333333333

In [12]:
# Load the test file and derive the same attribute-key features as in training.

test_data_file = pd.read_csv("bmv_test_set.csv", header = [0])
# NOTE: this is an alias, not a copy. The two derived columns added below are
# therefore also visible through `test_data_file_backup`, and the submission
# step relies on that when it drops them again. Do not turn this into .copy()
# without adjusting that step.
test_data_file_backup = test_data_file

# Training feature names are lower-cased attribute keys; build the lookup once.
known_feature_keys = set(main_categories)

test_data_file['attribute_list'] = [
    [re.split("[=:=]", str(pair))[0] for pair in pairs]
    for pairs in test_data_file['additionalAttributes'].str.split(";")
]


def _known_keys_for_row(row):
    """Return the row's lower-cased keys that are training features, "|"-joined.

    Keys are lower-cased BEFORE the membership test: the training columns are
    lower-case, so testing the raw key would discard every key that contains
    an upper-case letter.
    """
    lowered_keys = set(str(code).lower() for code in row['attribute_list'])
    return "|".join(sorted(lowered_keys & known_feature_keys))


test_data_file['attributes_new'] = test_data_file.apply(_known_keys_for_row, axis = 1)

# Append one (still empty) column per training feature, in training column order.
test_data_file = pd.concat([test_data_file,pd.DataFrame(index=test_data_file.index, columns=list(main_categories))],
                           axis=1)

In [13]:
# Fill the indicator columns of the test set and run the classifier on them.

feature_columns = list(main_categories)

# Build every indicator in one vectorised pass. Writing into the frame from
# inside its own row-wise apply mutates the object being iterated, and
# locating the target row by scanning the whole 'id' column for each key makes
# the work grow quadratically with the number of rows.
test_dummies = (
    test_data_file['attributes_new']
    .str.get_dummies()                                # one 0/1 column per key present
    .reindex(columns=feature_columns, fill_value=0)   # same columns and order as training
)
test_data_file[feature_columns] = test_dummies

test_data_file = test_data_file.fillna(0)
test_data_file = test_data_file.drop(['id','additionalAttributes','attribute_list','attributes_new'], axis = 1)
test_pred = clf.predict(test_data_file)

In [14]:
# Attach the predicted labels to the original test rows (aligned on the row
# index), remove the helper columns and write the submission file.
predicted_labels = pd.DataFrame(test_pred).rename(columns = {0:'label'})
test_data_file_backup = (
    test_data_file_backup
    .join(predicted_labels)
    .drop(['attribute_list','attributes_new'], axis = 1)
)
test_data_file_backup.to_csv("submission.csv")