# Adopt a Buddy
# Hackerearth competition

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datetime import date

# Overview of Training Dataset

In [2]:
train_dataset = pd.read_csv('./Dataset/train.csv')
train_dataset

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.80,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,0.15,40.90,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.50,11.06,18,4,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
18829,ANSL_51738,2017-01-26 00:00:00,2018-03-09 15:35:00,2.0,Tricolor,0.44,27.36,0,1,0.0,2
18830,ANSL_59900,2016-06-18 00:00:00,2017-07-09 08:37:00,,Brown,0.73,14.25,15,4,2.0,4
18831,ANSL_53210,2010-07-21 00:00:00,2018-08-22 14:27:00,0.0,Calico Point,0.99,28.13,13,9,1.0,1
18832,ANSL_63468,2017-05-12 00:00:00,2018-02-08 14:05:00,0.0,Tan,0.55,44.82,13,9,1.0,2


In [4]:
pet_category = {'0':0,'1':0,'2':0,'3':0,'4':0}
breed_category = {'0':0,'1':0,'2':0}

for f in range(0,len(train_dataset)):
    if (train_dataset['breed_category'][f] == 0.0):
        breed_category['0'] +=1
    if (train_dataset['breed_category'][f] == 1.0):
        breed_category['1'] +=1
    if (train_dataset['breed_category'][f] == 2.0):
        breed_category['2'] +=1
    
    if (train_dataset['pet_category'][f] == 0):
        pet_category['0'] +=1
    if (train_dataset['pet_category'][f] == 1):
        pet_category['1'] +=1
    if (train_dataset['pet_category'][f] == 2):
        pet_category['2'] +=1
    if (train_dataset['pet_category'][f] == 3):
        pet_category['3'] +=1
    if (train_dataset['pet_category'][f] == 4):
        pet_category['4'] +=1
        

print('breed category distribution \t', ' 0 : ',breed_category['0'],'\t1 : ', breed_category['1'], '\t2 : ',breed_category['2'])

print('pet category distribution \t', ' 0 : ',pet_category['0'],'\t1 : ', pet_category['1'], '\t2 : ',pet_category['2'],
      '\t3 :',pet_category['3'],'\t4 : ',pet_category['4'])


breed category distribution 	  0 :  9000 	1 :  8357 	2 :  1477
pet category distribution 	  0 :  88 	1 :  7184 	2 :  10621 	3 : 0 	4 :  941


# Separate training data into Input and Output features 

In [3]:
# model to predict pet category
# Extact input features as a np array
in_features = ['condition', 'color_type', 'length(m)', 'height(cm)','X1','X2']
x = np.array(train_dataset[in_features])
print("Input feature dataset\n", x)

out_feature1 = ['pet_category']
y1 = np.array(train_dataset[out_feature1])
print("Pet category\n",y1)

out_feature2 = ['breed_category']
y2 = np.array(train_dataset[out_feature2])
print("Breed Category\n", y2)

Input feature dataset
 [[2.0 'Brown Tabby' 0.8 7.78 13 9]
 [1.0 'White' 0.72 14.19 13 9]
 [nan 'Brown' 0.15 40.9 15 4]
 ...
 [0.0 'Calico Point' 0.99 28.13 13 9]
 [0.0 'Tan' 0.55 44.82 13 9]
 [0.0 'Brown' 0.86 37.4 0 1]]
Pet category
 [[1]
 [2]
 [4]
 ...
 [1]
 [2]
 [2]]
Breed Category
 [[0.]
 [0.]
 [2.]
 ...
 [1.]
 [1.]
 [1.]]


# Data pre-processing



In [4]:
# Replace the color_type with integer value,
# 
color_type = []

for f in range(0, len(train_dataset)):
    if not( train_dataset['color_type'][f] in color_type):
        color_type.append(train_dataset['color_type'][f])

In [49]:
#print(len(color_type))
#color_type

In [5]:
for i in range(0, len(color_type)):
    ar1 = np.where(x[:, 1] == color_type[i])
    for j in range(0, len(ar1[0])):
        x[ar1[0][j], 1] = i
print(x)

[[2.0 0 0.8 7.78 13 9]
 [1.0 1 0.72 14.19 13 9]
 [nan 2 0.15 40.9 15 4]
 ...
 [0.0 25 0.99 28.13 13 9]
 [0.0 16 0.55 44.82 13 9]
 [0.0 2 0.86 37.4 0 1]]


# Training using XGBClassifer 

In [14]:
# Train xgb classifier for pet category
model1 = xgb.XGBClassifier()
model1.fit(x, y1)

model2 = xgb.XGBClassifier()
model2.fit(x, y2)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

# Read test dataset file

In [6]:
test_data = pd.read_csv("./Dataset/test.csv")
test_data

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2
0,ANSL_75005,2005-08-17 00:00:00,2017-09-07 15:35:00,0.0,Black,0.87,42.73,0,7
1,ANSL_76663,2018-11-15 00:00:00,2019-05-08 17:24:00,1.0,Orange Tabby,0.06,6.71,0,1
2,ANSL_58259,2012-10-11 00:00:00,2018-04-02 16:51:00,1.0,Black,0.24,41.21,0,7
3,ANSL_67171,2015-02-13 00:00:00,2018-04-06 07:25:00,1.0,Black,0.29,8.46,7,1
4,ANSL_72871,2017-01-18 00:00:00,2018-04-26 13:42:00,1.0,Brown,0.71,30.92,0,7
...,...,...,...,...,...,...,...,...,...
8067,ANSL_66809,2016-02-10 00:00:00,2017-03-10 14:56:00,2.0,Brown,0.82,36.08,13,9
8068,ANSL_59041,2015-12-07 00:00:00,2018-02-12 00:00:00,0.0,Tan,0.49,27.54,13,9
8069,ANSL_60034,2015-12-08 00:00:00,2017-01-04 17:19:00,0.0,Black,0.98,37.19,0,7
8070,ANSL_58066,2016-06-28 00:00:00,2017-07-20 18:19:00,,Black,0.79,23.83,0,2


In [8]:
# data pre-processing
x_test = np.array(test_data[in_features])
x_test

array([[0.0, 'Black', 0.87, 42.73, 0, 7],
       [1.0, 'Orange Tabby', 0.06, 6.71, 0, 1],
       [1.0, 'Black', 0.24, 41.21, 0, 7],
       ...,
       [0.0, 'Black', 0.98, 37.19, 0, 7],
       [nan, 'Black', 0.79, 23.83, 0, 2],
       [0.0, 'Black', 0.64, 24.51, 0, 1]], dtype=object)

In [9]:
for i in range(0, len(color_type)):
    ar1 = np.where(x_test[:, 1] == color_type[i])
    for j in range(0, len(ar1[0])):
        x_test[ar1[0][j], 1] = i

print(x_test)

[[0.0 3 0.87 42.73 0 7]
 [1.0 14 0.06 6.71 0 1]
 [1.0 3 0.24 41.21 0 7]
 ...
 [0.0 3 0.98 37.19 0 7]
 [nan 3 0.79 23.83 0 2]
 [0.0 3 0.64 24.51 0 1]]


# Prediction using trained model 

In [15]:
y1_pred = model1.predict(x_test)
y2_pred = model2.predict(x_test)

In [16]:
print(y1_pred)
print(y2_pred)

[2 1 2 ... 2 1 2]
[1. 0. 0. ... 1. 2. 1.]


# Output file 

In [14]:
PetId = np.array(test_data['pet_id'])
op = {'pet_id': PetId, 'breed_category': y2_pred , 'pet_category':y1_pred} 
df = pd.DataFrame(op)
df.to_csv('output.csv', index = False)

In [15]:
my_op = pd.read_csv("./output.csv")
my_op

Unnamed: 0,pet_id,breed_category,pet_category
0,ANSL_75005,1.0,2
1,ANSL_76663,0.0,1
2,ANSL_58259,0.0,2
3,ANSL_67171,0.0,1
4,ANSL_72871,0.0,2
...,...,...,...
8067,ANSL_66809,0.0,2
8068,ANSL_59041,1.0,2
8069,ANSL_60034,1.0,2
8070,ANSL_58066,2.0,1


# Part - 2

# Use breed category data for prediction of pet category and vice versa

In [10]:
new_if = ['condition', 'color_type', 'length(m)', 'height(cm)','X1','X2', 'breed_category']
x_new = np.array(train_dataset[new_if])

color_type = []
for f in range(0, len(train_dataset)):
    if not( train_dataset['color_type'][f] in color_type):
        color_type.append(train_dataset['color_type'][f])
        
for i in range(0, len(color_type)):
    ar1 = np.where(x_new[:, 1] == color_type[i])
    for j in range(0, len(ar1[0])):
        x_new[ar1[0][j], 1] = i

In [11]:
x_new

array([[2.0, 0, 0.8, ..., 13, 9, 0.0],
       [1.0, 1, 0.72, ..., 13, 9, 0.0],
       [nan, 2, 0.15, ..., 15, 4, 2.0],
       ...,
       [0.0, 25, 0.99, ..., 13, 9, 1.0],
       [0.0, 16, 0.55, ..., 13, 9, 1.0],
       [0.0, 2, 0.86, ..., 0, 1, 1.0]], dtype=object)

In [12]:
model3 = xgb.XGBClassifier()
model3.fit(x_new, y1)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [69]:

op1 = {'condition':x_test[:, 0], 'color_type':x_test[:, 1], 'length(m)':x_test[:, 2], 'height(cm)':x_test[:, 3],'X1':x_test[:, 4],'X2':x_test[:, 5], 'breed_category': y2_pred} 
df1
x3_test_new = np.array(df1[new_if])
x3_test_new

array([[ 0.  ,  3.  ,  0.87, ...,  0.  ,  7.  ,  1.  ],
       [ 1.  , 14.  ,  0.06, ...,  0.  ,  1.  ,  0.  ],
       [ 1.  ,  3.  ,  0.24, ...,  0.  ,  7.  ,  0.  ],
       ...,
       [ 0.  ,  3.  ,  0.98, ...,  0.  ,  7.  ,  1.  ],
       [  nan,  3.  ,  0.79, ...,  0.  ,  2.  ,  2.  ],
       [ 0.  ,  3.  ,  0.64, ...,  0.  ,  1.  ,  1.  ]])

In [70]:
np.shape(x3_test_new)

(8072, 7)

In [71]:
y3_pred = model3.predict(x3_test_new)

In [72]:
y3_pred

array([2, 1, 2, ..., 2, 1, 2])

In [75]:
ct = 0
if(y1_pred.all() == y3_pred.all()):
    ct = ct +1 

In [76]:
ct

1