In [9]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#import sys
#!{sys.executable} -m pip install xgboost

In [10]:
#reading train and test csv files into train and test
train_data = pd.read_csv("train.csv")
test_data  = pd.read_csv("test.csv")

In [11]:
#Copying pet_id to test_id
test_id = test_data["pet_id"]
train_shape = train_data.shape[0]

In [12]:
#Checking shape of train and test data
train_data.shape,test_data.shape

((18834, 11), (8072, 9))

In [13]:
print(train_data.head(3))
print("***********************************************************************************")
print(test_data.head(3))

       pet_id           issue_date  ... breed_category  pet_category
0  ANSL_69903  2016-07-10 00:00:00  ...            0.0             1
1  ANSL_66892  2013-11-21 00:00:00  ...            0.0             2
2  ANSL_69750  2014-09-28 00:00:00  ...            2.0             4

[3 rows x 11 columns]
***********************************************************************************
       pet_id           issue_date         listing_date  ...  height(cm) X1  X2
0  ANSL_75005  2005-08-17 00:00:00  2017-09-07 15:35:00  ...       42.73  0   7
1  ANSL_76663  2018-11-15 00:00:00  2019-05-08 17:24:00  ...        6.71  0   1
2  ANSL_58259  2012-10-11 00:00:00  2018-04-02 16:51:00  ...       41.21  0   7

[3 rows x 9 columns]


In [14]:
#Type of train data
print(train_data.dtypes)
print("****************************")
print(test_data.dtypes)

pet_id             object
issue_date         object
listing_date       object
condition         float64
color_type         object
length(m)         float64
height(cm)        float64
X1                  int64
X2                  int64
breed_category    float64
pet_category        int64
dtype: object
****************************
pet_id           object
issue_date       object
listing_date     object
condition       float64
color_type       object
length(m)       float64
height(cm)      float64
X1                int64
X2                int64
dtype: object


In [15]:
#checking null values in train and test data
print(train_data.isnull().sum())
print("***********************")
print(test_data.isnull().sum())

pet_id               0
issue_date           0
listing_date         0
condition         1477
color_type           0
length(m)            0
height(cm)           0
X1                   0
X2                   0
breed_category       0
pet_category         0
dtype: int64
***********************
pet_id            0
issue_date        0
listing_date      0
condition       619
color_type        0
length(m)         0
height(cm)        0
X1                0
X2                0
dtype: int64


In [16]:
#Checking target data values
print(train_data['breed_category'].value_counts())
print(train_data['pet_category'].value_counts())

0.0    9000
1.0    8357
2.0    1477
Name: breed_category, dtype: int64
2    10621
1     7184
4      941
0       88
Name: pet_category, dtype: int64


In [17]:
col_train = train_data['color_type'].value_counts().to_dict()
col_test  = test_data['color_type'].value_counts().to_dict()
print('Values count of color type for train : \n',col_train)
print('Values count of color type for test : \n',col_test)

Values count of color type for train : 
 {'Black': 4620, 'White': 2453, 'Brown': 1791, 'Brown Tabby': 1687, 'Tan': 1349, 'Blue': 852, 'Orange Tabby': 791, 'Red': 526, 'Brown Brindle': 496, 'Tricolor': 469, 'Blue Tabby': 386, 'Tortie': 366, 'Calico': 343, 'Gray': 307, 'Chocolate': 259, 'Torbie': 242, 'Cream Tabby': 191, 'Sable': 167, 'Cream': 162, 'Fawn': 159, 'Yellow': 143, 'Buff': 125, 'Lynx Point': 117, 'Blue Merle': 104, 'Seal Point': 78, 'Black Brindle': 66, 'Gray Tabby': 65, 'Black Tabby': 55, 'Flame Point': 52, 'Orange': 39, 'Brown Merle': 39, 'Black Smoke': 32, 'Gold': 31, 'Tortie Point': 26, 'Silver': 24, 'Red Tick': 23, 'Blue Tick': 21, 'Blue Point': 20, 'Lilac Point': 19, 'Silver Tabby': 18, 'Yellow Brindle': 15, 'Apricot': 13, 'Red Merle': 13, 'Calico Point': 12, 'Blue Tiger': 10, 'Blue Cream': 10, 'Chocolate Point': 9, 'Pink': 8, 'Green': 8, 'Blue Smoke': 6, 'Agouti': 4, 'Brown Tiger': 4, 'Silver Lynx Point': 4, 'Liver': 3, 'Black Tiger': 1, 'Liver Tick': 1}
Values count of

In [18]:
#Checking whether the color_types values are same in test data and train data 
train_data["color_type"].isin(test_data["color_type"]).value_counts()

True     18829
False        5
Name: color_type, dtype: int64

In [19]:
#Checking whether the color_types values are same in test data and train data 
color = list(set(train_data['color_type'])-set(test_data['color_type']))
print(color)

['Black Tiger', 'Brown Tiger']


In [20]:
#Checking if there is any relation between Breed Category and Pet Category to condition as we have multiple null values in condition
con_breed = train_data['breed_category'][(np.isnan(train_data['condition']))]
con_pet   = train_data['pet_category'][(np.isnan(train_data['condition']))]
print(con_breed.value_counts())
print(con_pet.value_counts())

2.0    1477
Name: breed_category, dtype: int64
4    783
1    583
2     60
0     51
Name: pet_category, dtype: int64


In [21]:
#Filling missing values in test data
train_data['condition'].fillna(-1,inplace=True)
train_data['condition'].value_counts()

 1.0    6819
 0.0    6281
 2.0    4257
-1.0    1477
Name: condition, dtype: int64

In [22]:
#Filling missing values in test data
test_data['condition'].fillna(-1,inplace=True)
test_data['condition'].value_counts()

 1.0    2928
 0.0    2685
 2.0    1840
-1.0     619
Name: condition, dtype: int64

In [23]:
train_data['breed_category'] = train_data['breed_category'].astype(np.int64)

In [24]:
train_data['issue_date'] = pd.to_datetime(train_data['issue_date'])
train_data['listing_date'] = pd.to_datetime(train_data['listing_date'])
train_data['duration'] = (train_data['listing_date'] - train_data['issue_date']).dt.days

In [25]:
test_data['issue_date'] = pd.to_datetime(test_data['issue_date'])
test_data['listing_date'] = pd.to_datetime(test_data['listing_date'])
test_data['duration'] = (test_data['listing_date'] - test_data['issue_date']).dt.days

In [26]:
#Dropping the columns in data
train =  train_data.drop(['pet_id','issue_date','listing_date','breed_category','pet_category'],axis=1)
test =  test_data.drop(['pet_id','issue_date','listing_date'],axis=1)

In [27]:
y_breed = train_data['breed_category']
y_pet = train_data['pet_category']

In [28]:
train.shape,test.shape

((18834, 7), (8072, 7))

In [29]:
#Using dummies values approach
train = pd.get_dummies(train)
test  = pd.get_dummies(test)

In [30]:
train.shape,test.shape

((18834, 62), (8072, 60))

In [31]:
cols_drop = list(set(train.columns)-set(test.columns))
#We have seen the same values in color

In [32]:
train = train.drop(cols_drop,axis=1)

In [33]:
train.shape,test.shape

((18834, 60), (8072, 60))

In [34]:
data = pd.concat((train, test)).reset_index(drop=True)

In [35]:
data.shape

(26906, 60)

In [36]:
from sklearn.preprocessing import StandardScaler
names = data.columns
scaler= StandardScaler()
# Fit your data on the scaler object
scaler = scaler.fit_transform(data)
data   = pd.DataFrame(scaler, columns=names)

In [37]:
train_1 = data[:train_shape]
test_1  = data[train_shape:]

In [38]:
#Splitting the data into train and validation
from sklearn.model_selection import train_test_split
X_train_pet, X_val_pet, y_pet_train, y_val_pet = train_test_split(train_1, y_pet,test_size=0.3, random_state=10)

In [39]:
#Gradient Boosting Classifier for breed Category
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
gbk_1 = GradientBoostingClassifier()
gbk_1.fit(X_train_pet,y_pet_train)
feature_new = gbk_1.predict(train_1)
output_1 = gbk_1.predict(test_1)
valid_pet= gbk_1.predict(X_val_pet)

In [40]:
train_2 = pd.DataFrame(train_1, columns=names)
test_2  = pd.DataFrame(test_1,  columns=names)

In [41]:
train_2['output_1'] = feature_new
test_2['output_1'] = output_1

In [42]:
X_train_breed, X_val_breed, y_breed_train, y_val_breed = train_test_split(train_2, y_breed,test_size=0.3, random_state=10)

In [43]:
gbk_2 = GradientBoostingClassifier()
gbk_2.fit(X_train_breed,y_breed_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [44]:
output_2 = gbk_2.predict(test_1)
#vld2 is validation 2 i.e we'll check score with the predicted result of validation data of model 2
valid_breed = gbk_2.predict(X_val_breed)

In [45]:
from sklearn.metrics  import f1_score
s1=f1_score(y_val_breed,valid_breed,average='weighted')
s2=f1_score(y_val_pet,valid_pet,average='weighted')
accuracy=100*((s1+s2)/2)
print(accuracy)

90.15793865067332


In [46]:
submission = pd.DataFrame({"pet_id":test_id,"breed_category":output_2,"pet_category":output_1
})
submission.to_csv('Sample Submission GBK v1.csv', index=False)