In [1]:
import pandas as pd
import numpy as np 
from datetime import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split

## Initial data cleaning

In [2]:
pet_df = pd.read_csv('Data/eda_train.csv')
pet_df.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,X1,X2,log_days_btw_issue_list,listing_year,issue_year,listing_month,issue_month,length(cm),height(cm),size,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,13,9,4.29,2016,2016,9,7,80.0,7.78,622.4,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,13,9,7.53,2018,2013,12,11,72.0,14.19,1021.68,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,-1.0,Brown,15,4,6.62,2016,2014,10,9,15.0,40.9,613.5,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0,1,6.63,2019,2016,1,12,62.0,17.82,1104.84,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,18,4,3.95,2017,2017,11,9,50.0,11.06,553.0,0.0,1


In [3]:
pet_df_test = pd.read_csv('Data/eda_test.csv')
pet_df_test.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,X1,X2,log_days_btw_issue_list,listing_year,issue_year,listing_month,issue_month,length(cm),height(cm),size
0,ANSL_75005,2005-08-17 00:00:00,2017-09-07 15:35:00,0.0,Black,0,7,8.39,2017,2005,9,8,87.0,42.73,3717.51
1,ANSL_76663,2018-11-15 00:00:00,2019-05-08 17:24:00,1.0,Orange Tabby,0,1,5.16,2019,2018,5,11,6.0,6.71,40.26
2,ANSL_58259,2012-10-11 00:00:00,2018-04-02 16:51:00,1.0,Black,0,7,7.6,2018,2012,4,10,24.0,41.21,989.04
3,ANSL_67171,2015-02-13 00:00:00,2018-04-06 07:25:00,1.0,Black,7,1,7.05,2018,2015,4,2,29.0,8.46,245.34
4,ANSL_72871,2017-01-18 00:00:00,2018-04-26 13:42:00,1.0,Brown,0,7,6.14,2018,2017,4,1,71.0,30.92,2195.32


In [4]:
pet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18802 entries, 0 to 18801
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   pet_id                   18802 non-null  object 
 1   issue_date               18802 non-null  object 
 2   listing_date             18802 non-null  object 
 3   condition                18802 non-null  float64
 4   color_type               18802 non-null  object 
 5   X1                       18802 non-null  int64  
 6   X2                       18802 non-null  int64  
 7   log_days_btw_issue_list  18800 non-null  float64
 8   listing_year             18802 non-null  int64  
 9   issue_year               18802 non-null  int64  
 10  listing_month            18802 non-null  int64  
 11  issue_month              18802 non-null  int64  
 12  length(cm)               18802 non-null  float64
 13  height(cm)               18802 non-null  float64
 14  size                  

In [5]:
pet_df['issue_date'] = pd.to_datetime(pet_df['issue_date']) 
pet_df['listing_date'] = pd.to_datetime(pet_df['listing_date']) 

pet_df_test['issue_date'] = pd.to_datetime(pet_df_test['issue_date']) 
pet_df_test['listing_date'] = pd.to_datetime(pet_df_test['listing_date']) 

In [6]:
pet_df['breed_category'] = pet_df['breed_category'].astype('int')

In [7]:
pet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18802 entries, 0 to 18801
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   pet_id                   18802 non-null  object        
 1   issue_date               18802 non-null  datetime64[ns]
 2   listing_date             18802 non-null  datetime64[ns]
 3   condition                18802 non-null  float64       
 4   color_type               18802 non-null  object        
 5   X1                       18802 non-null  int64         
 6   X2                       18802 non-null  int64         
 7   log_days_btw_issue_list  18800 non-null  float64       
 8   listing_year             18802 non-null  int64         
 9   issue_year               18802 non-null  int64         
 10  listing_month            18802 non-null  int64         
 11  issue_month              18802 non-null  int64         
 12  length(cm)               18802 n

In [8]:
pet_df.describe()

Unnamed: 0,condition,X1,X2,log_days_btw_issue_list,listing_year,issue_year,listing_month,issue_month,length(cm),height(cm),size,breed_category,pet_category
count,18802.0,18802.0,18802.0,18800.0,18802.0,18802.0,18802.0,18802.0,18802.0,18802.0,18802.0,18802.0,18802.0
mean,0.735613,5.370812,4.574194,5.986159,2017.427242,2015.107542,6.767259,6.824593,50.248271,27.452333,1377.593204,0.600574,1.727954
std,0.896445,6.571212,3.519026,1.302531,0.945428,3.033127,3.599103,3.300231,28.870241,13.021942,1093.952869,0.630008,0.725338
min,-1.0,0.0,0.0,2.94,2015.0,2001.0,1.0,1.0,0.0,5.0,0.0,0.0,1.0
25%,0.0,0.0,1.0,4.78,2017.0,2014.0,4.0,4.0,25.0,16.18,494.3675,0.0,1.0
50%,1.0,0.0,4.0,5.97,2017.0,2016.0,7.0,7.0,50.0,27.33,1093.855,1.0,2.0
75%,1.0,13.0,9.0,7.02,2018.0,2017.0,10.0,10.0,76.0,38.89,2047.47,1.0,2.0
max,2.0,19.0,9.0,8.8,2019.0,2019.0,12.0,12.0,100.0,50.0,4978.0,2.0,4.0


## Data Preprocessing

In [9]:
pet_df_test.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,X1,X2,log_days_btw_issue_list,listing_year,issue_year,listing_month,issue_month,length(cm),height(cm),size
0,ANSL_75005,2005-08-17,2017-09-07 15:35:00,0.0,Black,0,7,8.39,2017,2005,9,8,87.0,42.73,3717.51
1,ANSL_76663,2018-11-15,2019-05-08 17:24:00,1.0,Orange Tabby,0,1,5.16,2019,2018,5,11,6.0,6.71,40.26
2,ANSL_58259,2012-10-11,2018-04-02 16:51:00,1.0,Black,0,7,7.6,2018,2012,4,10,24.0,41.21,989.04
3,ANSL_67171,2015-02-13,2018-04-06 07:25:00,1.0,Black,7,1,7.05,2018,2015,4,2,29.0,8.46,245.34
4,ANSL_72871,2017-01-18,2018-04-26 13:42:00,1.0,Brown,0,7,6.14,2018,2017,4,1,71.0,30.92,2195.32


In [10]:
pre_pro = pet_df.copy()
pre_pro2 = pet_df_test.copy()

from sklearn.preprocessing import LabelEncoder

class_le = LabelEncoder()

class_le.fit(pre_pro.color_type.values)

pre_pro['color_type_enc'] = class_le.transform(pre_pro.color_type.values)
pre_pro2['color_type_enc'] = class_le.transform(pre_pro2.color_type.values)

In [11]:
pre_pro.isnull().sum()

pet_id                     0
issue_date                 0
listing_date               0
condition                  0
color_type                 0
X1                         0
X2                         0
log_days_btw_issue_list    2
listing_year               0
issue_year                 0
listing_month              0
issue_month                0
length(cm)                 0
height(cm)                 0
size                       0
breed_category             0
pet_category               0
color_type_enc             0
dtype: int64

In [12]:
pre_pro.log_days_btw_issue_list.fillna(10,inplace = True)

In [13]:
pre_pro2.isnull().sum()

pet_id                     0
issue_date                 0
listing_date               0
condition                  0
color_type                 0
X1                         0
X2                         0
log_days_btw_issue_list    0
listing_year               0
issue_year                 0
listing_month              0
issue_month                0
length(cm)                 0
height(cm)                 0
size                       0
color_type_enc             0
dtype: int64

## Predicting for pet_category

In [14]:
X = pre_pro.drop(['breed_category','pet_category','pet_id','issue_date','listing_date','color_type','issue_year','listing_year','size'], axis = 1)
y = pre_pro['pet_category']

In [15]:
from sklearn import tree, metrics
from sklearn.metrics import f1_score

gini_model = tree.DecisionTreeClassifier(criterion ='gini')

# Do the fit, predict, and series transformations as before. 
gini_model.fit(X,y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [16]:
print(dict(zip(X.columns, gini_model.feature_importances_)))

{'condition': 0.050552877305772785, 'X1': 0.16859574189159923, 'X2': 0.022217079342190523, 'log_days_btw_issue_list': 0.21411353628115218, 'listing_month': 0.029556038804939996, 'issue_month': 0.047010903171490204, 'length(cm)': 0.057824755552503965, 'height(cm)': 0.0696940813822119, 'color_type_enc': 0.3404349862681392}


In [17]:
#X = pre_pro[['condition','color_type_enc','length(cm)','height(cm)','X1','X2','log_days_btw_issue_list']]
y = pre_pro['pet_category']
X2 = pre_pro2[list(X.columns)]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.25,random_state = 46)

In [18]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

model = XGBClassifier()

params = {
        #'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 2],
        #'n_estimators': [100, 250],
        'learning_rate': [0.1, 0.5, 1],
        'max_depth': [3, 4, 6]
        }

grid = GridSearchCV(estimator=model, param_grid=params, cv=3, verbose=3 )

# fit the model with the training data
grid.fit(X_train,y_train) 

acc = grid.score(X_test, y_test)
print("Tuned Logistic Regression C: {}".format(grid.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(acc))

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] gamma=0.5, learning_rate=0.1, max_depth=3 .......................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  gamma=0.5, learning_rate=0.1, max_depth=3, score=0.888, total=   1.6s
[CV] gamma=0.5, learning_rate=0.1, max_depth=3 .......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s


[CV]  gamma=0.5, learning_rate=0.1, max_depth=3, score=0.899, total=   1.5s
[CV] gamma=0.5, learning_rate=0.1, max_depth=3 .......................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.0s remaining:    0.0s


[CV]  gamma=0.5, learning_rate=0.1, max_depth=3, score=0.889, total=   1.5s
[CV] gamma=0.5, learning_rate=0.1, max_depth=4 .......................
[CV]  gamma=0.5, learning_rate=0.1, max_depth=4, score=0.893, total=   1.8s
[CV] gamma=0.5, learning_rate=0.1, max_depth=4 .......................
[CV]  gamma=0.5, learning_rate=0.1, max_depth=4, score=0.907, total=   2.0s
[CV] gamma=0.5, learning_rate=0.1, max_depth=4 .......................
[CV]  gamma=0.5, learning_rate=0.1, max_depth=4, score=0.900, total=   2.0s
[CV] gamma=0.5, learning_rate=0.1, max_depth=6 .......................
[CV]  gamma=0.5, learning_rate=0.1, max_depth=6, score=0.899, total=   2.9s
[CV] gamma=0.5, learning_rate=0.1, max_depth=6 .......................
[CV]  gamma=0.5, learning_rate=0.1, max_depth=6, score=0.912, total=   3.0s
[CV] gamma=0.5, learning_rate=0.1, max_depth=6 .......................
[CV]  gamma=0.5, learning_rate=0.1, max_depth=6, score=0.905, total=   2.8s
[CV] gamma=0.5, learning_rate=0.5, max_dep

[CV]  gamma=2, learning_rate=0.1, max_depth=4, score=0.898, total=   2.2s
[CV] gamma=2, learning_rate=0.1, max_depth=6 .........................
[CV]  gamma=2, learning_rate=0.1, max_depth=6, score=0.899, total=   3.1s
[CV] gamma=2, learning_rate=0.1, max_depth=6 .........................
[CV]  gamma=2, learning_rate=0.1, max_depth=6, score=0.912, total=   3.0s
[CV] gamma=2, learning_rate=0.1, max_depth=6 .........................
[CV]  gamma=2, learning_rate=0.1, max_depth=6, score=0.902, total=   3.0s
[CV] gamma=2, learning_rate=0.5, max_depth=3 .........................
[CV]  gamma=2, learning_rate=0.5, max_depth=3, score=0.900, total=   1.6s
[CV] gamma=2, learning_rate=0.5, max_depth=3 .........................
[CV]  gamma=2, learning_rate=0.5, max_depth=3, score=0.909, total=   1.5s
[CV] gamma=2, learning_rate=0.5, max_depth=3 .........................
[CV]  gamma=2, learning_rate=0.5, max_depth=3, score=0.889, total=   1.4s
[CV] gamma=2, learning_rate=0.5, max_depth=4 ...........

[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:  2.9min finished


Tuned Logistic Regression C: {'gamma': 0.5, 'learning_rate': 0.1, 'max_depth': 6}
Tuned Logistic Regression Accuracy: 0.9070410550946607


In [19]:
# predict the target on the train dataset
y_pred = grid.predict(X_train)
print('\nTarget on train data',y_pred) 

# Accuray Score on train dataset
print(f1_score(y_train, y_pred, average='weighted'))

# predict the target on the test dataset
y_pred = grid.predict(X_test)
print('\nTarget on test data',y_pred) 

# Accuracy Score on test dataset
print(f1_score(y_test, y_pred, average='weighted'))


Target on train data [2 2 2 ... 2 2 2]
0.9257963820338995

Target on test data [2 1 2 ... 2 2 1]
0.9061283377962203


In [20]:
# predict the target on the test dataset
y_pred = grid.predict(X2)
print('\nTarget on test data',y_pred) 

pre_pro2['pet_category'] = y_pred


Target on test data [2 1 2 ... 2 4 2]


In [21]:
pre_pro2.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,X1,X2,log_days_btw_issue_list,listing_year,issue_year,listing_month,issue_month,length(cm),height(cm),size,color_type_enc,pet_category
0,ANSL_75005,2005-08-17,2017-09-07 15:35:00,0.0,Black,0,7,8.39,2017,2005,9,8,87.0,42.73,3717.51,2,2
1,ANSL_76663,2018-11-15,2019-05-08 17:24:00,1.0,Orange Tabby,0,1,5.16,2019,2018,5,11,6.0,6.71,40.26,38,1
2,ANSL_58259,2012-10-11,2018-04-02 16:51:00,1.0,Black,0,7,7.6,2018,2012,4,10,24.0,41.21,989.04,2,2
3,ANSL_67171,2015-02-13,2018-04-06 07:25:00,1.0,Black,7,1,7.05,2018,2015,4,2,29.0,8.46,245.34,2,2
4,ANSL_72871,2017-01-18,2018-04-26 13:42:00,1.0,Brown,0,7,6.14,2018,2017,4,1,71.0,30.92,2195.32,15,2


## Predicting breed_category

In [22]:
X = pre_pro.drop(['breed_category','pet_id','issue_date','listing_date','color_type','pet_category','size'], axis = 1)
y = pre_pro['breed_category']

In [23]:
from sklearn import tree, metrics
from sklearn.metrics import f1_score

gini_model = tree.DecisionTreeClassifier(criterion ='gini')

# Do the fit, predict, and series transformations as before. 
gini_model.fit(X,y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [24]:
print(dict(zip(X.columns, gini_model.feature_importances_)))

{'condition': 0.8013834415266059, 'X1': 0.007304759344292762, 'X2': 0.004491546382830319, 'log_days_btw_issue_list': 0.0286835418488503, 'listing_year': 0.008656687167504102, 'issue_year': 0.010271356741895808, 'listing_month': 0.013403663923487666, 'issue_month': 0.016188938434408266, 'length(cm)': 0.03856707408842737, 'height(cm)': 0.04395068151090591, 'color_type_enc': 0.027098309030791495}


In [25]:
X = pre_pro[['condition','color_type_enc','length(cm)','height(cm)','X1','X2','log_days_btw_issue_list']]
y = pre_pro['breed_category']
X2 = pre_pro2[list(X.columns)]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.25,random_state = 46)

In [26]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

model = XGBClassifier()

params = {
        'min_child_weight': [1, 5, 10],
        #'gamma': [0.5, 1, 2],
        #'n_estimators': [100, 250],
        'learning_rate': [0.1, 0.5, 1],
        'max_depth': [3, 4, 6]
        }

grid = GridSearchCV(estimator=model, param_grid=params, cv=3, verbose=3 )

# fit the model with the training data
grid.fit(X_train,y_train) 

acc = grid.score(X_test, y_test)
print("Tuned Logistic Regression C: {}".format(grid.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(acc))

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] learning_rate=0.1, max_depth=3, min_child_weight=1 ..............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  learning_rate=0.1, max_depth=3, min_child_weight=1, score=0.902, total=   1.1s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=1 ..............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV]  learning_rate=0.1, max_depth=3, min_child_weight=1, score=0.894, total=   1.2s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=1 ..............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.2s remaining:    0.0s


[CV]  learning_rate=0.1, max_depth=3, min_child_weight=1, score=0.904, total=   1.0s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=5 ..............
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=5, score=0.903, total=   2.3s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=5 ..............
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=5, score=0.894, total=   1.7s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=5 ..............
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=5, score=0.904, total=   1.5s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=10 .............
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=10, score=0.903, total=   1.3s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=10 .............
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=10, score=0.897, total=   1.0s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=10 .............
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=10, sco

[CV]  learning_rate=1, max_depth=3, min_child_weight=1, score=0.897, total=   0.7s
[CV] learning_rate=1, max_depth=3, min_child_weight=1 ................
[CV]  learning_rate=1, max_depth=3, min_child_weight=1, score=0.903, total=   0.8s
[CV] learning_rate=1, max_depth=3, min_child_weight=5 ................
[CV]  learning_rate=1, max_depth=3, min_child_weight=5, score=0.897, total=   0.7s
[CV] learning_rate=1, max_depth=3, min_child_weight=5 ................
[CV]  learning_rate=1, max_depth=3, min_child_weight=5, score=0.893, total=   0.7s
[CV] learning_rate=1, max_depth=3, min_child_weight=5 ................
[CV]  learning_rate=1, max_depth=3, min_child_weight=5, score=0.896, total=   0.7s
[CV] learning_rate=1, max_depth=3, min_child_weight=10 ...............
[CV]  learning_rate=1, max_depth=3, min_child_weight=10, score=0.902, total=   0.7s
[CV] learning_rate=1, max_depth=3, min_child_weight=10 ...............
[CV]  learning_rate=1, max_depth=3, min_child_weight=10, score=0.895, total

[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:  3.0min finished


Tuned Logistic Regression C: {'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 1}
Tuned Logistic Regression Accuracy: 0.9021484790470112


In [27]:
# predict the target on the train dataset
y_pred = grid.predict(X_train)
print('\nTarget on train data',y_pred) 

# Accuray Score on train dataset
print(f1_score(y_train, y_pred, average='weighted'))

# predict the target on the test dataset
y_pred = grid.predict(X_test)
print('\nTarget on test data',y_pred) 

# Accuracy Score on test dataset
print(f1_score(y_test, y_pred, average='weighted'))


Target on train data [1 1 0 ... 0 0 0]
0.9248058286241904

Target on test data [0 0 0 ... 0 1 0]
0.9019942022862308


In [28]:
y_pred = grid.predict(X2)
print('\nTarget on test data',y_pred) 

pre_pro2['breed_category'] = y_pred


Target on test data [1 0 0 ... 1 2 1]


In [29]:
ans = pre_pro2[['pet_id','breed_category','pet_category']]
ans.to_csv('data/ans.csv',index = False)