In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# sklearn :: models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# sklearn :: evaluation metrics
from sklearn.metrics import cohen_kappa_score

sns.set_style('whitegrid')

# Problem definition

Predict when a pet will be adopted

# Load the data

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_state = pd.read_csv('state_labels.csv')
df_breed = pd.read_csv('breed_labels.csv')
df_color = pd.read_csv('color_labels.csv')
print(df_train.shape, df_test.shape)

(10000, 24) (4993, 23)


# Feature Engineering

In [3]:
df_train.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,1,â¥â¥â¥ Lily â¥â¥â¥,36,307,0,2,2,7,0,2,...,1,1,0,41326,337914b09c2fa5460e195197e994ef98,0,Adorable 3 year old Lily looking for a forever...,3f8824a3b,1.0,4
1,2,Cookie,3,266,0,1,6,7,0,2,...,1,1,0,41327,4bb1ebb92158078ad54a6bb23c10dffc,0,i rescue this stary kitten from market near my...,9238eb7fc,1.0,2
2,2,Favour Speedy Abundance And Courage,7,250,252,1,1,2,0,2,...,1,4,0,41327,99ba8ce53b4d8515e417e7921563d923,0,The mother was a Burmese cross and had since p...,f0a1f2b90,2.0,4
3,1,,3,307,0,1,2,0,0,3,...,1,1,0,41327,3f3ef74c486beba3bc87f6dbaee772bf,0,This puppy is: 1. Male 2. 3 months old 3. Brow...,7d028bdea,4.0,2
4,2,Abandoned Kitty,1,266,0,1,1,6,7,1,...,1,1,0,41401,844f03ab8054007d4be6686f3a9702b9,0,Mother cat gave birth to a litter of 3 and too...,8377bfe97,0.0,2


In [4]:
df_breed.head()

Unnamed: 0,BreedID,Type,BreedName
0,1,1,Affenpinscher
1,2,1,Afghan Hound
2,3,1,Airedale Terrier
3,4,1,Akbash
4,5,1,Akita


In [5]:
df_state.head()

Unnamed: 0,StateID,StateName
0,41336,Johor
1,41325,Kedah
2,41367,Kelantan
3,41401,Kuala Lumpur
4,41415,Labuan


In [6]:
df_color.head()

Unnamed: 0,ColorID,ColorName
0,1,Black
1,2,Brown
2,3,Golden
3,4,Yellow
4,5,Cream


# Clean the data 

In [7]:
df_train['Type'] = df_train['Type'].replace(to_replace = [1,2], value=['Dog', 'Cat'])
df_train['Gender'] = df_train['Gender'].replace(to_replace = [1,2,3], value=['Male', 'Female','Mixed'])
df_train['MaturitySize'] = df_train['MaturitySize'].replace(to_replace = [0,1,2,3,4], value=['Not Specified','Small', 'Medium','Large','Extra Large'])
df_train['FurLength'] = df_train['FurLength'].replace(to_replace = [0,1,2,3], value=['Not Specified','Short', 'Medium','Long'])
df_train['Vaccinated'] = df_train['Vaccinated'].replace(to_replace = [1,2,3], value=['Yes','No','Not Sure'])
df_train['Dewormed'] = df_train['Dewormed'].replace(to_replace = [1,2,3], value=['Yes','No','Not Sure'])
df_train['Sterilized'] = df_train['Sterilized'].replace(to_replace = [1,2,3], value=['Yes','No','Not Sure'])
df_train['Health'] = df_train['Health'].replace(to_replace = [0,1,2,3], value=['Not Specified','Healthy','Minor Injury','Serious Injury'])
df_train.rename(columns={'Type': 'Pet_Type'})

Unnamed: 0,Pet_Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,Dog,â¥â¥â¥ Lily â¥â¥â¥,36,307,0,Female,2,7,0,Medium,...,Healthy,1,0,41326,337914b09c2fa5460e195197e994ef98,0,Adorable 3 year old Lily looking for a forever...,3f8824a3b,1.0,4
1,Cat,Cookie,3,266,0,Male,6,7,0,Medium,...,Healthy,1,0,41327,4bb1ebb92158078ad54a6bb23c10dffc,0,i rescue this stary kitten from market near my...,9238eb7fc,1.0,2
2,Cat,Favour Speedy Abundance And Courage,7,250,252,Male,1,2,0,Medium,...,Healthy,4,0,41327,99ba8ce53b4d8515e417e7921563d923,0,The mother was a Burmese cross and had since p...,f0a1f2b90,2.0,4
3,Dog,,3,307,0,Male,2,0,0,Large,...,Healthy,1,0,41327,3f3ef74c486beba3bc87f6dbaee772bf,0,This puppy is: 1. Male 2. 3 months old 3. Brow...,7d028bdea,4.0,2
4,Cat,Abandoned Kitty,1,266,0,Male,1,6,7,Small,...,Healthy,1,0,41401,844f03ab8054007d4be6686f3a9702b9,0,Mother cat gave birth to a litter of 3 and too...,8377bfe97,0.0,2
5,Dog,Duke,3,218,0,Male,3,5,0,Medium,...,Healthy,1,0,41326,aa66486163b6cbc25ea62a34b11c9b91,0,Duke has black fur,965b31ba7,2.0,1
6,Dog,Lila,2,307,0,Female,1,7,0,Small,...,Healthy,1,0,41326,d21f689eab9b3faa1b738ecc836b4b36,0,Lila is a pleasant little girl who we think is...,3760c73b1,1.0,4
7,Dog,Doggie2_Selangor Area,8,307,0,Female,6,0,0,Medium,...,Healthy,1,0,41336,8f955b588a9e571d8e267cd73cdd8a45,0,"Remember my friend – ADOPT, DON’T BUY! Keep in...",f41a7de83,2.0,4
8,Cat,,1,243,245,Female,1,2,7,Small,...,Minor Injury,1,0,41326,2587c9957372fc186d3b95cfd12cf322,0,urgent for adoption as I have no experience in...,7b660c6af,4.0,3
9,Dog,Brother,6,307,0,Male,2,7,0,Small,...,Healthy,1,0,41327,b84a2dd96249074fc4b276e55f608d21,0,"It's very playful and active dog, Like to play...",f94c2a347,2.0,4


# Merging Data Sets

In [8]:
df_merge = df_train.copy()
df_merge = df_merge.merge(df_color, how='outer',
            left_on=['Color1'], right_on =['ColorID']) 
df_merge = df_merge.drop('ColorID', 1)
df_merge = df_merge.rename(columns={'ColorName': 'ColorName1'})

In [9]:
df_merge2 = df_merge.merge(df_color, how='outer',
            left_on=['Color2'], right_on =['ColorID']) 
df_merge2 = df_merge2.drop('ColorID', 1)
df_merge2 = df_merge2.rename(columns={'ColorName': 'ColorName2'})

In [10]:
df_merge3 = df_merge2.merge(df_color, how='outer',
            left_on=['Color3'], right_on =['ColorID']) 
df_merge3 = df_merge3.drop('ColorID', 1)
df_merge3 = df_merge3.rename(columns={'ColorName': 'ColorName3'})

In [11]:
df_merge4 = df_merge3.merge(df_breed, how='left',
            left_on=['Breed1'], right_on =['BreedID']) 
df_merge4 = df_merge4.drop('BreedID',1)
df_merge4 = df_merge4.rename(columns={'BreedName': 'Breed1'})

In [12]:
df_merge5 = df_merge4.merge(df_breed, how='left',
            left_on=['Breed2'], right_on =['BreedID']) 
df_merge5 = df_merge5.drop('BreedID',1)
df_merge5 = df_merge5.rename(columns={'BreedName': 'Breed2'})
df_merge5.head()

Unnamed: 0,Type_x,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,PetID,PhotoAmt,AdoptionSpeed,ColorName1,ColorName2,ColorName3,Type_y,Breed1.1,Type,Breed2.1
0,Dog,â¥â¥â¥ Lily â¥â¥â¥,36.0,307.0,0.0,Female,2.0,7.0,0.0,Medium,...,3f8824a3b,1.0,4.0,Brown,White,,1.0,Mixed Breed,,
1,Dog,Brother,6.0,307.0,0.0,Male,2.0,7.0,0.0,Small,...,f94c2a347,2.0,4.0,Brown,White,,1.0,Mixed Breed,,
2,Dog,Amber,48.0,128.0,0.0,Female,2.0,7.0,0.0,Small,...,bf99a973b,0.0,4.0,Brown,White,,1.0,Jack Russell Terrier,,
3,Dog,Aerin,4.0,18.0,307.0,Female,2.0,7.0,0.0,Medium,...,4c674cc1f,4.0,2.0,Brown,White,,1.0,Basenji,1.0,Mixed Breed
4,Dog,,1.0,307.0,0.0,Mixed,2.0,7.0,0.0,Small,...,631c813af,4.0,1.0,Brown,White,,1.0,Mixed Breed,,


In [13]:
df = df_merge5

In [14]:
# Check for missing values
df.isnull().sum(axis = 0)

Type_x              3
Name              845
Age                 3
Breed1              3
Breed2              3
Gender              3
Color1              3
Color2              3
Color3              3
MaturitySize        3
FurLength           3
Vaccinated          3
Dewormed            3
Sterilized          3
Health              3
Quantity            3
Fee                 3
State               3
RescuerID           3
VideoAmt            3
Description        11
PetID               3
PhotoAmt            3
AdoptionSpeed       3
ColorName1          3
ColorName2       2962
ColorName3       7081
Type_y              5
Breed1              5
Type             7215
Breed2           7215
dtype: int64

In [15]:
# apply dummies on the training set
for col in ['Gender', 'Color1', 'Color2', 'Color3', 'MaturitySize','FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health']:
    df_dummies = pd.get_dummies(df_train[col], prefix=col)
    df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
    df_train = pd.concat([df_train, df_dummies], axis=1)
    del df_train[col]

df_train.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Quantity,Fee,State,RescuerID,VideoAmt,...,Vaccinated_Vaccinated_Yes,Dewormed_Dewormed_No,Dewormed_Dewormed_Not Sure,Dewormed_Dewormed_Yes,Sterilized_Sterilized_No,Sterilized_Sterilized_Not Sure,Sterilized_Sterilized_Yes,Health_Health_Healthy,Health_Health_Minor Injury,Health_Health_Serious Injury
0,Dog,â¥â¥â¥ Lily â¥â¥â¥,36,307,0,1,0,41326,337914b09c2fa5460e195197e994ef98,0,...,1,1,0,0,0,0,1,1,0,0
1,Cat,Cookie,3,266,0,1,0,41327,4bb1ebb92158078ad54a6bb23c10dffc,0,...,0,0,0,1,1,0,0,1,0,0
2,Cat,Favour Speedy Abundance And Courage,7,250,252,4,0,41327,99ba8ce53b4d8515e417e7921563d923,0,...,1,0,0,1,1,0,0,1,0,0
3,Dog,,3,307,0,1,0,41327,3f3ef74c486beba3bc87f6dbaee772bf,0,...,0,1,0,0,1,0,0,1,0,0
4,Cat,Abandoned Kitty,1,266,0,1,0,41401,844f03ab8054007d4be6686f3a9702b9,0,...,0,1,0,0,1,0,0,1,0,0


In [23]:
df_train.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Quantity', 'Fee', 'State',
       'RescuerID', 'VideoAmt', 'Description', 'PetID', 'PhotoAmt',
       'AdoptionSpeed', 'Gender_Gender_Female', 'Gender_Gender_Male',
       'Gender_Gender_Mixed', 'Color1_Color1_1', 'Color1_Color1_2',
       'Color1_Color1_3', 'Color1_Color1_4', 'Color1_Color1_5',
       'Color1_Color1_6', 'Color1_Color1_7', 'Color2_Color2_0',
       'Color2_Color2_2', 'Color2_Color2_3', 'Color2_Color2_4',
       'Color2_Color2_5', 'Color2_Color2_6', 'Color2_Color2_7',
       'Color3_Color3_0', 'Color3_Color3_3', 'Color3_Color3_4',
       'Color3_Color3_5', 'Color3_Color3_6', 'Color3_Color3_7',
       'MaturitySize_MaturitySize_Extra Large',
       'MaturitySize_MaturitySize_Large', 'MaturitySize_MaturitySize_Medium',
       'MaturitySize_MaturitySize_Small', 'FurLength_FurLength_Long',
       'FurLength_FurLength_Medium', 'FurLength_FurLength_Short',
       'Vaccinated_Vaccinated_No', 'Vaccinated_Vaccinated_Not Sure',
 

In [35]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 56 columns):
Type                                     10000 non-null object
Name                                     9158 non-null object
Age                                      10000 non-null int64
Breed1                                   10000 non-null int64
Breed2                                   10000 non-null int64
Quantity                                 10000 non-null int64
Fee                                      10000 non-null int64
State                                    10000 non-null int64
RescuerID                                10000 non-null object
VideoAmt                                 10000 non-null int64
Description                              9992 non-null object
PetID                                    10000 non-null object
PhotoAmt                                 10000 non-null float64
AdoptionSpeed                            10000 non-null int64
Gender_Gender_Femal

In [24]:
# select the columns
X_columns = ['Age', 'Fee', 'FurLength_FurLength_Long',
       'FurLength_FurLength_Medium', 'FurLength_FurLength_Short',
       'Vaccinated_Vaccinated_No', 'Vaccinated_Vaccinated_Not Sure',
       'Vaccinated_Vaccinated_Yes', 'Dewormed_Dewormed_No',
       'Dewormed_Dewormed_Not Sure', 'Dewormed_Dewormed_Yes',
       'Sterilized_Sterilized_No', 'Sterilized_Sterilized_Not Sure',
       'Sterilized_Sterilized_Yes', 'Health_Health_Healthy',
       'Health_Health_Minor Injury', 'Health_Health_Serious Injury']
y_column = ['AdoptionSpeed']

# Model Training

In [25]:
# split the data using sklearn

threshold = 0.8
X = df_train[X_columns]
y = df_train[y_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (8000, 17)
y_train (8000, 1)
X_test (2000, 17)
y_test (2000, 1)


In [26]:
# from sklearn.model_selection import cross_val_score
# List = list(range(1,50))
# neighbors = filter(lambda x: x % 2 != 0, List)
# cv_scores = []
# for k in neighbors:
#     knn = KNeighborsClassifier(n_neighbors=k)
#     scores = cross_val_score(knn, X_train, y_train.values.ravel(), cv=10, scoring='accuracy')
#     cv_scores.append(scores.mean())

models = [
    ('Naive Bayes', GaussianNB()),
    ('RandomForestClassifier10', RandomForestClassifier(n_estimators=10)),
    ('RandomForestClassifier100', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('KNeighborsClassifier9', KNeighborsClassifier(n_neighbors=9)),
    ('DecisionTreeClassifier', DecisionTreeClassifier())
]
results = []
for m in models:
    print('MODEL', m[0])
    model = m[1]
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)
    results.append([m[0], precision, recall])
    
    # if there is a feature importance, print top 5
    importance = []
    if hasattr(model, 'feature_importances_'):
        print('Feature Importance')
        importance = []
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.feature_importances_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
    elif hasattr(model, 'coef_'):
        print('Feature Importance')
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.coef_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
        
    print('')

# sort the results and print as a table
df_results = pd.DataFrame(results)
df_results.columns = ['model', 'precision', 'recall']
df_results = df_results.sort_values(by='precision', ascending=False)
df_results

MODEL Naive Bayes
[[  3  37   0   0  13]
 [ 24 319   0   4  60]
 [ 32 377   0   6 101]
 [ 21 288   0   9 110]
 [ 43 319   0  10 224]]
Precision 0.24690685862374387
Recall 0.2775

MODEL RandomForestClassifier10
[[  2   9  18   9  15]
 [  6 109 154  57  81]
 [  7 113 200  91 105]
 [  1  84 150  86 107]
 [  4  93 178  77 244]]
Precision 0.3199678449640726
Recall 0.3205
Feature Importance
                             0         1
0                          Age  0.524872
1                          Fee  0.235814
11    Sterilized_Sterilized_No  0.036264
3   FurLength_FurLength_Medium  0.021586
4    FurLength_FurLength_Short  0.020637
5     Vaccinated_Vaccinated_No  0.019028
2     FurLength_FurLength_Long  0.016476
10       Dewormed_Dewormed_Yes  0.016438
7    Vaccinated_Vaccinated_Yes  0.015866
14       Health_Health_Healthy  0.014958

MODEL RandomForestClassifier100


  'precision', 'predicted', average, warn_for)


[[  2  13  19   8  11]
 [  4 110 166  45  82]
 [  5 105 209  86 111]
 [  1  84 154  80 109]
 [  6  98 173  78 241]]
Precision 0.319608276854066
Recall 0.321
Feature Importance
                             0         1
0                          Age  0.517207
1                          Fee  0.248577
11    Sterilized_Sterilized_No  0.027801
4    FurLength_FurLength_Short  0.021336
3   FurLength_FurLength_Medium  0.021126
13   Sterilized_Sterilized_Yes  0.017479
2     FurLength_FurLength_Long  0.017361
5     Vaccinated_Vaccinated_No  0.016667
10       Dewormed_Dewormed_Yes  0.016650
7    Vaccinated_Vaccinated_Yes  0.015845

MODEL KNeighborsClassifier9
[[  2  13  15   8  15]
 [  2 148 121  56  80]
 [  1 140 165 101 109]
 [  0 103 126  87 112]
 [  1 112 139 107 237]]
Precision 0.3219885744701415
Recall 0.3195

MODEL DecisionTreeClassifier
[[  3  14  21   3  12]
 [  6 123 169  46  63]
 [  5 113 223  82  93]
 [  4  90 168  77  89]
 [ 10 112 186  78 210]]
Precision 0.32484789940298026
Recall 0.

Unnamed: 0,model,precision,recall
4,DecisionTreeClassifier,0.324848,0.318
3,KNeighborsClassifier9,0.321989,0.3195
1,RandomForestClassifier10,0.319968,0.3205
2,RandomForestClassifier100,0.319608,0.321
0,Naive Bayes,0.246907,0.2775


# Model Evaluation

In [27]:
kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
print('kappa', round(kappa, 4))
print(confusion_matrix(y_test, y_pred))

kappa 0.1656
[[  3  14  21   3  12]
 [  6 123 169  46  63]
 [  5 113 223  82  93]
 [  4  90 168  77  89]
 [ 10 112 186  78 210]]


Using Cross Validation

In [28]:
k = 10
results = []
kf = KFold(n_splits=k)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    model.fit(X_train, y_train.ravel())
    y_pred = model.predict(X_test)
    kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
    results.append(round(kappa, 4))

print('Kappa for each fold:', results)
print('AVG(kappa)', round(np.mean(results), 4))
print('STD(kappa)', round(np.std(results), 4))

Kappa for each fold: [0.1957, 0.1588, 0.2006, 0.1387, 0.225, 0.189, 0.1831, 0.1369, 0.1893, 0.1594]
AVG(kappa) 0.1776
STD(kappa) 0.0269


# Tuning the Thresholds


In [29]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)
for i in range(1,10):
    print(i)
    y_pred = model.predict_proba(X_test)[:,1]
    y_pred = [1 if x > i/10.0 else 0 for x in y_pred]
    precision = precision_score(y_test, y_pred,average='weighted')
    recall = recall_score(y_test, y_pred,average='weighted')
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)

  


1
[[  6  10   0   0   0]
 [ 40 153   0   0   0]
 [ 69 210   0   0   0]
 [ 72 153   0   0   0]
 [110 177   0   0   0]]
Precision 0.04232749974855238
Recall 0.159
2
[[ 10   6   0   0   0]
 [ 72 121   0   0   0]
 [122 157   0   0   0]
 [119 106   0   0   0]
 [185 102   0   0   0]]
Precision 0.047780407784392805
Recall 0.131
3
[[ 13   3   0   0   0]
 [147  46   0   0   0]
 [235  44   0   0   0]
 [185  40   0   0   0]
 [247  40   0   0   0]]
Precision 0.051569430562448004
Recall 0.059
4
[[ 14   2   0   0   0]
 [170  23   0   0   0]
 [265  14   0   0   0]
 [212  13   0   0   0]
 [272  15   0   0   0]]
Precision 0.06649381708819248
Recall 0.037
5
[[ 14   2   0   0   0]
 [178  15   0   0   0]
 [268  11   0   0   0]
 [217   8   0   0   0]
 [280   7   0   0   0]]
Precision 0.06755964618113774
Recall 0.029
6
[[ 14   2   0   0   0]
 [181  12   0   0   0]
 [274   5   0   0   0]
 [220   5   0   0   0]
 [283   4   0   0   0]]
Precision 0.08294473838918283
Recall 0.026
7


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[[ 15   1   0   0   0]
 [186   7   0   0   0]
 [277   2   0   0   0]
 [223   2   0   0   0]
 [284   3   0   0   0]]
Precision 0.0903103214890017
Recall 0.022
8
[[ 15   1   0   0   0]
 [188   5   0   0   0]
 [278   1   0   0   0]
 [223   2   0   0   0]
 [286   1   0   0   0]]
Precision 0.09674242424242425
Recall 0.02
9
[[ 16   0   0   0   0]
 [191   2   0   0   0]
 [278   1   0   0   0]
 [224   1   0   0   0]
 [286   1   0   0   0]]
Precision 0.07745728643216081
Recall 0.018


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# Prepare submission

In [None]:
df_prediction = df_test[X_columns]
df_test['AdoptionSpeed'] = model.predict(df_prediction)
df_test[['PetID', 'AdoptionSpeed']]

In [None]:
df_test[['PetID', 'AdoptionSpeed']].to_csv('submission_knn.csv', index=False)