In [10]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
df1 = pd.read_csv('dataset_v1').set_index('building_id')


train_labels = pd.read_csv('train_labels.csv').set_index('building_id')

In [12]:
df1_cats = [
    'land_surface_condition',
    'foundation_type',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status'
]

for cat in df1_cats:
    df1[cat] = df1[cat].astype('category')

In [45]:
# df1 es mi set de entrenamiento
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 802906 to 747594
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   geo_level_1_id                          260601 non-null  int64   
 1   geo_level_2_id                          260601 non-null  int64   
 2   geo_level_3_id                          260601 non-null  int64   
 3   count_floors_pre_eq                     260601 non-null  int64   
 4   age                                     260601 non-null  int64   
 5   area_percentage                         260601 non-null  int64   
 6   height_percentage                       260601 non-null  int64   
 7   land_surface_condition                  260601 non-null  category
 8   foundation_type                         260601 non-null  category
 9   roof_type                               260601 non-null  category
 10  ground_floor_type          

In [14]:
# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

from catboost import Pool, CatBoostClassifier

x, y = df1.iloc[:,:-1], df1.iloc[:, -1]

In [15]:
param_grid = {
    'learning_rate': np.linspace(0.1, 1.0, 10),
    'depth': list(range(2, 7))
}

catboost_model = CatBoostClassifier(
    iterations=100,
    random_state=2018,
    loss_function='MultiClass',
    cat_features=df1_cats
)

gs = GridSearchCV(catboost_model, param_grid, cv=5, verbose=3)

In [16]:
gs.fit(x,y)
print('Finished training!')

Finished training!


In [20]:
gs.best_params_

{'depth': 6, 'learning_rate': 0.9}

In [None]:
import pickle

filename = 'catboost_full_gs.sav'
pickle.dump(gs, open(filename, 'wb'))

In [23]:
new_catboost = CatBoostClassifier(
    depth=6,
    learning_rate=0.9,
    random_state=2018,
    loss_function='MultiClass',
    cat_features=df1_cats
)

new_catboost.fit(x, y)

0:	learn: 0.8207910	total: 462ms	remaining: 7m 41s
1:	learn: 0.7572843	total: 867ms	remaining: 7m 12s
2:	learn: 0.7359172	total: 1.25s	remaining: 6m 55s
3:	learn: 0.7275144	total: 1.63s	remaining: 6m 46s
4:	learn: 0.7209185	total: 2.01s	remaining: 6m 39s
5:	learn: 0.7114220	total: 2.42s	remaining: 6m 41s
6:	learn: 0.7065685	total: 2.8s	remaining: 6m 37s
7:	learn: 0.7008996	total: 3.19s	remaining: 6m 35s
8:	learn: 0.6971423	total: 3.53s	remaining: 6m 28s
9:	learn: 0.6953835	total: 3.95s	remaining: 6m 31s
10:	learn: 0.6943631	total: 4.33s	remaining: 6m 29s
11:	learn: 0.6917324	total: 4.71s	remaining: 6m 27s
12:	learn: 0.6896344	total: 5.11s	remaining: 6m 28s
13:	learn: 0.6876958	total: 5.51s	remaining: 6m 27s
14:	learn: 0.6840377	total: 5.9s	remaining: 6m 27s
15:	learn: 0.6821654	total: 6.26s	remaining: 6m 25s
16:	learn: 0.6803858	total: 6.63s	remaining: 6m 23s
17:	learn: 0.6786964	total: 7.03s	remaining: 6m 23s
18:	learn: 0.6771181	total: 7.42s	remaining: 6m 23s
19:	learn: 0.6757309	tot

159:	learn: 0.6015900	total: 1m 4s	remaining: 5m 40s
160:	learn: 0.6013398	total: 1m 5s	remaining: 5m 40s
161:	learn: 0.6012667	total: 1m 5s	remaining: 5m 39s
162:	learn: 0.6009808	total: 1m 6s	remaining: 5m 38s
163:	learn: 0.6006725	total: 1m 6s	remaining: 5m 38s
164:	learn: 0.6005686	total: 1m 6s	remaining: 5m 37s
165:	learn: 0.5999861	total: 1m 7s	remaining: 5m 37s
166:	learn: 0.5996911	total: 1m 7s	remaining: 5m 37s
167:	learn: 0.5995802	total: 1m 7s	remaining: 5m 36s
168:	learn: 0.5992397	total: 1m 8s	remaining: 5m 36s
169:	learn: 0.5988752	total: 1m 8s	remaining: 5m 36s
170:	learn: 0.5985996	total: 1m 9s	remaining: 5m 35s
171:	learn: 0.5982247	total: 1m 9s	remaining: 5m 35s
172:	learn: 0.5980730	total: 1m 10s	remaining: 5m 34s
173:	learn: 0.5976361	total: 1m 10s	remaining: 5m 34s
174:	learn: 0.5972560	total: 1m 10s	remaining: 5m 34s
175:	learn: 0.5970352	total: 1m 11s	remaining: 5m 33s
176:	learn: 0.5969046	total: 1m 11s	remaining: 5m 33s
177:	learn: 0.5965991	total: 1m 12s	remai

312:	learn: 0.5723546	total: 2m 6s	remaining: 4m 38s
313:	learn: 0.5722327	total: 2m 7s	remaining: 4m 38s
314:	learn: 0.5720871	total: 2m 7s	remaining: 4m 37s
315:	learn: 0.5719826	total: 2m 8s	remaining: 4m 37s
316:	learn: 0.5718948	total: 2m 8s	remaining: 4m 36s
317:	learn: 0.5717481	total: 2m 8s	remaining: 4m 36s
318:	learn: 0.5714929	total: 2m 9s	remaining: 4m 36s
319:	learn: 0.5712118	total: 2m 9s	remaining: 4m 35s
320:	learn: 0.5710634	total: 2m 10s	remaining: 4m 35s
321:	learn: 0.5708307	total: 2m 10s	remaining: 4m 34s
322:	learn: 0.5706809	total: 2m 10s	remaining: 4m 34s
323:	learn: 0.5704563	total: 2m 11s	remaining: 4m 34s
324:	learn: 0.5703112	total: 2m 11s	remaining: 4m 33s
325:	learn: 0.5700786	total: 2m 12s	remaining: 4m 33s
326:	learn: 0.5699647	total: 2m 12s	remaining: 4m 32s
327:	learn: 0.5696830	total: 2m 13s	remaining: 4m 32s
328:	learn: 0.5696195	total: 2m 13s	remaining: 4m 32s
329:	learn: 0.5694865	total: 2m 13s	remaining: 4m 31s
330:	learn: 0.5694587	total: 2m 14s	

465:	learn: 0.5502295	total: 3m 14s	remaining: 3m 42s
466:	learn: 0.5500888	total: 3m 14s	remaining: 3m 42s
467:	learn: 0.5499867	total: 3m 15s	remaining: 3m 42s
468:	learn: 0.5498541	total: 3m 15s	remaining: 3m 41s
469:	learn: 0.5497875	total: 3m 16s	remaining: 3m 41s
470:	learn: 0.5496555	total: 3m 16s	remaining: 3m 40s
471:	learn: 0.5495242	total: 3m 17s	remaining: 3m 40s
472:	learn: 0.5494165	total: 3m 17s	remaining: 3m 40s
473:	learn: 0.5493843	total: 3m 17s	remaining: 3m 39s
474:	learn: 0.5492490	total: 3m 18s	remaining: 3m 39s
475:	learn: 0.5491627	total: 3m 18s	remaining: 3m 38s
476:	learn: 0.5490868	total: 3m 19s	remaining: 3m 38s
477:	learn: 0.5489929	total: 3m 19s	remaining: 3m 38s
478:	learn: 0.5489252	total: 3m 20s	remaining: 3m 37s
479:	learn: 0.5488771	total: 3m 20s	remaining: 3m 37s
480:	learn: 0.5487309	total: 3m 20s	remaining: 3m 36s
481:	learn: 0.5486758	total: 3m 21s	remaining: 3m 36s
482:	learn: 0.5484415	total: 3m 21s	remaining: 3m 35s
483:	learn: 0.5484051	total:

618:	learn: 0.5324379	total: 4m 21s	remaining: 2m 41s
619:	learn: 0.5324211	total: 4m 22s	remaining: 2m 40s
620:	learn: 0.5323327	total: 4m 22s	remaining: 2m 40s
621:	learn: 0.5322865	total: 4m 22s	remaining: 2m 39s
622:	learn: 0.5321564	total: 4m 23s	remaining: 2m 39s
623:	learn: 0.5320024	total: 4m 23s	remaining: 2m 38s
624:	learn: 0.5318366	total: 4m 24s	remaining: 2m 38s
625:	learn: 0.5317348	total: 4m 24s	remaining: 2m 38s
626:	learn: 0.5316394	total: 4m 25s	remaining: 2m 37s
627:	learn: 0.5314475	total: 4m 25s	remaining: 2m 37s
628:	learn: 0.5313836	total: 4m 26s	remaining: 2m 36s
629:	learn: 0.5312365	total: 4m 26s	remaining: 2m 36s
630:	learn: 0.5312122	total: 4m 27s	remaining: 2m 36s
631:	learn: 0.5310574	total: 4m 27s	remaining: 2m 35s
632:	learn: 0.5309777	total: 4m 27s	remaining: 2m 35s
633:	learn: 0.5308525	total: 4m 28s	remaining: 2m 34s
634:	learn: 0.5307752	total: 4m 28s	remaining: 2m 34s
635:	learn: 0.5306632	total: 4m 29s	remaining: 2m 34s
636:	learn: 0.5305790	total:

771:	learn: 0.5184961	total: 5m 29s	remaining: 1m 37s
772:	learn: 0.5184573	total: 5m 29s	remaining: 1m 36s
773:	learn: 0.5184144	total: 5m 30s	remaining: 1m 36s
774:	learn: 0.5183156	total: 5m 30s	remaining: 1m 35s
775:	learn: 0.5181821	total: 5m 30s	remaining: 1m 35s
776:	learn: 0.5181456	total: 5m 31s	remaining: 1m 35s
777:	learn: 0.5181091	total: 5m 31s	remaining: 1m 34s
778:	learn: 0.5179819	total: 5m 32s	remaining: 1m 34s
779:	learn: 0.5179410	total: 5m 32s	remaining: 1m 33s
780:	learn: 0.5178348	total: 5m 33s	remaining: 1m 33s
781:	learn: 0.5177939	total: 5m 33s	remaining: 1m 33s
782:	learn: 0.5176710	total: 5m 34s	remaining: 1m 32s
783:	learn: 0.5176166	total: 5m 34s	remaining: 1m 32s
784:	learn: 0.5175221	total: 5m 35s	remaining: 1m 31s
785:	learn: 0.5174329	total: 5m 35s	remaining: 1m 31s
786:	learn: 0.5172449	total: 5m 35s	remaining: 1m 30s
787:	learn: 0.5171609	total: 5m 36s	remaining: 1m 30s
788:	learn: 0.5170885	total: 5m 36s	remaining: 1m 30s
789:	learn: 0.5170271	total:

926:	learn: 0.5041993	total: 6m 38s	remaining: 31.4s
927:	learn: 0.5041188	total: 6m 39s	remaining: 31s
928:	learn: 0.5040488	total: 6m 39s	remaining: 30.6s
929:	learn: 0.5039474	total: 6m 40s	remaining: 30.1s
930:	learn: 0.5038669	total: 6m 40s	remaining: 29.7s
931:	learn: 0.5037074	total: 6m 41s	remaining: 29.3s
932:	learn: 0.5036150	total: 6m 41s	remaining: 28.8s
933:	learn: 0.5035544	total: 6m 42s	remaining: 28.4s
934:	learn: 0.5034641	total: 6m 42s	remaining: 28s
935:	learn: 0.5033964	total: 6m 42s	remaining: 27.6s
936:	learn: 0.5033189	total: 6m 43s	remaining: 27.1s
937:	learn: 0.5032576	total: 6m 43s	remaining: 26.7s
938:	learn: 0.5032126	total: 6m 44s	remaining: 26.3s
939:	learn: 0.5031821	total: 6m 44s	remaining: 25.8s
940:	learn: 0.5030869	total: 6m 45s	remaining: 25.4s
941:	learn: 0.5029916	total: 6m 45s	remaining: 25s
942:	learn: 0.5029480	total: 6m 46s	remaining: 24.6s
943:	learn: 0.5028946	total: 6m 46s	remaining: 24.1s
944:	learn: 0.5028510	total: 6m 47s	remaining: 23.7s

<catboost.core.CatBoostClassifier at 0x7f11df588438>

In [24]:
test_values = pd.read_csv('test_values.csv').set_index('building_id')

In [35]:
for cat in df1_cats:
    test_values[cat] = test_values[cat].astype('category')
    
for col in test_values.columns:
    if col.startswith('has_secondary_use'):
        test_values[col] = test_values[col].astype('bool')
    if col.startswith('has_superstructure'):
        test_values[col] = test_values[col].astype('bool')
test_values.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86868 entries, 300051 to 501372
Data columns (total 38 columns):
 #   Column                                  Non-Null Count  Dtype   
---  ------                                  --------------  -----   
 0   geo_level_1_id                          86868 non-null  int64   
 1   geo_level_2_id                          86868 non-null  int64   
 2   geo_level_3_id                          86868 non-null  int64   
 3   count_floors_pre_eq                     86868 non-null  int64   
 4   age                                     86868 non-null  int64   
 5   area_percentage                         86868 non-null  int64   
 6   height_percentage                       86868 non-null  int64   
 7   land_surface_condition                  86868 non-null  category
 8   foundation_type                         86868 non-null  category
 9   roof_type                               86868 non-null  category
 10  ground_floor_type                       

In [39]:
test_dataset = Pool(
    data=test_values,
    cat_features=df1_cats
)

predictions = new_catboost.predict(test_dataset)

In [40]:
sub_format = pd.read_csv('submission_format.csv', index_col='building_id')

In [42]:
submission = pd.DataFrame(data=predictions, columns=sub_format.columns, index=sub_format.index)
submission.to_csv('submission_catboost_gs.csv')

In [43]:
submission.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,2
745817,2
421793,3
