In [1]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the three CSV tables used by this notebook, each indexed by its
# building_id column.
df4 = pd.read_csv('dataset_v4.csv', index_col='building_id')
df1 = pd.read_csv('dataset_v1', index_col='building_id')
train_labels = pd.read_csv('train_labels.csv', index_col='building_id')

In [3]:
# Categorical feature columns of df1. The same list is reused further down as
# `cat_features` for the CatBoost pools and to cast the test set, so each
# column must be listed exactly once ('foundation_type' used to appear twice).
df1_cats = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]

# Cast every listed column in a single pass. Re-running the cell is harmless:
# columns that are already categorical keep their dtype.
df1 = df1.astype({cat: 'category' for cat in df1_cats})

In [4]:
# df1 is my training set
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 802906 to 747594
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   geo_level_1_id                          260601 non-null  int64   
 1   geo_level_2_id                          260601 non-null  int64   
 2   geo_level_3_id                          260601 non-null  int64   
 3   count_floors_pre_eq                     260601 non-null  int64   
 4   age                                     260601 non-null  int64   
 5   area_percentage                         260601 non-null  int64   
 6   height_percentage                       260601 non-null  int64   
 7   land_surface_condition                  260601 non-null  category
 8   foundation_type                         260601 non-null  category
 9   roof_type                               260601 non-null  category
 10  ground_floor_type          

In [None]:
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostClassifier

In [None]:
# Positional split: the last column of df1 is taken as the target and every
# column before it as a feature.
y = df1.iloc[:, -1]
x = df1.iloc[:, :-1]

Primero pruebo entrenando con un subconjunto (80 %) del dataset de entrenamiento y evaluando sobre el 20 % restante.

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2048,
)

In [None]:
# Preview the first rows of the training features.
x_train.head()

In [None]:
# Quick baseline: shallow trees (depth 2) with a learning rate of 1.
catboost_model = CatBoostClassifier(
    loss_function='MultiClass',
    depth=2,
    learning_rate=1,
)

# Wrap both splits in Pools so CatBoost is told which columns are categorical.
train_dataset = Pool(x_train, label=y_train, cat_features=df1_cats)
eval_dataset = Pool(x_test, label=y_test, cat_features=df1_cats)

# Fit on the training split only, then predict the held-out split.
catboost_model.fit(train_dataset)
preds = catboost_model.predict(eval_dataset)

In [None]:
from sklearn.metrics import f1_score

# Micro-averaged F1 of the predictions against the held-out labels.
f1_score(y_test, preds, average='micro')

Ahora pruebo con el dataset de entrenamiento completo

In [None]:
# NOTE(review): test_values is reassigned from 'test_values.csv' in a later
# cell before anything visible in this notebook reads it, so this load looks
# redundant -- confirm and remove.
test_values = pd.read_csv('encoded_test_values.csv', index_col='building_id')

In [None]:
# Model for the submission: deeper trees and a smaller learning rate than the
# hold-out experiment.
catboost_model = CatBoostClassifier(
    loss_function='MultiClass',
    depth=5,
    learning_rate=0.3,
)

# Train on every labelled row.
train_dataset = Pool(x, label=y, cat_features=df1_cats)
catboost_model.fit(train_dataset, verbose=False)

# Micro-F1 on the same rows the model was fitted on: a sanity check, not an
# estimate of out-of-sample performance.
preds = catboost_model.predict(train_dataset)
f1_score(y, preds, average='micro')

In [None]:
# Read the test features and cast the categorical columns the same way as
# was done for df1.
test_values = (
    pd.read_csv('test_values.csv')
    .set_index('building_id')
    .astype({cat: 'category' for cat in df1_cats})
)
test_values.info()

In [None]:
# Unlabelled pool over the test features, scored with the current
# catboost_model.
test_dataset = Pool(test_values, cat_features=df1_cats)
predictions = catboost_model.predict(test_dataset)

In [None]:
# Submission template; used below for its column names and building_id index.
sub_format = pd.read_csv('submission_format.csv', index_col='building_id')

In [None]:
# Attach each prediction to the building it was computed for (the rows of
# test_values), then put the rows in the order of the submission template.
# Labelling the predictions with the template's index directly would silently
# mislabel every row if the two files were ever ordered differently; `.loc`
# raises KeyError if a template id has no prediction.
submission = pd.DataFrame(
    data=predictions,
    columns=sub_format.columns,
    index=test_values.index,
).loc[sub_format.index]
submission.to_csv('submission_catboost.csv')