In [61]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [62]:
df1 = pd.read_csv('dataset_v1').set_index('building_id')
df2 = pd.read_csv('dataset_v2').drop(columns=['Unnamed: 0']).set_index('building_id')
df3 = pd.read_csv('dataset_v3').set_index('building_id')

train_labels = pd.read_csv('train_labels.csv').set_index('building_id')

In [63]:
selected_features = [
    'age',
    'area_percentage',
    'height_percentage',
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered'
]

df2_selected_features = [
    'land_surface_condition_C0',
    'land_surface_condition_C1',
    'foundation_type_C0',
    'foundation_type_C1',
    'foundation_type_C2',
    'ground_floor_type_C0',
    'ground_floor_type_C1',
    'ground_floor_type_C2',
    'other_floor_type_C0',
    'other_floor_type_C1',
    'other_floor_type_C2',
    'position_C0',
    'position_C1',
    'position_C2',
    'plan_configuration_C0',
    'plan_configuration_C1',
    'plan_configuration_C2',
    'plan_configuration_C3'
]
df2_subset = df2.loc[: , df2_selected_features]
train_subset = df1.join(df2_subset).loc[:, selected_features + df2_selected_features]
train_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 802906 to 747594
Data columns (total 31 columns):
 #   Column                                  Non-Null Count   Dtype
---  ------                                  --------------   -----
 0   age                                     260601 non-null  int64
 1   area_percentage                         260601 non-null  int64
 2   height_percentage                       260601 non-null  int64
 3   has_superstructure_adobe_mud            260601 non-null  bool 
 4   has_superstructure_mud_mortar_stone     260601 non-null  bool 
 5   has_superstructure_stone_flag           260601 non-null  bool 
 6   has_superstructure_cement_mortar_stone  260601 non-null  bool 
 7   has_superstructure_mud_mortar_brick     260601 non-null  bool 
 8   has_superstructure_cement_mortar_brick  260601 non-null  bool 
 9   has_superstructure_timber               260601 non-null  bool 
 10  has_superstructure_bamboo               260601 non-null  bool 


In [64]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

In [65]:
pipe_random_forest = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=2018))

pipe_random_forest

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=2018))])

In [66]:
param_grid = {
    'randomforestclassifier__n_estimators': [50, 100],
    'randomforestclassifier__min_samples_leaf': [1, 5]
}

gs = GridSearchCV(pipe_random_forest, param_grid, cv=5)

In [67]:
gs.fit(train_subset, train_labels.values.ravel())

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(random_state=2018))]),
             param_grid={'randomforestclassifier__min_samples_leaf': [1, 5],
                         'randomforestclassifier__n_estimators': [50, 100]})

In [68]:
gs.best_params_

{'randomforestclassifier__min_samples_leaf': 5,
 'randomforestclassifier__n_estimators': 50}

Vemos como es el score para este modelo con el mismo subset de datos

In [69]:
from sklearn.metrics import f1_score

in_sample_predictions = gs.predict(train_subset)
f1_score(train_labels, in_sample_predictions, average='micro')

0.6257804075962871

In [70]:
test_values = pd.read_csv('encoded_test_values.csv', index_col='building_id')
test_subset = test_values[selected_features + df2_selected_features]

for feature in (selected_features + df2_selected_features):
    if (feature.startswith('has_superstructure')):
        test_subset[feature] = test_subset[feature].astype('bool')
test_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86868 entries, 300051 to 501372
Data columns (total 31 columns):
 #   Column                                  Non-Null Count  Dtype
---  ------                                  --------------  -----
 0   age                                     86868 non-null  int64
 1   area_percentage                         86868 non-null  int64
 2   height_percentage                       86868 non-null  int64
 3   has_superstructure_adobe_mud            86868 non-null  bool 
 4   has_superstructure_mud_mortar_stone     86868 non-null  bool 
 5   has_superstructure_stone_flag           86868 non-null  bool 
 6   has_superstructure_cement_mortar_stone  86868 non-null  bool 
 7   has_superstructure_mud_mortar_brick     86868 non-null  bool 
 8   has_superstructure_cement_mortar_brick  86868 non-null  bool 
 9   has_superstructure_timber               86868 non-null  bool 
 10  has_superstructure_bamboo               86868 non-null  bool 
 11  has_super

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [71]:
predictions = gs.predict(test_subset)

In [72]:
sub_format = pd.read_csv('submission_format.csv', index_col='building_id')

In [73]:
submission = pd.DataFrame(data=predictions, columns=sub_format.columns, index=sub_format.index)

In [74]:
submission.to_csv('submission_2.csv')

Pruebo incluyendo geo_level_1_id

In [75]:
new_selected_features = selected_features + ['geo_level_1_id']

train_subset = df1.join(df2_subset).loc[:, new_selected_features + df2_selected_features]
train_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 802906 to 747594
Data columns (total 32 columns):
 #   Column                                  Non-Null Count   Dtype
---  ------                                  --------------   -----
 0   age                                     260601 non-null  int64
 1   area_percentage                         260601 non-null  int64
 2   height_percentage                       260601 non-null  int64
 3   has_superstructure_adobe_mud            260601 non-null  bool 
 4   has_superstructure_mud_mortar_stone     260601 non-null  bool 
 5   has_superstructure_stone_flag           260601 non-null  bool 
 6   has_superstructure_cement_mortar_stone  260601 non-null  bool 
 7   has_superstructure_mud_mortar_brick     260601 non-null  bool 
 8   has_superstructure_cement_mortar_brick  260601 non-null  bool 
 9   has_superstructure_timber               260601 non-null  bool 
 10  has_superstructure_bamboo               260601 non-null  bool 


In [76]:
gs.fit(train_subset, train_labels.values.ravel())

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(random_state=2018))]),
             param_grid={'randomforestclassifier__min_samples_leaf': [1, 5],
                         'randomforestclassifier__n_estimators': [50, 100]})

In [77]:
gs.best_params_

{'randomforestclassifier__min_samples_leaf': 5,
 'randomforestclassifier__n_estimators': 100}

In [78]:
in_sample_preds = gs.predict(train_subset)
in_sample_score = f1_score(train_labels, in_sample_preds, average='micro')
in_sample_score

0.7113978841217032

In [79]:
test_subset = test_values[new_selected_features + df2_selected_features]

for feature in (selected_features + df2_selected_features):
    if (feature.startswith('has_superstructure')):
        test_subset[feature] = test_subset[feature].astype('bool')
test_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86868 entries, 300051 to 501372
Data columns (total 32 columns):
 #   Column                                  Non-Null Count  Dtype
---  ------                                  --------------  -----
 0   age                                     86868 non-null  int64
 1   area_percentage                         86868 non-null  int64
 2   height_percentage                       86868 non-null  int64
 3   has_superstructure_adobe_mud            86868 non-null  bool 
 4   has_superstructure_mud_mortar_stone     86868 non-null  bool 
 5   has_superstructure_stone_flag           86868 non-null  bool 
 6   has_superstructure_cement_mortar_stone  86868 non-null  bool 
 7   has_superstructure_mud_mortar_brick     86868 non-null  bool 
 8   has_superstructure_cement_mortar_brick  86868 non-null  bool 
 9   has_superstructure_timber               86868 non-null  bool 
 10  has_superstructure_bamboo               86868 non-null  bool 
 11  has_super

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [80]:
prediction = gs.predict(test_subset)

In [81]:
submission = pd.DataFrame(data=prediction, columns=sub_format.columns, index=sub_format.index)
submission.to_csv('submission_4.csv')