In [103]:
import sys

from fontTools.misc.cython import returns

sys.path.append('../src')
sys.path.append('../src/modeling') 

In [104]:
import pandas as pd
import numpy as np


In [None]:
def import_data(data_path):
    df = pd.read_csv(data_path)
    return df

In [105]:
raw_data_values = import_data('../data/train_values.csv')
train_labels = import_data('../data/train_labels.csv')
raw_data_values.dtypes

building_id                                int64
geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_c

In [106]:
categorical_columns = raw_data_values.select_dtypes(include='object').columns
numerical_columns = raw_data_values.select_dtypes(include='number').columns
#print(categorical_columns)
#print(numerical_columns)

**Missing values ('0') in 'age' and 'count_families'**

age:
0      26041

count_families:
0     20862

In [107]:
columns_to_clean = ['age', 'count_families']


In [108]:
y = train_labels['damage_grade']

#the building_id column is a unique and random identifier, therefore has to be removed from the features
X_raw = raw_data_values.drop('building_id', axis=1)
#X_raw = raw_data_values.drop(categorical_columns, axis=1)

X_raw.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
1,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,0
2,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
3,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
4,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,0


In [114]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


def create_numerical_transformer():
    return Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=0, strategy='mean')) 
    ])


def create_categorical_transformer():
    return Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

def create_preprocessor(numerical_transformer, categorical_transformer):
    return ColumnTransformer(transformers=[
        ('numerical', numerical_transformer, columns_to_clean),
        ('categorical', categorical_transformer, categorical_columns)
    ])


rf = RandomForestClassifier(criterion='entropy', n_estimators=200, max_depth=4, n_jobs=-1)

rf_pipeline = Pipeline(steps=[('preprocessor', create_preprocessor(create_numerical_transformer(), create_categorical_transformer())),
                              ('randomforestclassifier', rf)])

X_train, X_val, y_train, y_val = train_test_split(X_raw, y, test_size=0.2, random_state=42)

rf_pipeline.fit(X_train, y_train)

rf_predict = rf_pipeline.predict(X_val)

score = f1_score(y_val, rf_predict, average="micro")
print(f"F1-Score: {score}")

F1-Score: 0.5738186143780818


F1-Score: 0.5654150918056062

F1-Score: 0.5738953588764606

F1-Score: 0.5739721033748393

In [112]:
# Fitting using all train_values
rf_pipeline.fit(X_raw, y)

# Prediction for test_values
test_data = pd.read_csv('../data/test_values.csv')
rf_predict = rf_pipeline.predict(test_data)

In [113]:
def create_output(prediction: np.array, output_file_number: str):
    output = pd.DataFrame()
    output['building_id'] = test_data['building_id']
    output['damage_grade'] = prediction
    output.to_csv(f'../data/output_{output_file_number}.csv', index=False)
    
create_output(rf_predict, output_file_number='01')