In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import copy

from utils.tools import *

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.compose import make_column_transformer
from sklearn.base import BaseEstimator, ClassifierMixin

from category_encoders.target_encoder import TargetEncoder
from imblearn.over_sampling import KMeansSMOTE

from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

from statsmodels.miscmodels.ordinal_model import OrderedModel
#from catboost import CatBoostClassifier

# 01 Problem Description

We're trying to predict the ordinal variable damage_grade, which represents the level of damage to a building that was hit by the earthquake. There are 3 damage grades: 1, 2, 3.

#### Description of the dataset

The dataset mainly consists of information on the buildings' structure and their legal ownership. Each row in the dataset represents a specific building in the region that was hit by the Gorkha earthquake.

* building_id: unique and random identifier of the building;
* geo_level_1_id, geo_level_2_id, geo_level_3_id: geographic region in which the building exists, from largest (level 1) to most specific sub-region (level 3). Possible values: level 1 $\in$ [0, 30], level 2 $\in$ [0, 1427], level 3 $\in$ [0, 12567];
* count_floors_pre_eq: number of floors in the building before earthquake;
* age: age of the building in years;
* area_percentage: (int) normalized area of the building footprint;
* height_percentage: (int) normalized height of the building footprint;
* land_surface_condition: (categorical) surface condition of the land where the building was built. Possible values are: 'n', 'o', 't';
* foundation_type: (categorical) type of foundation used while building. Possible values are: 'h', 'i', 'r', 'u', 'w';
* roof_type: (categorical) type of roof used while building. Possible values are: 'n', 'q', 'x';
* ground_floor_type: (categorical) type of the ground floor. Possible values are: 'f', 'm', 'v', 'x', 'z';
* other_floor_type: (categorical) type of construction used in the floors above the ground floor (except the roof). Possible values are: 'j', 'q', 's', 'x';
* position: (categorical) position of the building. Possible values are: 'j', 'o', 's', 't';
* plan_configuration: (categorical) building plan configuration. Possible values are: 'a', 'c', 'd', 'f', 'm', 'n', 'o', 'q', 's', 'u';
* has_superstructure_adobe_mud: (binary) flag variable that indicates if the superstructure was made of Adobe/Mud (adobe is a building material made from earth and organic material; it is one of the oldest materials used for building structures). Notice that Adobe buildings are particularly susceptible to earthquake damage if they are not reinforced (source: Wikipedia);
* has_superstructure_mud_mortar_stone: (binary) flag variable that indicates if the superstructure was made of Mud Mortar - Stone (it is a mixture of sand, water and clay);
* has_secondary_use_\<school, ... \>: (binary) flag variable that indicates if the building was used for <...> purposes. 
* count_families: (int) number of families that live in the building. 
* legal_ownership_status: (categorical) legal ownership status of the land where building was built. Possible values are: 'a', 'r', 'v', 'w';

The data exploration analysis has been performed in the data_exploration notebook.

#### Performance Metric

To measure the performance of our algorithms, we'll use the F1 score which balances the precision and recall of a classifier. The variant we will use will be the micro averaged F1 score.






# 02 Data loading

In [3]:
# Load the training set and separate the target from the predictors.
df = load_dataset()
y = df["damage_grade"]
X = df.drop(columns=["building_id", "damage_grade"])

# Categorical columns handed to the Preprocessor as `one_hot_cols`.
one_hot_features = [
    "legal_ownership_status",
    "land_surface_condition",
    "foundation_type",
    "roof_type",
    "ground_floor_type",
    "other_floor_type",
    "position",
    "plan_configuration",
]

# Fit the project Preprocessor on the training data, then replace X with its
# transformed version (this `pp` is reused on the test set by the baseline cell).
pp = Preprocessor(compress_cols=False, one_hot_cols=one_hot_features)
pp.fit(X, y)
X = pp.transform(X)

# 03 Baseline Models

We're going to create a baseline with standard machine learning models. The data gets only a minimal preprocessing that transforms the categorical features into numerical ones. No other processing is applied.

In [6]:
# TRAIN THE MODELS
# Baselines use default hyper-parameters. The stochastic estimators are seeded
# (42, the seed used throughout this notebook) so the baseline is reproducible.
rf_baseline_model = RandomForestClassifier(random_state=42)
svc_baseline_model = LinearSVC(random_state=42)
lr_baseline_model = LogisticRegression()

rf_baseline_model.fit(X, y)
svc_baseline_model.fit(X, y)
lr_baseline_model.fit(X, y)

In [4]:
# LOAD THE TEST SET
# The test frame has no target column; only the identifier is dropped before
# applying the already-fitted preprocessor `pp`.
df_test = load_dataset(train = False)
X_test = pp.transform(df_test.drop(columns=['building_id']))

# PREDICT
y_baseline_rf = rf_baseline_model.predict(X_test)
y_baseline_svc = svc_baseline_model.predict(X_test)
y_baseline_lr = lr_baseline_model.predict(X_test)

# SUBMISSION
# One two-column frame (building_id, damage_grade) per baseline model.
X_sub_rf = df_test[['building_id']].assign(damage_grade=y_baseline_rf)
X_sub_svc = df_test[['building_id']].assign(damage_grade=y_baseline_svc)
X_sub_lr = df_test[['building_id']].assign(damage_grade=y_baseline_lr)

X_sub_rf.to_csv("./prediction/RandomForest_baseline_sub.csv", index = False)
X_sub_svc.to_csv("./prediction/SupportVector_baseline_sub.csv", index = False)
X_sub_lr.to_csv("./prediction/LogisticRegression_baseline_sub.csv", index = False)

NameError: name 'rf_baseline_model' is not defined

# 04 Features Creation and Features Selection

We need to transform non-numerical features into numerical ones. The first step will be to create dummies of the categorical features and drop the building id. The building id should not take part in the classification since it's just an identification number.

We also aggregate the similar flag features 'has_superstructure_\*' and 'has_secondary_use_\*' by summing them, which gives the number of different superstructures and the number of secondary uses of each building.

In [4]:
# Row-wise count of the superstructure flags (label slice is inclusive on both ends).
superstructure_flags = X.loc[:, 'has_superstructure_adobe_mud':'has_superstructure_other']
n_superstructure = superstructure_flags.to_numpy().sum(axis=1)
X['number_of_different_superstructures'] = n_superstructure

# Row-wise count of the secondary-use flags.
secondary_use_flags = X.loc[:, 'has_secondary_use_agriculture':'has_secondary_use_other']
n_secondary_uses = secondary_use_flags.to_numpy().sum(axis=1)
X['number_of_secondary_uses'] = n_secondary_uses


# 05 Features Normalization and Target Encoding

Many of the features contain lots of outliers that can interfere with the scaling process, so the scaling is done with the RobustScaler.

On the geo level id features we perform a target encoding, since their cardinality is too high for one-hot encoding.

In [5]:
# SCALING
# Fit a RobustScaler on the numeric columns and overwrite them with the scaled values.
scaled_cols = ['age', 'area_percentage','count_floors_pre_eq' ,'height_percentage']
scaler = RobustScaler().fit(X[scaled_cols])
X[scaled_cols] = scaler.transform(X[scaled_cols])

# TARGET ENCODING
# Encode the geo level ids with a TargetEncoder fitted against damage_grade.
target_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
target_encoder = TargetEncoder(cols=target_cols, min_samples_leaf=20, smoothing=10)
target_encoder = target_encoder.fit(X, y)
X = target_encoder.transform(X)

# 06 Grid-search for the classic models

#### Logistic Regression Grid-Search

In [None]:
# Hyper-parameters shared by every candidate.
lr_common_params = {
    'tol' : [1e-4, 1e-6],
    'C' : [0.5, 1.0, 2.0, 5.0],
    'random_state' : [42],
    'max_iter' : [1000, 5000, 10000],
    'multi_class' : ['multinomial']
}

# The grid is split in two because not every penalty/solver pair is valid:
# 'elasticnet' is only supported by the 'saga' solver and requires `l1_ratio`.
# A single crossed grid would spend half of the fits on combinations that fail.
lr_params = [
    {**lr_common_params,
     'penalty' : ['l2'],
     'solver' : ['newton-cg', 'lbfgs', 'saga']},
    {**lr_common_params,
     'penalty' : ['elasticnet'],
     'solver' : ['saga'],
     'l1_ratio' : [0.5]},
]

gs_lr = GridSearchCV(estimator = LogisticRegression(),
                    param_grid=lr_params, 
                    n_jobs=-1, 
                    verbose=2,
                    scoring='f1_micro',
                    refit=True)

gs_lr.fit(X, y)

print(f"BEST MODEL: {gs_lr.best_estimator_.get_params()}")
print(f"BEST MEAN SCORE: {gs_lr.best_score_}")

# Persist the refitted best estimator; the context manager closes the file handle.
with open("./models/LogisticRegression.pkl", "wb") as model_file:
    pickle.dump(gs_lr.best_estimator_, model_file)

# Explicit LogisticRegression configuration kept for reference
# (presumably the outcome of a previous run of the search).
lr = LogisticRegression(C=1.0, 
                        max_iter=10000, 
                        multi_class='multinomial', 
                        penalty='l2',
                        random_state=42,
                        solver='saga',
                        tol=1e-4)

#### SVM Grid-Search

In [None]:
# Candidate SVC hyper-parameters. `probability` is left at its default, so the
# resulting model will not expose predict_proba.
svm_params = {
    'C' : [0.1, 1, 5],
    'kernel' : ['rbf', 'linear'],
    'tol' : [1e-4],
    'verbose' : [True],
    'random_state' : [42]
}

gs_svm = GridSearchCV(estimator = SVC(),
                      param_grid = svm_params, 
                      n_jobs=-1, 
                      verbose = 10, 
                      scoring='f1_micro',
                      refit=True)

gs_svm.fit(X, y)

print(f"BEST MODEL: {gs_svm.best_estimator_.get_params()}")
print(f"BEST MEAN SCORE: {gs_svm.best_score_}")

# Persist the refitted best estimator; the context manager closes the file handle.
with open("./models/SVM.pkl", "wb") as model_file:
    pickle.dump(gs_svm.best_estimator_, model_file)

SVM {C:1, kernel:rbf, tol:1e-4, random_state:42}

#### Random Forest Grid-Search

In [None]:
# Start again from the raw training data for the Random Forest experiments.
df = load_dataset()
y = df['damage_grade']
X = df.drop(columns=['damage_grade', 'building_id'])

#Required preprocessing: one hot encoding, target encoding
one_hot_cols = [
    "legal_ownership_status",
    "land_surface_condition",
    "foundation_type",
    "roof_type",
    "ground_floor_type",
    "other_floor_type",
    "position",
    "plan_configuration",
]
target_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

# Fit the Preprocessor on the training data and replace X with the transformed frame.
pp = Preprocessor(compress_cols=True, target_cols=target_cols, one_hot_cols=one_hot_cols)
pp.fit(X, y)
X = pp.transform(X)


In [None]:
# Candidate RandomForestClassifier hyper-parameters.
rf_params = {'n_estimators' : [100, 500, 1000], 
            'criterion' : ['gini', 'entropy'],
            'min_samples_leaf' : [1, 2],
            'min_samples_split' : [2, 5],
            'random_state' :  [42]
            }

gs_rf = GridSearchCV(estimator = RandomForestClassifier(), 
                    param_grid=rf_params, 
                    n_jobs=-1, 
                    verbose=2,
                    scoring='f1_micro',
                    refit=True)

gs_rf.fit(X, y)

print(f"BEST MODEL: {gs_rf.best_estimator_.get_params()}")
print(f"BEST MEAN SCORE: {gs_rf.best_score_}")

# Persist the refitted best estimator; the context manager closes the file handle.
with open("./models/RandomForestClassifier.pkl", "wb") as model_file:
    pickle.dump(gs_rf.best_estimator_, model_file)

# Hyper-parameters kept for reference (presumably the result of a previous search run).
rf_best_params={
    'criterion': 'entropy',
    'min_samples_leaf': 2,
    'min_samples_split': 5,
    'n_estimators': 1000
    }

# 08 New Algorithms

#### Light GBM

In [2]:
# Start again from the raw training data for the LightGBM experiments.
df = load_dataset()
y = df['damage_grade']
X = df.drop(columns=['building_id', 'damage_grade'])

#Required preprocessing: target encoding, identify categorical features
target_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
make_cat_cols = [
    "legal_ownership_status",
    "land_surface_condition",
    "foundation_type",
    "roof_type",
    "ground_floor_type",
    "other_floor_type",
    "position",
    "plan_configuration",
]

pp = Preprocessor(compress_cols=True, target_cols=target_cols, make_cat_cols=make_cat_cols)
pp.fit(X, y)
X = pp.transform(X)

# Positional indices of the categorical columns in the transformed frame;
# they are later passed to LightGBM as `categorical_feature`.
cat_index = []
for name in make_cat_cols:
    cat_index.append(X.columns.get_loc(name))

In [None]:
# Candidate LGBMClassifier hyper-parameters.
lgbm_params = {
        "boosting_type":[ 'gbdt'],
        "num_leaves" :[20, 30],
        "max_depth" :[ -1],
        "learning_rate" :[0.1, 0.05, 0.01],
        "n_estimators":[5000],
        "objective" : ["multiclass"],      
        "feature_fraction" :[0.5, 0.65, 0.8],
        "min_child_weight" :[0.1, 0.5],
        "max_bin":[4096, 8192],
        "verbosity" :[1],
        "num_threads":[6],
        "seed":[ 42]
        }

gs_lgbm = GridSearchCV(estimator = LGBMClassifier(), 
                    param_grid=lgbm_params, 
                    verbose=2,
                    scoring='f1_micro')

# `categorical_feature` is forwarded by GridSearchCV to every LGBMClassifier.fit call.
gs_lgbm.fit(X, y, categorical_feature=cat_index)

print(f"BEST MODEL: {gs_lgbm.best_estimator_.get_params()}")
print(f"WITH MEAN SCORE: {gs_lgbm.best_score_}")

# Persist the refitted best estimator; the context manager closes the file handle.
with open("./models/LightGBM.pkl", "wb") as model_file:
    pickle.dump(gs_lgbm.best_estimator_, model_file)

# Hyper-parameters kept for reference (presumably the result of a previous search run).
lgbm_best_params={
    "boosting_type": 'gbdt',
    "num_leaves" :30,
    "max_depth" : -1,
    "learning_rate" :0.05,
    "n_estimators":5000,
    "objective" : "multiclass",      
    "feature_fraction" :0.5,
    "min_child_weight" :0.1,
    "max_bin": 8192,
    "verbosity" :1,
    "num_threads":6,
    "seed":42
}

#### CatBoost

In [3]:
# Start again from the raw training data for the CatBoost experiments.
# X and y are built from `df_cat` (the frame loaded here), not from a `df`
# left over from an earlier cell.
df_cat = load_dataset()
X=df_cat.drop(columns=['building_id', 'damage_grade'])
y=df_cat['damage_grade']

#Required preprocessing: identify categorical features
# Each column is listed once ('legal_ownership_status' used to appear twice).
make_cat_cols=['geo_level_1_id','geo_level_2_id','geo_level_3_id','land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'legal_ownership_status','count_floors_pre_eq', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
       'has_secondary_use_health_post', 'has_secondary_use_gov_office',
       'has_secondary_use_use_police', 'has_secondary_use_other', 'number_of_different_superstructures', 'number_of_secondary_uses']

# NOTE(review): the two 'number_of_*' columns are not in the raw data; presumably
# the Preprocessor creates them when compress_cols=True — confirm in utils.tools.
pp = Preprocessor(compress_cols=True, make_cat_cols=make_cat_cols)
pp.fit(X,y)
X=pp.transform(X)

In [None]:
# Candidate CatBoostClassifier hyper-parameters.
catboost_params = {
    'iterations' : [1000, 2000, 5000, 7000],
    'learning_rate' : [0.02, 0.05, 0.07],
    'depth' : [9, 10, 12],
    'loss_function' : ['MultiClass'],
    'verbose' : [False],
    'eval_metric' : ['TotalF1'],
    'l2_leaf_reg' : [1, 2, 5, 9],
    'border_count' : [None, 11, 15],
    'task_type' : ['GPU']
}

# NOTE(review): `CatBoostClassifier` must be importable for this cell to run; the
# `from catboost import CatBoostClassifier` line in the import cell is commented out.
gs_cat = GridSearchCV(estimator=CatBoostClassifier(),
                    param_grid=catboost_params, 
                    verbose=2,
                    scoring='f1_micro')

# `verbose` and `cat_features` are forwarded to every CatBoostClassifier.fit call.
gs_cat.fit(X, y, verbose=10, cat_features=make_cat_cols)

print(f"BEST MODEL: {gs_cat.best_estimator_.get_params()}")
print(f"WITH MEAN SCORE: {gs_cat.best_score_}")

# Persist the refitted best estimator; the context manager closes the file handle.
with open("./models/CatBoostClassifier.pkl", "wb") as model_file:
    pickle.dump(gs_cat.best_estimator_, model_file)

Best: {iterations: 5000, learning_rate: 0.05, depth: 9, loss_function: Multiclass, verbose:false, eval_metric:TotalF1, l2_leaf_reg : 2, border_count:None}

# Confusion matrix

#### Logistic Regression

In [None]:
# Reload the raw training data and split predictors / target.
df_logit = load_dataset()
X = df_logit.drop(columns=["building_id", "damage_grade"])
y = df_logit["damage_grade"]

one_hot_cols = ["legal_ownership_status","land_surface_condition",
                    "foundation_type","roof_type","ground_floor_type","other_floor_type","position","plan_configuration"]
target_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
scaled_cols = ['age', 'area_percentage','count_floors_pre_eq' ,'height_percentage']

# NOTE(review): the preprocessor (which receives `target_cols`) is fitted on the
# whole dataset before the split, so the validation score may be optimistic —
# consider fitting it on the training part only.
pp = Preprocessor(compress_cols=True, one_hot_cols=one_hot_cols, target_cols=target_cols, scaled_cols=scaled_cols)
pp.fit(X, y)
X = pp.transform(X)

# Seeded stratified split so the reported score and confusion matrix are reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state=42)

# Pickle files must be opened in binary mode; the context manager closes the handle.
with open("./models/LogisticRegression.pkl", "rb") as model_file:
    log_model = pickle.load(model_file)
log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_val)

# scikit-learn metrics take (y_true, y_pred) in this order.
print(f"GOT F1-SCORE OF: {f1_score(y_val, y_pred, average = 'micro')}")

get_confusion_matrix(y_val, y_pred)

#### SVM

In [None]:
# Reload the raw training data and split predictors / target.
df_svm = load_dataset()
X = df_svm.drop(columns=["building_id", "damage_grade"])
y = df_svm["damage_grade"]

one_hot_cols = ["legal_ownership_status","land_surface_condition",
                    "foundation_type","roof_type","ground_floor_type","other_floor_type","position","plan_configuration"]
target_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
scaled_cols = ['age', 'area_percentage','count_floors_pre_eq' ,'height_percentage']

# NOTE(review): the preprocessor (which receives `target_cols`) is fitted on the
# whole dataset before the split, so the validation score may be optimistic —
# consider fitting it on the training part only.
pp = Preprocessor(compress_cols=True, one_hot_cols=one_hot_cols, target_cols=target_cols, scaled_cols=scaled_cols)
pp.fit(X, y)
X = pp.transform(X)

# Seeded stratified split so the reported score and confusion matrix are reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state=42)

# Pickle files must be opened in binary mode; the context manager closes the handle.
with open("./models/SVM.pkl", "rb") as model_file:
    svm_model = pickle.load(model_file)
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_val)

# scikit-learn metrics take (y_true, y_pred) in this order.
print(f"GOT F1-SCORE OF: {f1_score(y_val, y_pred, average = 'micro')}")

get_confusion_matrix(y_val, y_pred)

#### Random Forest

In [None]:
# Reload the raw training data and split predictors / target.
df_rf = load_dataset()
X = df_rf.drop(columns=["building_id", "damage_grade"])
y = df_rf["damage_grade"]

one_hot_cols = ["legal_ownership_status","land_surface_condition",
                    "foundation_type","roof_type","ground_floor_type","other_floor_type","position","plan_configuration"]
target_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

# NOTE(review): the preprocessor (which receives `target_cols`) is fitted on the
# whole dataset before the split, so the validation score may be optimistic —
# consider fitting it on the training part only.
pp = Preprocessor(compress_cols=True, one_hot_cols=one_hot_cols, target_cols=target_cols)
pp.fit(X, y)
X = pp.transform(X)

# Seeded stratified split so the reported score and confusion matrix are reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state=42)

# Pickle files must be opened in binary mode; the context manager closes the handle.
with open("./models/RandomForestClassifier.pkl", "rb") as model_file:
    rf_model = pickle.load(model_file)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_val)

# scikit-learn metrics take (y_true, y_pred) in this order.
print(f"GOT F1-SCORE OF: {f1_score(y_val, y_pred, average = 'micro')}")

get_confusion_matrix(y_val, y_pred)

#### CatBoost

In [None]:
# Reload the raw training data and split predictors / target.
df_cat = load_dataset()
X = df_cat.drop(columns=['building_id', 'damage_grade'])
y = df_cat['damage_grade']

# Columns treated as categorical; each column is listed once
# ('legal_ownership_status' used to appear twice).
make_cat_cols=['geo_level_1_id','geo_level_2_id','geo_level_3_id','land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'legal_ownership_status','count_floors_pre_eq', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
       'has_secondary_use_health_post', 'has_secondary_use_gov_office',
       'has_secondary_use_use_police', 'has_secondary_use_other', 'number_of_different_superstructures', 'number_of_secondary_uses']

pp = Preprocessor(compress_cols=True, make_cat_cols=make_cat_cols)
pp.fit(X,y)
X = pp.transform(X)

# Seeded stratified split so the reported score and confusion matrix are reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state=42)

# Pickle files must be opened in binary mode; the context manager closes the handle.
with open("./models/CatBoostClassifier.pkl", "rb") as model_file:
    cat_model = pickle.load(model_file)
cat_model.fit(X_train, y_train, cat_features=make_cat_cols)
y_pred = cat_model.predict(X_val)

# scikit-learn metrics take (y_true, y_pred) in this order.
print(f"GOT F1-SCORE OF: {f1_score(y_val, y_pred, average = 'micro')}")

get_confusion_matrix(y_val, y_pred)

#### LightGBM

In [None]:
# Reload the raw training data and split predictors / target.
df=load_dataset()
X=df.drop(columns=['building_id', 'damage_grade'])
y=df['damage_grade']

make_cat_cols = ["legal_ownership_status","land_surface_condition",
                    "foundation_type","roof_type","ground_floor_type","other_floor_type","position","plan_configuration"]
target_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

# NOTE(review): the preprocessor (which receives `target_cols`) is fitted on the
# whole dataset before the split, so the validation score may be optimistic —
# consider fitting it on the training part only.
pp = Preprocessor(compress_cols=True, make_cat_cols=make_cat_cols, target_cols=target_cols)
pp.fit(X,y)
X=pp.transform(X)

# Positional indices of the categorical columns, passed to LightGBM below.
cat_index = [X.columns.get_loc(name) for name in make_cat_cols]

# Seeded stratified split so the reported score and confusion matrix are reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state=42)

# Pickle files must be opened in binary mode; the context manager closes the handle.
with open("./models/LightGBM.pkl", "rb") as model_file:
    lgbm_model = pickle.load(model_file)
lgbm_model.fit(X_train, y_train, categorical_feature=cat_index)

y_pred = lgbm_model.predict(X_val)

# scikit-learn metrics take (y_true, y_pred) in this order.
print(f"GOT F1-SCORE OF: {f1_score(y_val, y_pred, average = 'micro')}")

get_confusion_matrix(y_val, y_pred)

# Submission

The preprocessing is different for each model; for this reason, we make a separate section for each model.

#### Logistic Regression

In [3]:
# GET TRAINING SET 
df = load_dataset()
X_train = df.drop(columns=["building_id", "damage_grade"])
y = df["damage_grade"]

# FIT THE PREPROCESSOR
one_hot_cols = ["legal_ownership_status","land_surface_condition",
                    "foundation_type","roof_type","ground_floor_type","other_floor_type","position","plan_configuration"]
scaled_cols = ['age', 'area_percentage','count_floors_pre_eq' ,'height_percentage']
target_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

pp = Preprocessor(compress_cols=True, one_hot_cols=one_hot_cols, scaled_cols=scaled_cols, target_cols=target_cols)
pp.fit(X_train, y)

# GET TEST SET AND TRANSFORM IT
df_test = load_dataset(train = False)
X_test = df_test.drop(columns=['building_id'])
X_test = pp.transform(X_test)

# LOAD THE MODEL AND PREDICT
# The context manager closes the pickle file once the model is loaded.
with open("./models/LogisticRegression.pkl", "rb") as model_file:
    lr_model = pickle.load(model_file)
y_pred_lr = lr_model.predict(X_test)
y_pred_proba_lr = lr_model.predict_proba(X_test)

# SUBMISSION
# The identifier column is 'building_id' (as dropped above), not 'building_ids'.
X_sub_lr = pd.DataFrame(df_test['building_id'])
X_sub_lr['damage_grade'] = y_pred_lr
X_sub_lr.to_csv("./prediction/LogisticRegression_sub.csv", index = False)



#### SVM

In [None]:
# GET TRAINING SET 
df = load_dataset()
X_train = df.drop(columns=["building_id", "damage_grade"])
y = df["damage_grade"]

# FIT THE PREPROCESSOR
one_hot_cols = ["legal_ownership_status","land_surface_condition",
                    "foundation_type","roof_type","ground_floor_type","other_floor_type","position","plan_configuration"]
scaled_cols = ['age', 'area_percentage','count_floors_pre_eq' ,'height_percentage']
target_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

pp = Preprocessor(compress_cols=True, one_hot_cols=one_hot_cols, scaled_cols=scaled_cols, target_cols=target_cols)
pp.fit(X_train, y)

# GET TEST SET AND TRANSFORM IT
df_test = load_dataset(train = False)
X_test = df_test.drop(columns=['building_id'])
X_test = pp.transform(X_test)

# LOAD THE MODEL AND PREDICT
# The context manager closes the pickle file once the model is loaded.
with open("./models/SVM.pkl", "rb") as model_file:
    svm_model = pickle.load(model_file)
y_pred_svm = svm_model.predict(X_test)
# SVC only provides predict_proba when it was trained with probability=True,
# which the grid search in this notebook does not set. Guard the call so the
# submission is still written; the probabilities are None in that case.
if getattr(svm_model, "probability", False):
    y_pred_proba_svm = svm_model.predict_proba(X_test)
else:
    y_pred_proba_svm = None

# SUBMISSION
# The identifier column is 'building_id' (as dropped above), not 'building_ids'.
X_sub_svm = pd.DataFrame(df_test['building_id'])
X_sub_svm['damage_grade'] = y_pred_svm
X_sub_svm.to_csv("./prediction/SVM_sub.csv", index = False)

#### Random Forest Classifier

In [None]:
# GET TRAINING SET 
df_rf = load_dataset()
X_train = df_rf.drop(columns=["building_id", "damage_grade"])
y = df_rf["damage_grade"]

# FIT THE PREPROCESSOR
one_hot_cols = ["legal_ownership_status","land_surface_condition",
                    "foundation_type","roof_type","ground_floor_type","other_floor_type","position","plan_configuration"]
target_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

pp = Preprocessor(compress_cols=True, one_hot_cols=one_hot_cols, target_cols=target_cols)
pp.fit(X_train, y)

# GET TEST SET AND TRANSFORM IT
df_test = load_dataset(train = False)
X_test = df_test.drop(columns=['building_id'])
X_test = pp.transform(X_test)

# LOAD THE MODEL AND PREDICT
# The context manager closes the pickle file once the model is loaded.
with open("./models/RandomForestClassifier.pkl", "rb") as model_file:
    rf_model = pickle.load(model_file)
y_pred_rf = rf_model.predict(X_test)
# Class probabilities are kept for the ensemble cell.
y_pred_proba_rf = rf_model.predict_proba(X_test)

# SUBMISSION
# The identifier column is 'building_id' (as dropped above), not 'building_ids'.
X_sub_rf = pd.DataFrame(df_test['building_id'])
X_sub_rf['damage_grade'] = y_pred_rf
X_sub_rf.to_csv("./prediction/RandomForestClassifier_sub.csv", index = False)

#### LightGBM

In [None]:
# GET TRAINING SET 
df = load_dataset()
X_train = df.drop(columns=["building_id", "damage_grade"])
y = df["damage_grade"]

# FIT THE PREPROCESSOR
make_cat_cols = ["legal_ownership_status","land_surface_condition",
                    "foundation_type","roof_type","ground_floor_type","other_floor_type","position","plan_configuration"]

target_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

pp = Preprocessor(compress_cols=True, target_cols=target_cols, make_cat_cols=make_cat_cols)
pp.fit(X_train, y)

# GET TEST SET AND TRANSFORM IT
df_test = load_dataset(train = False)
X_test = df_test.drop(columns=['building_id'])
X_test = pp.transform(X_test)

# LOAD THE MODEL AND PREDICT
# The context manager closes the pickle file once the model is loaded.
with open("./models/LightGBM.pkl", "rb") as model_file:
    lgbm_model = pickle.load(model_file)
y_pred_lgbm = lgbm_model.predict(X_test)
# Class probabilities are kept for the ensemble cell.
y_pred_proba_lgbm = lgbm_model.predict_proba(X_test)

# SUBMISSION
# The identifier column is 'building_id' (as dropped above), not 'building_ids'.
X_sub_lgbm = pd.DataFrame(df_test['building_id'])
X_sub_lgbm['damage_grade'] = y_pred_lgbm
X_sub_lgbm.to_csv("./prediction/LightGBM_sub.csv", index = False)


#### CatBoost

In [None]:
# GET TRAINING SET 
df = load_dataset()
X_train = df.drop(columns=["building_id", "damage_grade"])
y = df["damage_grade"]

# FIT THE PREPROCESSOR
# Columns treated as categorical; each column is listed once
# ('legal_ownership_status' used to appear twice).
make_cat_cols = ['geo_level_1_id','geo_level_2_id','geo_level_3_id','land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'legal_ownership_status','count_floors_pre_eq', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
       'has_secondary_use_health_post', 'has_secondary_use_gov_office',
       'has_secondary_use_use_police', 'has_secondary_use_other', 'number_of_different_superstructures', 'number_of_secondary_uses']

pp = Preprocessor(compress_cols=True, make_cat_cols=make_cat_cols)
pp.fit(X_train, y)

# GET TEST SET AND TRANSFORM IT
df_test = load_dataset(train = False)
X_test = df_test.drop(columns=['building_id'])
X_test = pp.transform(X_test)

# LOAD THE MODEL AND PREDICT
# The grid-search cell saves the model as CatBoostClassifier.pkl, so that is the
# file loaded here. The context manager closes the file once the model is loaded.
with open("./models/CatBoostClassifier.pkl", "rb") as model_file:
    cat_model = pickle.load(model_file)
# Flattened defensively: presumably CatBoost returns multiclass labels as a column
# vector (TODO confirm); np.ravel is a no-op on an already 1-D result.
y_pred_cat = np.ravel(cat_model.predict(X_test))
# Class probabilities are kept for the ensemble cell.
y_pred_proba_cat = cat_model.predict_proba(X_test)

# SUBMISSION
# The identifier column is 'building_id' (as dropped above), not 'building_ids'.
X_sub_cat = pd.DataFrame(df_test['building_id'])
X_sub_cat['damage_grade'] = y_pred_cat
X_sub_cat.to_csv("./prediction/CatBoost_sub.csv", index = False)


#### Ensembles

In [None]:
# Stack the class-probability outputs of the three models (Random Forest,
# LightGBM, CatBoost — in this order) along a new leading axis.
y_pred_proba_ensembles = np.array([
    y_pred_proba_rf,
    y_pred_proba_lgbm,
    y_pred_proba_cat,
])

# Combine the stacked probabilities with the project helper; one weight is given
# per model, in the same order as above.
y_pred_ensembles = majority_vote(y_pred_proba_ensembles, [0.8, 1., 1.2])

In [None]:
# Reload the test set to get the building identifiers, attach the ensemble
# predictions and write the submission file.
df_test = load_dataset(train=False)
X_sub_ens = df_test[['building_id']].assign(damage_grade=y_pred_ensembles)

X_sub_ens.to_csv(f"./prediction/Ensembles_sub.csv", index=False)