In [149]:
import pandas as pd
import os
import numpy as np
import sys
sys.path.append('..')

In [156]:
# Load the interim dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../data/interim/eq_2.csv')

In [158]:
# Show the column labels of the loaded frame (features plus the `damage_grade` target).
data.columns

Index(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'legal_ownership_status',
       'count_families', 'damage_grade'],
      dtype='object')

In [320]:
# Separate the target column from the predictor columns.
y = data['damage_grade']
x = data.drop('damage_grade', axis=1)

In [322]:
# Re-index the target labels from 1..3 to 0..2.
# The result is derived from `data` rather than from `y` itself so that
# re-running this cell is idempotent: applying the mapping to an already
# shifted `y` would turn {0, 1, 2} into {0, 0, 1} and silently merge two classes.
y = data['damage_grade'].replace({1: 0, 2: 1, 3: 2})

In [324]:
# Display the relabelled target series.
y

0         2
1         1
2         2
3         1
4         2
         ..
260596    1
260597    2
260598    2
260599    1
260600    2
Name: damage_grade, Length: 260601, dtype: int64

In [326]:
# Class frequencies of the target over the full dataset.
y.value_counts()

damage_grade
1    148259
2     87218
0     25124
Name: count, dtype: int64

In [328]:
# Print dtypes and non-null counts for every feature column.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 22 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   geo_level_1_id                          260601 non-null  int64 
 1   geo_level_2_id                          260601 non-null  int64 
 2   geo_level_3_id                          260601 non-null  int64 
 3   count_floors_pre_eq                     260601 non-null  int64 
 4   age                                     260601 non-null  int64 
 5   area_percentage                         260601 non-null  int64 
 6   height_percentage                       260601 non-null  int64 
 7   land_surface_condition                  260601 non-null  object
 8   foundation_type                         260601 non-null  object
 9   roof_type                               260601 non-null  object
 10  ground_floor_type                       260601 non-null 

In [330]:
# Split the full frame by dtype: object columns on one side, everything else on the other.
# NOTE(review): the selection is made on `data`, not `x`, so the `damage_grade`
# target ends up in `numerical_df` — confirm that is intended.
categorical_df = data.select_dtypes(include='object')
numerical_df = data.select_dtypes(exclude='object')

In [332]:
# Categorical features: every column stored with object dtype
# (no cardinality filter is applied).
categorical_cols = [name for name, dtype in x.dtypes.items() if dtype == "object"]

# Numerical features: columns stored as int64 or float64.
numerical_cols = [name for name, dtype in x.dtypes.items() if dtype in ('int64', 'float64')]

In [334]:
# Display the names of the object-dtype feature columns.
categorical_cols

['land_surface_condition',
 'foundation_type',
 'roof_type',
 'ground_floor_type',
 'other_floor_type',
 'position',
 'plan_configuration',
 'legal_ownership_status']

In [336]:
# Display the names of the int64/float64 feature columns.
numerical_cols 

['geo_level_1_id',
 'geo_level_2_id',
 'geo_level_3_id',
 'count_floors_pre_eq',
 'age',
 'area_percentage',
 'height_percentage',
 'has_superstructure_adobe_mud',
 'has_superstructure_mud_mortar_stone',
 'has_superstructure_cement_mortar_brick',
 'has_superstructure_timber',
 'has_superstructure_rc_non_engineered',
 'has_superstructure_rc_engineered',
 'count_families']

In [338]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler, MinMaxScaler

# Hold out 30% of the rows as a test set; rows are shuffled with a fixed seed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [340]:
# Class frequencies in the training split.
y_train.value_counts()

damage_grade
1    103845
2     61052
0     17523
Name: count, dtype: int64

In [341]:
# Class frequencies in the held-out test split.
y_test.value_counts()

damage_grade
1    44414
2    26166
0     7601
Name: count, dtype: int64

In [344]:
from category_encoders import BaseNEncoder

In [346]:
# Project-local helper; presumably resolvable because '..' was appended to
# sys.path at the top of the notebook — verify when moving this notebook.
from src.eda_first import summarize_dataframe
# Judging by the rendered output, this yields one row per column with its
# dtype, unique-value count, missing-value count and a few sample values.
summarize_dataframe(X_train)

Unnamed: 0,Column,Data Type,Unique Values,Missing Values,Sample Unique Values
0,geo_level_1_id,int64,31,0,"[3, 8, 0, 17, 26]"
1,geo_level_2_id,int64,1402,0,"[574, 1343, 396, 858, 421]"
2,geo_level_3_id,int64,11221,0,"[1256, 5698, 8099, 12252, 1265]"
3,count_floors_pre_eq,int64,7,0,"[2, 3, 1, 4, 5]"
4,age,int64,42,0,"[30, 25, 10, 0, 80]"
5,area_percentage,int64,81,0,"[7, 5, 6, 4, 10]"
6,height_percentage,int64,26,0,"[7, 6, 5, 3, 2]"
7,land_surface_condition,object,3,0,"[t, n, o]"
8,foundation_type,object,5,0,"[r, u, i, w, h]"
9,roof_type,object,3,0,"[n, q, x]"


In [347]:
# Numerical branch: fill missing values with the column mean.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical feature columns that are handed to the BaseNEncoder.
base_encoder_columns = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]

# Categorical branch: base-3 encoding of the columns listed above.
# (An earlier variant, since disabled, used a most-frequent imputer followed
# by a one-hot encoder with handle_unknown='ignore'.)
base_encoder = Pipeline([
    ('base_encoder', BaseNEncoder(cols=base_encoder_columns, base=3)),
])

In [350]:
# Combine the transformations using ColumnTransformer
#preprocessor = ColumnTransformer(transformers=[
#    ('base_name', base_encoder, base_encoder_columns)])  # TargetEncoder for 'town'
#    ('num', numerical_transformer, numerical_cols)])

In [352]:
# Route each group of columns through its own transformation:
#   'base_name' — the categorical columns go through the BaseNEncoder pipeline
#   'num'       — the numerical columns are forwarded unchanged
preprocessor = ColumnTransformer([
    ('base_name', base_encoder, base_encoder_columns),
    ('num', 'passthrough', numerical_cols),
])

In [354]:
# Display the full frame (pandas truncates the rendering for large frames).
data

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,legal_ownership_status,count_families,damage_grade
0,6,487,12198,2,30,6,5,t,r,n,...,d,1,1,0,0,0,0,v,1,3
1,8,900,2812,2,10,8,7,o,r,n,...,d,0,1,0,0,0,0,v,1,2
2,21,363,8973,2,10,5,5,t,r,n,...,d,0,1,0,0,0,0,v,1,3
3,22,418,10694,2,10,6,5,t,r,n,...,d,0,1,0,1,0,0,v,1,2
4,11,131,1488,3,30,8,9,t,r,n,...,d,1,0,0,0,0,0,v,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,25,1335,1621,1,55,6,3,n,r,n,...,q,0,1,0,0,0,0,v,1,2
260597,17,715,2060,2,0,6,5,t,r,n,...,d,0,1,0,0,0,0,v,1,3
260598,17,51,8163,3,55,6,7,t,r,q,...,d,0,1,0,0,0,0,v,1,3
260599,26,39,1851,2,10,14,6,t,r,x,...,d,0,0,1,0,0,0,v,1,2


Accuracy for Random Forest Model: 0.5764316138192144
Classification Report:
               precision    recall  f1-score   support

           1       0.61      0.22      0.32      7601
           2       0.58      0.98      0.72     44414
           3       0.00      0.00      0.00     26166

    accuracy                           0.58     78181
   macro avg       0.40      0.40      0.35     78181
weighted avg       0.39      0.58      0.44     78181



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [356]:
# The metric helpers are imported here because no other visible cell imports
# them; without this the evaluation lines raise NameError on a fresh kernel.
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# XGBoost classifier with fixed hyperparameters.
xgb = XGBClassifier(
    n_estimators=339,
    learning_rate=0.2669112505018992,
    max_depth=5,
    min_child_weight=4,
    subsample=0.704976942819638,
    colsample_bytree=0.9,
    reg_alpha=0.14170716330946964,   # L1 regularization (scikit-learn API name for xgboost's `alpha`)
    reg_lambda=1.2259716591605452,   # L2 regularization
    random_state=42,
    # NOTE(review): eval_metric should only matter when an eval_set is passed
    # to fit(); none is passed below — confirm this setting is still wanted.
    eval_metric='aucpr',
)

# End-to-end pipeline. The `rf_` prefix is kept only so that other cells
# referring to `rf_pipe` / `rf_preds` keep working; the estimator is XGBoost.
rf_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Step 1: encode categoricals, pass numericals through
    ('xgboost', xgb),                # Step 2: model training
])

# Fit preprocessing and model on the training split as-is (no resampling is applied).
rf_pipe.fit(X_train, y_train)

# Apply the fitted preprocessing to the held-out split and predict.
rf_preds = rf_pipe.predict(X_test)

# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, rf_preds)
print('Accuracy for XGBoost:', accuracy)

# Per-class precision / recall / F1.
print('Classification Report:\n', classification_report(y_test, rf_preds))

Accuracy for XGBoost: 0.7328251109604635
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.48      0.57      7601
           1       0.73      0.85      0.78     44414
           2       0.75      0.61      0.67     26166

    accuracy                           0.73     78181
   macro avg       0.73      0.64      0.67     78181
weighted avg       0.73      0.73      0.73     78181



In [224]:
# Display the held-out feature frame (original row indices are preserved by the split).
X_test

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,legal_ownership_status,count_families
111801,10,1224,11128,1,20,7,3,t,r,n,...,s,d,0,1,0,0,0,0,v,1
91817,22,1101,12483,2,60,7,6,t,r,n,...,s,d,0,1,0,0,0,0,v,1
251661,26,39,11440,2,50,6,5,t,r,n,...,s,d,0,0,0,0,0,0,v,1
205479,8,1073,2743,3,50,7,6,o,r,q,...,s,d,0,1,0,1,0,0,v,1
22618,6,706,7959,2,15,8,4,t,r,n,...,s,d,0,1,0,0,0,0,v,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17267,7,322,2843,2,10,6,6,t,r,n,...,s,d,1,1,0,1,0,0,v,1
66715,0,540,2361,2,20,7,5,t,r,n,...,s,d,0,1,0,1,0,0,v,1
243127,11,155,12428,2,45,8,8,t,r,n,...,s,d,0,1,0,1,0,0,v,1
2108,26,1401,8525,3,40,6,9,t,r,n,...,t,d,0,1,0,0,0,0,v,1
