In [25]:
import pandas as pd
import os
import numpy as np
import sys
sys.path.append('..')

In [26]:
data = pd.read_csv('../data/interim/all_train_data.csv')

In [27]:
data.columns

Index(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
     

In [28]:
# Trim the upper tail of 'area_percentage': rows are kept only when their value
# is strictly below the 97th percentile. Missing values are mean-filled solely
# for the percentile computation; the filter itself compares the raw column.
pct = np.percentile(data['area_percentage'].fillna(data['area_percentage'].mean()), 97)
print(pct, data.shape, sep='\n')
data = data[data['area_percentage'].lt(pct)]
print(data.shape)

18.0
(260601, 39)
(252139, 39)


In [29]:
# Trim the upper tail of 'height_percentage': rows are kept only when their
# value is strictly below the 97th percentile. Missing values are mean-filled
# solely for the percentile computation; the filter compares the raw column.
pct = np.percentile(data['height_percentage'].fillna(data['height_percentage'].mean()), 97)
print(pct, data.shape, sep='\n')
data = data[data['height_percentage'].lt(pct)]
print(data.shape)

9.0
(252139, 39)
(240799, 39)


In [30]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class GeoInteractionTransformer(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that appends string "interaction" columns built by
    joining the three geo-level id columns with an underscore.

    Added columns: ``geo1_geo2``, ``geo1_geo3``, ``geo2_geo3`` and ``geo_all``.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the four concatenated geo columns appended.

        ``X`` must expose ``geo_level_1_id``, ``geo_level_2_id`` and
        ``geo_level_3_id``; the input object itself is not modified.
        """
        X_new = X.copy()

        # Cast each id column to text once and reuse it for every combination.
        geo1 = X_new['geo_level_1_id'].astype(str)
        geo2 = X_new['geo_level_2_id'].astype(str)
        geo3 = X_new['geo_level_3_id'].astype(str)

        X_new['geo1_geo2'] = geo1 + '_' + geo2
        X_new['geo1_geo3'] = geo1 + '_' + geo3
        X_new['geo2_geo3'] = geo2 + '_' + geo3
        X_new['geo_all'] = geo1 + '_' + geo2 + '_' + geo3

        # Original columns are kept alongside the new ones.
        return X_new



In [31]:
numerical_df = data.select_dtypes(exclude=['object'])
numerical_df.describe()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
count,240799.0,240799.0,240799.0,240799.0,240799.0,240799.0,240799.0,240799.0,240799.0,240799.0,...,240799.0,240799.0,240799.0,240799.0,240799.0,240799.0,240799.0,240799.0,240799.0,240799.0
mean,13.853758,702.139207,6276.92436,2.053489,25.865037,7.418868,5.160956,0.081072,0.797292,0.035121,...,0.024477,0.005075,0.000544,0.000174,0.000939,0.000133,8.7e-05,8.3e-05,0.004842,2.254665
std,7.938389,409.412468,3646.519437,0.624599,72.641948,2.973722,1.460206,0.272946,0.402018,0.184085,...,0.154525,0.071057,0.023318,0.013206,0.030621,0.011527,0.009338,0.009113,0.069417,0.603874
min,0.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,7.0,355.0,3098.5,2.0,10.0,5.0,4.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
50%,12.0,706.0,6289.0,2.0,15.0,7.0,5.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,21.0,1050.0,9440.0,2.0,30.0,9.0,6.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
max,30.0,1427.0,12567.0,9.0,995.0,17.0,8.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0


In [32]:
from scipy.stats import skew

def get_right_skewed_columns(df, skew_threshold=0.5):
    """
    Return the names of right-skewed numeric columns, ignoring binary/constant ones.

    Non-numeric columns (e.g. object/string columns) are skipped instead of
    being passed to ``skew``, so the function can be called on a mixed
    DataFrame as well as on a purely numerical one.

    Parameters:
    - df: The input DataFrame. Only columns with a numeric dtype are examined.
    - skew_threshold: A column is reported when its skewness is strictly
      greater than this value (default is 0.5).

    Returns:
    - List of column names (in their original order) whose skewness exceeds
      ``skew_threshold``. Empty list when no column qualifies.
    """
    # Skewness is undefined for text columns; restrict to numeric dtypes up front.
    numeric_df = df.select_dtypes(include='number')

    right_skewed_columns = []
    for col in numeric_df.columns:
        # NaNs are dropped so they neither count as a value nor poison the statistic.
        values = numeric_df[col].dropna()

        # Columns with at most two distinct values (binary flags, constants)
        # are not meaningful candidates for a skew correction.
        if values.nunique() <= 2:
            continue

        if skew(values) > skew_threshold:
            right_skewed_columns.append(col)

    return right_skewed_columns


In [33]:
# `numerical_df` is reused from the earlier cell that built it with:
# numerical_df = data.select_dtypes(exclude=['object'])

# NOTE(review): numerical_df still contains the geo-level ids and the target `damage_grade`.
right_skewed_cols = get_right_skewed_columns(numerical_df)

print("Right-skewed columns:", right_skewed_cols)


Right-skewed columns: ['age', 'area_percentage', 'count_families']


In [34]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# Custom transformer for the age-based transformation
class AgeTransformer(BaseEstimator, TransformerMixin):
    """Flag very old buildings and overwrite their age with a fixed value.

    ``fit`` learns a threshold: the ``threshold_percentile``-th percentile of
    the age column (missing ages are mean-filled for this computation only).
    ``transform`` adds a 0/1 column ``'old'`` that is 1 where the age is greater
    than or equal to that threshold, and sets the age of those rows to
    ``cap_value``.

    Parameters
    ----------
    age_column : str, default='age'
        Name of the column holding the building age.
    threshold_percentile : float, default=99
        Percentile (0-100) of the age column used as the "old" threshold.
    cap_value : numeric, default=100
        Value written into the age column for every row flagged as old.

    Attributes
    ----------
    percentile_ : float
        Threshold learned by ``fit``. Only exists after fitting.
    """

    def __init__(self, age_column='age', threshold_percentile=99, cap_value=100):
        # Only hyper-parameters are stored here. The learned threshold is
        # created in fit(), so "is this estimator fitted?" checks stay truthful.
        self.age_column = age_column
        self.threshold_percentile = threshold_percentile
        self.cap_value = cap_value

    def fit(self, X, y=None):
        """Learn the age threshold from ``X[age_column]``; returns ``self``."""
        ages = X[self.age_column]
        # Mean-fill missing ages so the percentile is computed on a full column.
        self.percentile_ = np.percentile(ages.fillna(np.mean(ages)), self.threshold_percentile)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``'old'`` flag added and old ages overwritten.

        Raises ``sklearn.exceptions.NotFittedError`` when called before ``fit``.
        """
        # Local import keeps this class self-contained within its cell.
        from sklearn.utils.validation import check_is_fitted
        check_is_fitted(self, 'percentile_')

        X_copy = X.copy()

        # 1 where the age reaches the learned threshold, 0 otherwise (NaN ages -> 0).
        X_copy['old'] = np.where(X_copy[self.age_column] >= self.percentile_, 1, 0)

        # Every flagged row gets the same age value, whatever its original age was.
        X_copy.loc[X_copy['old'] == 1, self.age_column] = self.cap_value

        return X_copy


In [35]:
x = data.drop(columns=['damage_grade'])
y = data.damage_grade

In [36]:
y = y.replace({1: 0, 2: 1, 3: 2})

In [37]:
y

0         2
1         1
2         2
3         1
5         1
         ..
260596    1
260597    2
260598    2
260599    1
260600    2
Name: damage_grade, Length: 240799, dtype: int64

In [38]:
y.value_counts()

damage_grade
1    137372
2     82375
0     21052
Name: count, dtype: int64

In [39]:
#x.old.value_counts()

In [40]:
#numerical_df = data.select_dtypes(exclude=['object'])
#categorical_df = data.select_dtypes(include=['object'])

In [41]:
# Categorical features: every column stored with the object dtype.
# No cardinality filter is applied.
categorical_cols = x.select_dtypes(include=['object']).columns.tolist()

# Numerical features: every numeric dtype, so narrower integer/float dtypes are
# picked up too, not just int32/int64/float64.
numerical_cols = x.select_dtypes(include=['number']).columns.tolist()

In [42]:
categorical_cols

['land_surface_condition',
 'foundation_type',
 'roof_type',
 'ground_floor_type',
 'other_floor_type',
 'position',
 'plan_configuration',
 'legal_ownership_status']

In [43]:
#plt.hist(numerical_df.age)

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler, MinMaxScaler

#train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, shuffle=True)

In [45]:
y_train.value_counts()

damage_grade
1    96394
2    57400
0    14765
Name: count, dtype: int64

In [46]:
y_test.value_counts()

damage_grade
1    40978
2    24975
0     6287
Name: count, dtype: int64

In [49]:
from category_encoders import BaseNEncoder

ModuleNotFoundError: No module named 'category_encoders'

In [50]:
%pip install category_encoders

Collecting category_encoders
  Using cached category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Using cached category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.4
Note: you may need to restart the kernel to use updated packages.


In [69]:
# Numeric branch: replace missing values with the column mean.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

# Disabled alternative for the categorical columns (most-frequent impute + one-hot):
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore')),
# ])

# Columns handed to the BaseN encoder: the string-typed features plus the
# three geo-level id columns.
base_encoder_columns = [
    'land_surface_condition',
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]

# Base-5 encoding of the columns listed above.
base_encoder = Pipeline([
    ('base_encoder', BaseNEncoder(cols=base_encoder_columns, base=5)),
])

# Adds the 'old' flag and overwrites the age of flagged rows.
age_transformer = Pipeline([
    ('age_transform', AgeTransformer(age_column='age')),
])

In [71]:
# Combine the transformations using ColumnTransformer
#preprocessor = ColumnTransformer(transformers=[
#    ('base_name', base_encoder, base_encoder_columns)])  # TargetEncoder for 'town'
#    ('num', numerical_transformer, numerical_cols)])

In [73]:
# Column-wise preprocessing: each tuple is (name, transformer, columns it receives).
# NOTE(review): `interaction_transformer` and `log_transformer` are not defined in any
# cell shown in this notebook; on a fresh kernel this cell would raise NameError
# unless they are defined elsewhere. TODO: restore their definitions above this cell.
# NOTE(review): the geo-level ids and 'age' are routed to more than one branch
# (they are in base_encoder_columns / ['age'] and also in numerical_cols), so they
# appear several times in the output. Confirm this is intended.
preprocessor = ColumnTransformer(transformers=[
    ('base_name', base_encoder, base_encoder_columns),  # BaseNEncoder (base 5) for the string features and geo ids
    ('age_transform', age_transformer, ['age']),  # AgeTransformer: adds 'old' flag, overwrites flagged ages
    ('num', 'passthrough', numerical_cols),  # Numerical columns forwarded unchanged
    ('interaction', interaction_transformer, ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 'height_percentage', 'area_percentage']),  # Interaction terms (transformer not defined in the visible cells)
    ('log_transform', log_transformer, right_skewed_cols)  # Applied to the right-skewed columns (transformer not defined in the visible cells)
])

In [75]:
# from lightgbm import LGBMClassifier

# # LightGBM for Multiclass Classification
# lgbm = LGBMClassifier(
#     n_estimators=800,
#     learning_rate=0.2669112505018992,
#     max_depth=5,
#     random_state=42,
#     reg_lambda=1.2259716591605452,  # L2 regularization
#     subsample=0.704976942819638,    # Subsample ratio of the training instances
#     colsample_bytree=0.9,           # Subsample ratio of columns when constructing each tree
#     min_child_weight=4,             # Equivalent of min_data_in_leaf in LightGBM
#     reg_alpha=0.14170716330946964,  # L1 regularization term
#     objective='multiclass',         # Objective for multiclass classification
#     metric='multi_logloss',         # Metric used for multiclass classification
#     num_class=3                     # Specify the number of classes in the target
# )



# rf_pipe = Pipeline(steps=[
#     ('preprocessor', preprocessor),  # Step 1: Preprocessing
#     ('xgboost', lgbm)  # Step 3: Model training
# ])

# # Preprocessing of training data, fit model 
# #rf_pipe.fit(X_train, y_train)

# # Preprocessing of training data, fit model after upsampling!
# rf_pipe.fit(x, y)

# # Preprocessing of validation data, get predictions
# #rf_preds = rf_pipe.predict(X_test)

# # Evaluate the model
# #accuracy = accuracy_score(y_test, rf_preds)
# #print('Accuracy for XGBoost:', accuracy)

# # Detailed classification report
# #print('Classification Report:\n', classification_report(y_test, rf_preds))

In [77]:
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
# XGBoost multiclass classifier; the target was remapped to the labels 0/1/2 earlier.
xgb = XGBClassifier(
    n_estimators=600,
    learning_rate=0.2669112505018992,
    max_depth=5,
    random_state=42,
    reg_lambda=1.2259716591605452,
    subsample=0.704976942819638,
    colsample_bytree=0.9,
    min_child_weight=4,
    alpha= 0.14170716330946964,    # L1 regularization term
    eval_metric='mlogloss',  # Multiclass log-loss; the classes are ordinal, a custom loss could be considered
    objective='multi:softmax',  # Plain softmax; class order is not exploited
    num_class=3  # Three damage grades (labels 0, 1, 2)
)

# NOTE(review): the pipeline is named `rf_pipe` although its estimator is XGBoost.
rf_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Step 1: Preprocessing
    ('xgboost', xgb)  # Step 2: Model training
])

# Fit preprocessing + model on the training split only.
# No upsampling is applied in the cells shown here.
rf_pipe.fit(X_train, y_train)

# Predict on the held-out test split (preprocessing is applied by the pipeline)
rf_preds = rf_pipe.predict(X_test)

# Overall accuracy on the test split
accuracy = accuracy_score(y_test, rf_preds)
print('Accuracy for XGBoost:', accuracy)

# Per-class precision / recall / F1 on the test split
print('Classification Report:\n', classification_report(y_test, rf_preds))

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


Accuracy for XGBoost: 0.7431112308832446
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.51      0.58      6775
           1       0.74      0.84      0.79     42102
           2       0.76      0.64      0.69     25338

    accuracy                           0.74     74215
   macro avg       0.72      0.66      0.69     74215
weighted avg       0.74      0.74      0.74     74215



In [78]:
# Predict on the TRAINING split, to compare against the test-split scores.
# Note: this rebinds `rf_preds` and `accuracy`, replacing the test-split values.
rf_preds = rf_pipe.predict(X_train)

# Overall accuracy on the training split
accuracy = accuracy_score(y_train, rf_preds)
print('Accuracy for XGBoost:', accuracy)

# Per-class precision / recall / F1 on the training split
print('Classification Report:\n', classification_report(y_train, rf_preds))

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


Accuracy for XGBoost: 0.799130319287162
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.64      0.71     15889
           1       0.79      0.89      0.84     98747
           2       0.81      0.70      0.75     58531

    accuracy                           0.80    173167
   macro avg       0.80      0.74      0.77    173167
weighted avg       0.80      0.80      0.80    173167



In [483]:
test_data = pd.read_csv('../data/raw/test_values.csv')

In [233]:
X_test_final = test_data.building_id

In [235]:
X_test_final

0         300051
1          99355
2         890251
3         745817
4         421793
          ...   
86863     310028
86864     663567
86865    1049160
86866     442785
86867     501372
Name: building_id, Length: 86868, dtype: int64

In [237]:
rf_preds = rf_pipe.predict(test_data)
rf_preds = pd.Series(rf_preds)
rf_preds 

0        2
1        1
2        1
3        0
4        2
        ..
86863    1
86864    1
86865    1
86866    1
86867    1
Length: 86868, dtype: int64

In [239]:
# Put the building ids next to the predictions. The unnamed prediction Series
# arrives as column 0 and is given its final name here.
df_concatenated = (
    pd.concat([X_test_final, rf_preds], axis=1)
    .rename(columns={0: 'damage_grade'})
)

# Map the model's 0/1/2 labels back to damage grades 1/2/3.
df_concatenated['damage_grade'] = df_concatenated['damage_grade'].replace({0: 1, 1: 2, 2: 3})

# Write the submission file without the DataFrame index.
df_concatenated.to_csv('../data/processed/submission2.csv', index=False)