In [2]:
import pandas as pd
import os
import numpy as np
import sys
sys.path.append('..')

In [3]:
data = pd.read_csv('../data/interim/eq_dropped_columns_again.csv')

In [6]:
data.columns

Index(['Unnamed: 0', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'legal_ownership_status',
       'count_families', 'damage_grade'],
      dtype='object')

In [8]:
x = data.drop(columns=['damage_grade'])
y = data.damage_grade

In [10]:
numerical_df = data.select_dtypes(exclude=['object'])
categorical_df = data.select_dtypes(include=['object'])

In [12]:
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in x.columns if x[cname].nunique() < 800 and x[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in x.columns if x[cname].dtype in ['int64', 'float64']]

In [14]:
categorical_cols

['land_surface_condition',
 'foundation_type',
 'roof_type',
 'ground_floor_type',
 'other_floor_type',
 'position',
 'plan_configuration',
 'legal_ownership_status']

In [16]:
numerical_cols

['Unnamed: 0',
 'geo_level_1_id',
 'geo_level_2_id',
 'geo_level_3_id',
 'count_floors_pre_eq',
 'age',
 'area_percentage',
 'height_percentage',
 'has_superstructure_adobe_mud',
 'has_superstructure_mud_mortar_stone',
 'has_superstructure_cement_mortar_brick',
 'has_superstructure_timber',
 'has_superstructure_rc_non_engineered',
 'has_superstructure_rc_engineered',
 'count_families']

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler, MinMaxScaler

#train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [52]:
from category_encoders import BaseNEncoder

In [32]:
from src.eda_first import summarize_dataframe
summarize_dataframe(X_train)

Unnamed: 0,Column,Data Type,Unique Values,Missing Values,Sample Unique Values
0,count_floors_pre_eq,int64,7,0,"[2, 3, 1, 4, 5]"
1,age,int64,42,0,"[30, 25, 10, 0, 80]"
2,area_percentage,int64,81,0,"[7, 5, 6, 4, 10]"
3,height_percentage,int64,26,0,"[7, 6, 5, 3, 2]"
4,land_surface_condition,object,3,0,"[t, n, o]"
5,foundation_type,object,5,0,"[r, u, i, w, h]"
6,roof_type,object,3,0,"[n, q, x]"
7,ground_floor_type,object,5,0,"[f, x, v, z, m]"
8,other_floor_type,object,4,0,"[q, x, s, j]"
9,position,object,4,0,"[s, t, o, j]"


In [54]:
#create numerical transformer


numerical_transformer = Pipeline([('imputer', SimpleImputer(strategy='mean')), 
                                  ('scaler', StandardScaler()) ])

#create categorical transformer
categorical_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='most_frequent')),
                                            ('onehot', OneHotEncoder(handle_unknown='ignore'))
                                            ])

base_encoder_columns = ['Division', 'Qualification', 'Channel_of_Recruitment', 'State_Of_Origin', 'Foreign_schooled', 'Marital_Status', 'Previous_IntraDepartmental_Movement', 'No_of_previous_employers', 'Gender']
base_encoder = Pipeline(steps=[
    ('base_encoder', BaseNEncoder(cols=base_encoder_columns, base=3))
])

In [56]:
# Combine the transformations using ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('base_name', base_encoder, base_encoder_columns),  # TargetEncoder for 'town'
    ('num', numerical_transformer, numerical_cols)])

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create a RandomForestClassifier model
rf = RandomForestClassifier(n_estimators=150, random_state=42, max_depth=4)

# Model pipeline
rf_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                          ('random_forest', rf)
                         ])

# Preprocessing of training data, fit model 
rf_pipe.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
rf_preds = rf_pipe.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, rf_preds)
print('Accuracy for Random Forest Model:', accuracy)

# Detailed classification report
print('Classification Report:\n', classification_report(y_test, rf_preds))


ValueError: A given column is not a column of the dataframe