In [2]:
import pandas as pd

df = pd.read_csv("train_values.csv")

categorical_columns = [
     "roof_type               ",
     "land_surface_condition  ",
     "legal_ownership_status  ",
     "other_floor_type        ",
     "position                ",
     "foundation_type         ",
     "ground_floor_type       ",
     "count_floors_pre_eq     ",
     "count_families          ",
     "plan_configuration      " 
]

bool_columns = [
      "has_superstructure_adobe_mud                ",
      "has_superstructure_bamboo                   ",
      "has_secondary_use_rental                    ", 
      "has_secondary_use_hotel                     ", 
      "has_secondary_use                           ",  
      "has_secondary_use_agriculture               ", 
      "has_superstructure_other                    ", 
      "has_superstructure_rc_engineered            ",  
      "has_superstructure_rc_non_engineered        ",  
      "has_superstructure_cement_mortar_stone      ",  
      "has_superstructure_timber                   ",  
      "has_superstructure_cement_mortar_brick      ",  
      "has_superstructure_mud_mortar_brick         ",  
      "has_superstructure_mud_mortar_stone         ",  
      "has_superstructure_stone_flag               ",  
      "has_secondary_use_institution               ",  
      "has_secondary_use_health_post               ",  
      "has_secondary_use_other                     ",  
      "has_secondary_use_use_police                ",  
      "has_secondary_use_gov_office                ",  
      "has_secondary_use_school                    ",  
      "has_secondary_use_industry                  "  
]
categorical_columns = list(map(lambda x: x.strip(), categorical_columns))
bool_columns = list(map(lambda x: x.strip(), bool_columns))

df[categorical_columns] = df[categorical_columns].astype("category")
df[bool_columns] = df[bool_columns].astype("bool")

## Random Forest

### base line model with bagging algorithm

1. Does not require data scaling
2. Less chance of overfitting
3. Not highly damaged from small noise or outliers
4. Easily find importance of columns

1. Weak for catching patterns than boosting algorithms
2. Requires large memories
3. Can take **ONLY** float(or integer) type, never categorical or boolean, string itself. They need to be encoded to numeric.

Hyperparameters
| Name | Role | Effect When Increasing / Decreasing | dataset characteristic |
|------|-------|-------------------------------------|-----------------------|
| n_estimators | Number of trees | Increase: more stable, lower variance, slower.<br> Decrease: faster but higher variance. | more noise, higher this parameter |
| max_depth | Maximum depth of each tree | Increase: deeper, more complex, more overfitting.<br> Decrease: simpler, less overfitting, higher bias. | - |
| min_samples_split | Minimum samples to split a node | Increase: fewer splits, simpler trees.<br> Decrease: more splits, deeper trees. | More outliers, highter this parameter |
| min_samples_leaf | Minimum samples per leaf | Increase: smoother prediction, less overfitting.<br> Decrease: more complex tree, more overfitting. | For imbalanced dataset, it should be low enough to capture minority classes |
| max_features | Features considered at each split | Increase: trees become similar, lower randomness.<br> Decrease: higher randomness, better generalization. | Once important features are well known, higher this parameter |
| max_leaf_nodes | Maximum leaf nodes | Increase: more complex trees.<br> Decrease: simpler trees. | - |
| max_samples | Samples per tree (with bootstrap) | Increase: more data per tree, less randomness.<br> Decrease: faster, more randomness. | For very large dataset, decrease for speed |
| class_weight | Class imbalance handling | Higher minority weight: better recall, lower precision.<br> Lower weight: can ignore minority. | For imbalanced dataset, 'balanced' can take minority class work better |




In [6]:

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# ==========================================
# 1. Data Preparation
# ==========================================
X = df.drop(columns=categorical_columns)
y = pd.read_csv("train_labels.csv")["damage_grade"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ==========================================
# 2. Model Initialization
# ==========================================
# n_jobs=-1 uses all available CPU cores
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# ==========================================
# 3. Hyperparameter Grid Definition
# ==========================================
param_grid = {
    'n_estimators': [50, 100],              # Number of trees
    'min_samples_leaf': [1, 2, 4],           # Min samples at leaf node
    'max_features': ['sqrt', 'log2'],        # Features to consider at split
    'class_weight': [None, 'balanced'],      # Handling imbalance
}

# ==========================================
# 4. Grid Search Execution
# ==========================================
gs = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,                 # 5-Fold Cross Validation
    scoring='f1_macro',
    n_jobs=-1,            # Parallel processing
)

print("Starting Hyperparameter Tuning...")
gs.fit(X_train, y_train)

# ==========================================
# 5. Results & Evaluation
# ==========================================
print(f"Best Parameters: {gs.best_params_}")
print(f"Best CV Score: {gs.best_score_:.4f}")

# Predict using the best model found
best_model = gs.best_estimator_
y_pred = best_model.predict(X_test)

print("\n[Test Set Evaluation]")
print(classification_report(y_test, y_pred))

# local CV Score : 0.6603

Starting Hyperparameter Tuning...
Best Parameters: {'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 100}
Best CV Score: 0.6603

[Test Set Evaluation]
              precision    recall  f1-score   support

           1       0.53      0.66      0.59      5025
           2       0.75      0.74      0.75     29652
           3       0.70      0.66      0.68     17444

    accuracy                           0.71     52121
   macro avg       0.66      0.69      0.67     52121
weighted avg       0.71      0.71      0.71     52121



## CatBoost

### Categorical boosting.

1. Best for dealing categorical columns
2. very slower than GBM
3. weak for numeric columns than GBM
4. Very weak for sparse-matrix (like our 'has_superstructure_ \~~~ columns or has_secondary_usage_ ~~~ columns)
5. Many convenient black-box features like automatical hyperparameter tuning, filling missing values...
6. Least require of hyperparameter tuning


| Parameter | Description | If Increased | If Decreased | When You Should Tune It |
|----------|-------------|--------------|--------------|---------------------------|
| **iterations** | Total number of boosting rounds. | More complex model; higher chance of overfitting but higher accuracy. | Simpler model; might underfit. | Tune together with learning_rate. Increase when learning_rate is small. |
| **learning_rate** | Step size per boosting round. | Faster learning but may overfit or become unstable. | Slower learning; requires more iterations but more stable. | Always tune. Lower lr + higher iterations gives best results. |
| **depth** | Maximum depth of trees; affects interaction modeling. | More complex trees; risk of overfitting. | Simpler model; less expressive. | Tune when underfitting/overfitting. Usual range: 4–10. |
| **l2_leaf_reg** | L2 regularization on leaf weights (CatBoost’s only simple regularizer). | Stronger regularization → reduces variance. | Weaker regularization → more flexible but may overfit. | Tune if overfitting and depth/iterations already balanced. |
| **bagging_temperature** | Controls sampling randomness (0 = deterministic). | More randomness; can reduce overfitting. | Less randomness; more stable but might overfit. | Useful for large datasets or when overfitting. |
| **rsm** | Column sampling ratio per tree (feature subsampling). | Uses more features → more complex model. | More randomness → less overfitting; can speed training. | Tune for wide/high-dimensional datasets. |
| **leaf_estimation_iterations** | Number of gradient steps per leaf update. | More precise leaf estimation; slower but more accurate. | Faster but less accurate per tree. | Tune only for noisy or complex regression tasks. |
| **random_strength** | Randomness added to score when selecting splits. | More randomness → prevents overfitting. | More deterministic → possibly overfits. | Use when small dataset or overfitting. |
| **bootstrap_type** | Sampling method: Bayesian, Bernoulli, Poisson. | Affects robustness; some types fight overfitting more. | N/A | Tune mainly for imbalanced/large datasets. |
| **scale_pos_weight** | Weight for positive class (binary). | Improves recall for minority class. | Reduces sensitivity to minority class. | Quick alternative to class_weights for binary imbalance. |
| **auto_class_weights** | Automatic class weighting (“Balanced”). | Handles imbalance automatically. | No balancing. | When you want simple handling of imbalanced data. |


In [8]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# ==========================================
# 1. Data Preparation
# ==========================================
X = df.copy() 
y_raw = pd.read_csv("train_labels.csv")["damage_grade"]

le = LabelEncoder()
y = le.fit_transform(y_raw)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ==========================================
# 2. Model Initialization
# ==========================================
cb = CatBoostClassifier(
    loss_function='MultiClass',
    random_seed=42,
    verbose=0,                 
    task_type='GPU',           
    devices='0'             
)

# ==========================================
# 3. Hyperparameter Grid Definition
# ==========================================
param_grid = {
    'iterations': [500, 1000],       # n_estimators
    'learning_rate': [0.1, 0.05],
    'depth': [6, 8],                 # very slow for deep trees
    'auto_class_weights': ['None', 'SqrtBalanced'], # usually more stable than 'Balanced'
}

# ==========================================
# 4. Grid Search Execution
# ==========================================
fit_params = {
    'cat_features': categorical_columns, 
    'early_stopping_rounds': 50,
    'verbose': 0
}

gs = GridSearchCV(
    estimator=cb,
    param_grid=param_grid,
    cv=3,
    scoring='f1_macro',
    n_jobs=1,
    verbose=2
)

print("Starting Hyperparameter Tuning with CatBoost...")
gs.fit(X_train, y_train, **fit_params)

# ==========================================
# 5. Results & Evaluation
# ==========================================
print(f"Best Parameters: {gs.best_params_}")
print(f"Best CV Score: {gs.best_score_:.4f}")

best_model = gs.best_estimator_
y_pred = best_model.predict(X_test)

y_pred = y_pred.flatten()

y_test_org = le.inverse_transform(y_test)
y_pred_org = le.inverse_transform(y_pred)

print("\n[Test Set Evaluation]")
print(classification_report(y_test_org, y_pred_org))

# local CV Score : 0.6786

Starting Hyperparameter Tuning with CatBoost...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END auto_class_weights=None, depth=6, iterations=500, learning_rate=0.1; total time=   4.0s
[CV] END auto_class_weights=None, depth=6, iterations=500, learning_rate=0.1; total time=   3.8s
[CV] END auto_class_weights=None, depth=6, iterations=500, learning_rate=0.1; total time=   3.8s
[CV] END auto_class_weights=None, depth=6, iterations=500, learning_rate=0.05; total time=   3.8s
[CV] END auto_class_weights=None, depth=6, iterations=500, learning_rate=0.05; total time=   3.7s
[CV] END auto_class_weights=None, depth=6, iterations=500, learning_rate=0.05; total time=   3.8s
[CV] END auto_class_weights=None, depth=6, iterations=1000, learning_rate=0.1; total time=   6.9s
[CV] END auto_class_weights=None, depth=6, iterations=1000, learning_rate=0.1; total time=   6.9s
[CV] END auto_class_weights=None, depth=6, iterations=1000, learning_rate=0.1; total time=   6.9s
[CV] END aut

### Ensemble

using votingClassifier

In [9]:
import pandas as pd
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

# ==========================================
# 1. Data Preparation
# ==========================================
X = df.copy() 
y_raw = pd.read_csv("train_labels.csv")["damage_grade"]

le = LabelEncoder()
y = le.fit_transform(y_raw)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ==========================================
# 2. Non-Categorical Pipeline
# ==========================================
rf_preprocessor = ColumnTransformer(
    transformers=[
        ('drop_cat', 'drop', categorical_columns) # remove categorical columns
    ],
    remainder='passthrough'
)

rf_pipe = Pipeline([
    ('preprocessor', rf_preprocessor),
    ('rf', RandomForestClassifier(
        n_estimators=100, 
        min_samples_leaf=2,
        max_features='sqrt',
        class_weight='balanced',
        random_state=42,
        n_jobs=4
    ))
])

# ==========================================
# 3. For CatBoost Classifier
# ==========================================
cb_clf = CatBoostClassifier(
    iterations=1000, 
    learning_rate=0.1,
    depth=8,
    auto_class_weights='SqrtBalanced',
    cat_features=categorical_columns, # Important!
    task_type='GPU',
    devices='0',
    verbose=0,
    random_seed=42
)

# ==========================================
# 4. VotingClassifier definition
# ==========================================
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_pipe),  # name, model pipeline
        ('cb', cb_clf)    # name, model pipeline
    ],
    voting='soft',        # hard : majority voting, soft : probability based voting
    n_jobs=1              
)

params = {
    'weights': [
        [1, 2],    
    ]
}

grid_vote = GridSearchCV(
    estimator=voting_clf,
    param_grid=params,
    cv=3,
    scoring='f1_macro',
    n_jobs=1,  
    verbose=2
)

print("Ensemble Model Training with VotingClassifier...")
grid_vote.fit(X_train, y_train)
best_model = grid_vote.best_estimator_
y_train_pred = best_model.predict(X_train)
train_score = f1_score(y_train, y_train_pred, average='macro')

# ==========================================
# 5. Results
# ==========================================
print(f"Best Weights: {grid_vote.best_params_}")
print(f"Best Ensemble Score: {grid_vote.best_score_:.4f}")
print(f"Training Score (Check overfitting): {train_score:.4f}")

final_model = grid_vote.best_estimator_
y_pred = final_model.predict(X_test)

#local CV Score : 0.6804

Ensemble Model Training with VotingClassifier...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END .....................................weights=[1, 2]; total time=  17.9s
[CV] END .....................................weights=[1, 2]; total time=  18.1s
[CV] END .....................................weights=[1, 2]; total time=  19.7s
Best Weights: {'weights': [1, 2]}
Best Ensemble Score: 0.6803
Training Score (Check overfitting): 0.7792


In [10]:
import pandas as pd

# ==========================================
# 1. test data load
# ==========================================
print("Test Data Loading...")
test_df = pd.read_csv("test_values.csv")
building_ids = test_df["building_id"] # id for submission

for col in categorical_columns:
    test_df[col] = test_df[col].astype('category')

# ==========================================
# 2. Final Prediction (Ensemble Model)
# ==========================================
print("Predicting with Final Ensemble Model...")
final_model = grid_vote.best_estimator_
y_test_pred = final_model.predict(test_df)

# ==========================================
# 3. Label Inverse Transformation
# ==========================================

y_test_pred_org = le.inverse_transform(y_test_pred)

# ==========================================
# 4. CSV 저장
# ==========================================
submission = pd.DataFrame({
    "building_id": building_ids,
    "damage_grade": y_test_pred_org
})

submission.to_csv("UwUSuperCute.csv", index=False)

print("UwUSuperCute.csv Done!")

Test Data Loading...
Predicting with Final Ensemble Model...
UwUSuperCute.csv Done!
