In [28]:
import pandas as pd

# Load the raw feature table; the casts below are applied in place on this frame.
df = pd.read_csv("train_values.csv")

# Columns handled as categories. The literals are padded for visual alignment,
# so every name is stripped before it is used as a column label.
categorical_columns = [
    padded_name.strip()
    for padded_name in [
        "roof_type               ",
        "land_surface_condition  ",
        "legal_ownership_status  ",
        "other_floor_type        ",
        "position                ",
        "foundation_type         ",
        "ground_floor_type       ",
        "count_floors_pre_eq     ",
        "count_families          ",
        "plan_configuration      ",
    ]
]

# Flag columns cast to bool (same padded-literal convention as above).
bool_columns = [
    padded_name.strip()
    for padded_name in [
        "has_superstructure_adobe_mud                ",
        "has_superstructure_bamboo                   ",
        "has_secondary_use_rental                    ",
        "has_secondary_use_hotel                     ",
        "has_secondary_use                           ",
        "has_secondary_use_agriculture               ",
        "has_superstructure_other                    ",
        "has_superstructure_rc_engineered            ",
        "has_superstructure_rc_non_engineered        ",
        "has_superstructure_cement_mortar_stone      ",
        "has_superstructure_timber                   ",
        "has_superstructure_cement_mortar_brick      ",
        "has_superstructure_mud_mortar_brick         ",
        "has_superstructure_mud_mortar_stone         ",
        "has_superstructure_stone_flag               ",
        "has_secondary_use_institution               ",
        "has_secondary_use_health_post               ",
        "has_secondary_use_other                     ",
        "has_secondary_use_use_police                ",
        "has_secondary_use_gov_office                ",
        "has_secondary_use_school                    ",
        "has_secondary_use_industry                  ",
    ]
]

# Apply the dtype conversions group by group.
df[categorical_columns] = df[categorical_columns].astype("category")
df[bool_columns] = df[bool_columns].astype("bool")

def print_style(msg:str):
      """Print ``msg`` framed by a dashed rule above and below, with blank lines around the frame."""
      rule = "----------------------------"
      # One print call; the separator reproduces the line breaks of three separate prints.
      print("\n" + rule, msg, rule + "\n", sep="\n")

In [29]:
import io

print_style(f"shape of data set : {df.shape}")
print_style(f"dtypes of data set : {df.dtypes}")

# DataFrame.info() writes its report to a stream and returns None, so embedding the
# call in an f-string would print "None". Capture the report in a buffer instead.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
print_style(f"info of data set : {info_buffer.getvalue()}")

print_style(f"Null values in dataset : {df.isnull().sum()}")
print_style(f"Unique values in dataset sorted: {df.nunique().sort_values()}")
# NOTE(review): only the last expression of a cell is rendered, so df.head() produces
# no visible output here; move it to its own cell if the first rows should be shown.
df.head()
df.tail()



----------------------------
shape of data set : (260601, 39)
----------------------------


----------------------------
dtypes of data set : building_id                                  int64
geo_level_1_id                               int64
geo_level_2_id                               int64
geo_level_3_id                               int64
count_floors_pre_eq                       category
age                                          int64
area_percentage                              int64
height_percentage                            int64
land_surface_condition                    category
foundation_type                           category
roof_type                                 category
ground_floor_type                         category
other_floor_type                          category
position                                  category
plan_configuration                        category
has_superstructure_adobe_mud                  bool
has_superstructure_mud_mortar_stone     

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
260596,688636,25,1335,1621,1,55,6,3,n,r,...,False,False,False,False,False,False,False,False,False,False
260597,669485,17,715,2060,2,0,6,5,t,r,...,False,False,False,False,False,False,False,False,False,False
260598,602512,17,51,8163,3,55,6,7,t,r,...,False,False,False,False,False,False,False,False,False,False
260599,151409,26,39,1851,2,10,14,6,t,r,...,False,False,False,False,False,False,False,False,False,False
260600,747594,21,9,9101,3,10,7,6,n,r,...,False,False,False,False,False,False,False,False,False,False


### Conclusion :
 - shape : (260601, 39)
 - no missing elements
 - types are only int, bool and str
 - many booleans and categorical data (could be replaced or factorized) :
      - Unique values in dataset sorted:
      booleans                                       2
            superstructure (adobe mud, mud mortar stone, stone flag, cement mortar stone, mud mortar brick, cement mortar brick, timber, bamboo, rc (non-)engineered, other)
            secondary use (0/1, agriculture, hotel, rental, institution, school, industry, health post, gov office, police, other)
      roof_type                                      3
      land_surface_condition                         3
      legal_ownership_status                         4
      other_floor_type                               4
      position                                       4
      foundation_type                                5
      ground_floor_type                              5
      count_floors_pre_eq                            9
      count_families                                10
      plan_configuration                            10
      height_percentage                             27
      geo_level_1_id                                31
      age                                           42
      area_percentage                               84
      geo_level_2_id                              1414
      geo_level_3_id                             11595
      building_id                               260601

In [30]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def safe_sort_unique(vals):
    """Sort ``vals`` numerically when every item converts to float, otherwise by natural order.

    Args:
        vals: Iterable of values (typically category labels).

    Returns:
        A new list. Items are ordered by ``float(item)`` when all conversions
        succeed; if any conversion fails the plain ``sorted`` order is used.
    """
    try:
        return sorted(vals, key=float)
    except (TypeError, ValueError):
        # Only conversion failures trigger the fallback; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return sorted(vals)

def plot_bool(col):
    """Show a two-bar chart with the number of False and True values in ``col``."""
    # Count each value; sorting the index puts False before True.
    tally = col.value_counts().sort_index()
    bar_positions = tally.index.astype(int)  # False -> 0, True -> 1

    plt.figure(figsize=(6, 4))
    plt.title(f"Distribution of {col.name}", fontsize=14)
    plt.bar(bar_positions, tally.values)
    plt.xticks([0, 1], ['False', 'True'])
    plt.ylabel("Count")
    plt.xlabel(col.name)
    plt.tight_layout()
    plt.show()

def plot_categorical(col):
    """Show a bar chart of the value frequencies of ``col``, one bar per label.

    Labels are compared as strings and ordered with ``safe_sort_unique``.
    """
    plt.figure(figsize=(8, 4))
    plt.title(f"{col.name} - Categorical Distribution", fontsize=14)

    # Frequencies per label, re-ordered so the bars follow the sorted labels
    frequencies = col.astype(str).value_counts()
    frequencies = frequencies.reindex(safe_sort_unique(frequencies.index))
    slots = range(len(frequencies))

    plt.bar(slots, frequencies.values)
    plt.xticks(slots, frequencies.index, rotation=45, ha='right')
    plt.ylabel("Count")
    plt.xlabel(col.name)
    plt.tight_layout()
    plt.show()

def plot_ints(col, log_numeric=()):
    """Plot the distribution of a numeric column as a density histogram with KDE.

    Args:
        col: Series to plot; missing values are dropped first.
        log_numeric: Collection of column names that additionally get a
            log-scaled panel. The default is an immutable empty tuple so the
            default object can never be mutated between calls.

    Returns:
        None. When the column has no non-missing values a warning is printed
        and nothing is drawn.
    """
    # Remove NaN values
    col_clean = col.dropna()

    if len(col_clean) == 0:
        print(f"Warning: {col.name} has no valid data to plot")
        return

    if col.name not in log_numeric:
        # Single linear-scale panel
        plt.figure(figsize=(8, 4))
        plt.title(f"Distribution of {col.name}", fontsize=14)
        sns.histplot(col_clean, kde=True, stat="density")
        plt.xlabel(col.name)
        plt.ylabel("Density")
        plt.tight_layout()
        plt.show()
        return

    # Two panels: linear scale on the left, log scale on the right
    fig, axes = plt.subplots(1, 2, figsize=(14, 4))
    fig.suptitle(f"Distribution of {col.name}", fontsize=15)

    # Linear scale
    sns.histplot(col_clean, kde=True, stat="density", ax=axes[0])
    axes[0].set_title(f"{col.name} - Linear Scale")
    axes[0].set_xlabel(col.name)

    # Log scale (visual only) - zeros and negatives cannot be shown on a log axis
    col_positive = col_clean[col_clean > 0]
    if len(col_positive) > 0:
        sns.histplot(col_positive, kde=True, stat="density", ax=axes[1])
        axes[1].set_xscale('log')
        axes[1].set_title(f"{col.name} - Log Scale (positive values only)")
        axes[1].set_xlabel(col.name)
    else:
        axes[1].text(0.5, 0.5, 'No positive values for log scale',
                     ha='center', va='center', transform=axes[1].transAxes)
        axes[1].set_title(f"{col.name} - Log Scale (no data)")

    plt.tight_layout()
    plt.show()

def plot_all_data(df):
    """Plot every column of ``df`` with the plot helper that matches its dtype.

    Boolean columns go to ``plot_bool``, integer columns to ``plot_ints`` and
    categorical or object columns to ``plot_categorical``. Columns of any
    other dtype are skipped.

    Args:
        df: DataFrame whose columns are plotted one by one.
    """
    LOG_NUMERIC = ["age", "area_percentage", "height_percentage"]

    for col_name in df.columns:
        col = df[col_name]
        col_dtype = col.dtype

        # Booleans are tested before integers so that flag columns never reach
        # the numeric histogram branch.
        if pd.api.types.is_bool_dtype(col_dtype):
            plot_bool(col)
        elif pd.api.types.is_integer_dtype(col_dtype):
            plot_ints(col, log_numeric=LOG_NUMERIC)
        # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype
        elif isinstance(col_dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(col_dtype):
            plot_categorical(col)

#plot_all_data(df)

In [31]:
from pandas import DataFrame


def fix_imbalance(df, dtype, rf_hyperparameters:dict, basic_thresholds:dict, use_thresholds:bool):
      """Placeholder dispatcher that selects an imbalance fix from the dtype name.

      No branch is implemented yet; the function always returns None.

      Design notes:
        - only the relevant entries of rf_hyperparameters should be used
        - several transformations may be chained (reduction, duplication, ...),
          each applied only when needed
        - at each step a formula over rf_hyperparameters can be used to derive
          a basic threshold (drop_rate, duplicate_rate, imbalance level, ...)
        - use_thresholds decides whether basic_thresholds or the
          rf_hyperparameters formulas are used
      """
      if dtype == "int64":
            # call a function
            pass
      elif dtype == "bool":
            # call a function
            pass
      elif dtype == "category":
            # call a function
            pass
      return None



def preprocess_rf(df:DataFrame, bool_thresh=0.95, cat_min_freq=0.02, clip_quantile=None):
      """Placeholder for the Random Forest preprocessing loop (not implemented yet).

      Creates an empty result frame and walks over the columns of ``df``
      without transforming them; the function currently returns None.
      A function with the same name is defined again later in the notebook.
      """
      # Was `pd>DataFrame()`, a comparison expression rather than a constructor call.
      preprocess_df = pd.DataFrame()
      for col in df.columns:
            # modify the cols and save them in preprocess_df using the fix_imbalance() function
            ...


In [32]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from typing import Tuple, Optional
import warnings


"""
      drop thresholds : 
            bool : 1 - safety_factor * (min_samples_leaf / n_samples) * (1 / max_features) * (1 / np.sqrt(n_estimators))
            categorical : (min_samples_split / n_samples) * (1 / max_features) * (1 / np.sqrt(n_estimators))
      skew fix to log threshold : 
            n_samples / (min_samples_leaf * (n_estimators ** 0.25))
      consider categorical threshold : 
            n_unique <= min(20, int(np.sqrt(n_samples) / 10)) where n_unique is the number of distinct values in the column
      
      the other functions are just implementations of these rules

"""


def _max_features_fraction(max_features) -> float:
    """Translate an RF ``max_features`` setting into the factor used by the threshold formulas.

    'sqrt' maps to 0.33 and 'log2' to 0.2 (fixed approximations used by this
    notebook); any other string, and None, map to 1.0. Numeric values are
    returned unchanged.
    """
    if max_features is None:
        # None would otherwise fail in `1 / max_features`; treat it as "all features".
        return 1.0
    if isinstance(max_features, str):
        return {'sqrt': 0.33, 'log2': 0.2}.get(max_features, 1.0)
    return max_features

def calculate_bool_drop_threshold(n_samples: int, rf_params: dict, safety_factor: float = 1.5) -> float:
    """
    Calculate threshold for dropping imbalanced boolean features.
    Drop if: max(p_0, p_1) > threshold
    threshold = 1 - safety_factor * (min_samples_leaf / n_samples) * (1 / max_features) * (1 / np.sqrt(n_estimators))

    Args:
        n_samples: Number of rows in the dataset.
        rf_params: RF hyperparameters; reads 'min_samples_leaf' (default 1),
            'max_features' (default 0.33) and 'n_estimators' (default 100).
        safety_factor: Multiplier applied to the tolerated minority share.

    Returns:
        The threshold, clamped to the range [0.5, 0.999].
    """
    min_samples_leaf = rf_params.get('min_samples_leaf', 1)
    max_features = _max_features_fraction(rf_params.get('max_features', 0.33))
    n_estimators = rf_params.get('n_estimators', 100)

    threshold = 1 - safety_factor * (min_samples_leaf / n_samples) * (1 / max_features) * (1 / np.sqrt(n_estimators))
    return float(min(0.999, max(0.5, threshold)))  # Clamp between 0.5 and 0.999

def calculate_categorical_merge_threshold(n_samples: int, rf_params: dict) -> float:
    """
    Calculate threshold for merging rare categorical values.
    Merge if: frequency < threshold
    threshold = (min_samples_split / n_samples) * (1 / max_features) * (1 / np.sqrt(n_estimators))

    Args:
        n_samples: Number of rows in the dataset.
        rf_params: RF hyperparameters; reads 'min_samples_split' (default 2),
            'max_features' (default 0.33) and 'n_estimators' (default 100).

    Returns:
        The threshold, clamped to the range [0.0001, 0.1].
    """
    min_samples_split = rf_params.get('min_samples_split', 2)
    max_features = _max_features_fraction(rf_params.get('max_features', 0.33))
    n_estimators = rf_params.get('n_estimators', 100)

    threshold = (min_samples_split / n_samples) * (1 / max_features) * (1 / np.sqrt(n_estimators))
    return float(max(0.0001, min(0.1, threshold)))  # Clamp between 0.01% and 10%

def calculate_skew_transform_threshold(n_samples: int, rf_params: dict) -> Tuple[float, float]:
    """
    Derive the two limits that gate the log transform of a numeric feature.

    Reads 'min_samples_leaf' (default 1) and 'n_estimators' (default 100)
    from ``rf_params``.

    Returns: (skew_threshold, range_threshold) where the skew threshold is the
    constant 3.0 and the range threshold is never below 100.
    """
    leaf_size = rf_params.get('min_samples_leaf', 1)
    tree_count = rf_params.get('n_estimators', 100)

    # n_samples / (min_samples_leaf * n_estimators ** 0.25), floored at 100
    raw_range_limit = n_samples / (leaf_size * (tree_count ** 0.25))

    return 3.0, max(100, raw_range_limit)

def should_treat_as_categorical(n_unique: int, n_samples: int) -> bool:
    """
    Decide whether a column with ``n_unique`` distinct values counts as categorical.

    The cardinality cap is int(sqrt(n_samples) / 10), but never more than 20.
    """
    size_based_cap = int(np.sqrt(n_samples) / 10)
    cardinality_cap = min(20, size_based_cap)
    return n_unique <= cardinality_cap

def fix_bool_imbalance(col: pd.Series, n_samples: int, rf_params: dict, 
                       basic_threshold: float, use_formulas: bool) -> Optional[pd.Series]:
    """
    Decide whether a boolean column is too one-sided to keep.

    The share of the most frequent value is compared with either the
    RF-derived threshold (``use_formulas`` True) or ``basic_threshold``.

    Returns the column unchanged when it is kept, or None when it has no
    values or its dominant share exceeds the threshold.
    """
    shares = col.value_counts(normalize=True)
    if len(shares) == 0:
        return None

    # value_counts sorts by frequency, so the first entry is the dominant share
    dominant_share = shares.iloc[0]

    threshold = (
        calculate_bool_drop_threshold(n_samples, rf_params)
        if use_formulas
        else basic_threshold
    )

    if dominant_share > threshold:
        print(f"   ‚ùå DROP: {col.name} (max_freq={dominant_share:.4f} > threshold={threshold:.4f})")
        return None

    return col

def fix_categorical_imbalance(col: pd.Series, n_samples: int, rf_params: dict,
                               basic_threshold: float, use_formulas: bool,
                               min_categories: int = 10) -> pd.Series:
    """
    Fix categorical column by merging rare categories into 'Other'.

    Args:
        col: Categorical or string-like column.
        n_samples: Number of rows in the dataset (used by the RF formula).
        rf_params: RF hyperparameters passed to the threshold formula.
        basic_threshold: Manual minimum frequency, used when ``use_formulas`` is False.
        use_formulas: If True, derive the threshold from ``rf_params``.
        min_categories: Columns with at most this many distinct values are left untouched.

    Returns:
        A copy of the column in which every observed value whose relative
        frequency is below the threshold is replaced by 'Other'. A categorical
        input yields a categorical result whose categories are rebuilt from
        the merged values; missing values are preserved.
    """
    col = col.copy()
    n_unique = col.nunique()

    # Don't merge if low cardinality
    if n_unique <= min_categories:
        return col

    freq = col.value_counts(normalize=True)
    # A categorical column also reports declared-but-unused categories with
    # frequency 0; they are not present in the data, so they are not "rare".
    freq = freq[freq > 0]

    # Determine threshold
    if use_formulas:
        threshold = calculate_categorical_merge_threshold(n_samples, rf_params)
    else:
        threshold = basic_threshold

    # Find rare categories
    rare_categories = freq[freq < threshold].index.tolist()

    if rare_categories:
        # Series.replace on a categorical column is deprecated for changes that
        # alter the category set, so merge on plain values and rebuild the
        # categorical dtype afterwards.
        is_categorical = isinstance(col.dtype, pd.CategoricalDtype)
        values = col.astype(object) if is_categorical else col
        col = values.where(~values.isin(rare_categories), 'Other')
        if is_categorical:
            col = col.astype('category')
        print(f"   üîß MERGE: {col.name} - merged {len(rare_categories)}/{n_unique} categories (freq < {threshold:.4f})")

    return col

def fix_numeric_skewness(col: pd.Series, n_samples: int, rf_params: dict,
                         use_formulas: bool) -> pd.Series:
    """
    Apply log1p to a numeric column when it is both heavily skewed and wide-ranged.

    Columns with negative values, or with at most one distinct value, are
    returned as an untouched copy. Otherwise the transform is applied only
    when the skewness and the value range both exceed their thresholds
    (RF-derived when ``use_formulas`` is True, else 3.0 and 1000).
    """
    series = col.copy()

    lowest = series.min()
    # Guard clauses: log1p is not applied to negative data or constant columns
    if lowest < 0:
        return series
    if series.nunique() <= 1:
        return series

    skewness = series.skew()
    highest = series.max()
    # With a zero minimum the ratio is undefined, so the maximum itself is used
    spread = highest if lowest == 0 else highest / lowest

    if use_formulas:
        skew_limit, spread_limit = calculate_skew_transform_threshold(n_samples, rf_params)
    else:
        skew_limit, spread_limit = 3.0, 1000

    if skewness > skew_limit and spread > spread_limit:
        series = np.log1p(series)
        print(f"   üìä LOG-TRANSFORM: {series.name} (skew={skewness:.2f}, range={spread:.2f})")

    return series

def fix_small_int_as_categorical(col: pd.Series, n_samples: int, rf_params: dict,
                                  basic_threshold: float, use_formulas: bool,
                                  min_categories: int = 10) -> pd.Series:
    """
    Re-type a low-cardinality integer column as categorical and merge its rare values.

    When ``should_treat_as_categorical`` rejects the column it is returned
    unchanged; otherwise its values are converted to strings, cast to
    'category' and passed through ``fix_categorical_imbalance``.
    """
    distinct_count = col.nunique()

    if should_treat_as_categorical(distinct_count, n_samples):
        print(f"   üîÑ TREAT AS CATEGORICAL: {col.name} ({distinct_count} unique values)")

        # String labels first, then the categorical dtype
        as_category = col.astype(str).astype('category')
        return fix_categorical_imbalance(
            as_category, n_samples, rf_params, basic_threshold, use_formulas, min_categories
        )

    return col

def fix_imbalance(col: pd.Series, df_shape: Tuple[int, int], rf_hyperparameters: dict,
                  basic_thresholds: dict, use_thresholds: bool,
                  min_categories: int = 10) -> Optional[pd.Series]:
    """
    Fix imbalance in a single column based on its dtype.
    
    Args:
        col: pandas Series to preprocess
        df_shape: (n_samples, n_features) tuple
        rf_hyperparameters: dict with RF params (n_estimators, max_depth, etc.)
        basic_thresholds: dict with manual thresholds (bool_thresh, cat_min_freq, etc.)
        use_thresholds: if True, use basic_thresholds; if False, use RF formulas
        min_categories: columns with at most this many distinct values are
            never merged; forwarded to the categorical fixes
        
    Returns:
        Preprocessed column or None if should be dropped
    """
    n_samples = df_shape[0]
    dtype_str = str(col.dtype)
    use_formulas = not use_thresholds  # the helpers take the opposite flag
    cat_min_freq = basic_thresholds.get('cat_min_freq', 0.02)

    # Boolean columns (tested before integers so flags never reach the numeric branch)
    if pd.api.types.is_bool_dtype(col):
        return fix_bool_imbalance(
            col, n_samples, rf_hyperparameters,
            basic_thresholds.get('bool_thresh', 0.95),
            use_formulas
        )

    # Integer columns: low-cardinality ones are handled as categories
    if pd.api.types.is_integer_dtype(col):
        if should_treat_as_categorical(col.nunique(), n_samples):
            return fix_small_int_as_categorical(
                col, n_samples, rf_hyperparameters,
                cat_min_freq, use_formulas, min_categories
            )
        return fix_numeric_skewness(col, n_samples, rf_hyperparameters, use_formulas)

    # Categorical / text columns. The isinstance check replaces the deprecated
    # pd.api.types.is_categorical_dtype; string dtypes are accepted as well so
    # text columns stored with a dedicated string dtype are not reported as unknown.
    if (isinstance(col.dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(col)
            or pd.api.types.is_string_dtype(col)):
        return fix_categorical_imbalance(
            col, n_samples, rf_hyperparameters,
            cat_min_freq, use_formulas, min_categories
        )

    # Float columns (numeric)
    if pd.api.types.is_float_dtype(col):
        return fix_numeric_skewness(col, n_samples, rf_hyperparameters, use_formulas)

    # Unknown dtype - return as is
    warnings.warn(f"Unknown dtype for column {col.name}: {dtype_str}")
    return col

def preprocess_rf(df: DataFrame, 
                  rf_hyperparameters: Optional[dict] = None,
                  use_rf_formulas: bool = True,
                  bool_thresh: float = 0.95,
                  cat_min_freq: float = 0.02,
                  min_categories: int = 10) -> DataFrame:
    """
    Preprocess dataframe for Random Forest by fixing feature imbalances.
    
    Args:
        df: Input dataframe
        rf_hyperparameters: RF hyperparameters dict. If None, uses defaults.
        use_rf_formulas: If True, calculate thresholds from RF params. If False, use manual thresholds.
        bool_thresh: Manual threshold for dropping boolean columns (if use_rf_formulas=False)
        cat_min_freq: Manual threshold for merging categories (if use_rf_formulas=False)
        min_categories: Don't merge categories if the column has at most this many unique values
        
    Returns:
        Preprocessed dataframe containing every column that was not dropped,
        in the original column order
    """
    # Default RF hyperparameters
    if rf_hyperparameters is None:
        rf_hyperparameters = {
            'n_estimators': 100,
            'max_depth': None,
            'min_samples_split': 20,
            'min_samples_leaf': 10,
            'max_features': 'sqrt'
        }
    
    basic_thresholds = {
        'bool_thresh': bool_thresh,
        'cat_min_freq': cat_min_freq
    }
    
    print("=" * 70)
    print("üå≤ RANDOM FOREST PREPROCESSING")
    print("=" * 70)
    print(f"üìã Input shape: {df.shape}")
    print(f"‚öôÔ∏è  RF Hyperparameters: {rf_hyperparameters}")
    print(f"üéØ Using {'RF formulas' if use_rf_formulas else 'manual thresholds'}")
    
    if not use_rf_formulas:
        print(f"   - bool_thresh: {bool_thresh}")
        print(f"   - cat_min_freq: {cat_min_freq}")
    print()
    
    # Collect the surviving columns first and build the frame once, instead of
    # growing a DataFrame one column at a time.
    kept_columns = {}
    dropped_cols = []
    
    for col_name in df.columns:
        print(f"Processing: {col_name} (dtype: {df[col_name].dtype})")
        
        result = fix_imbalance(
            df[col_name],
            df.shape,
            rf_hyperparameters,
            basic_thresholds,
            use_thresholds=not use_rf_formulas,
            min_categories=min_categories  # previously accepted but never forwarded
        )
        
        if result is not None:
            kept_columns[col_name] = result
        else:
            dropped_cols.append(col_name)
        
        print()
    
    preprocessed_df = pd.DataFrame(kept_columns)
    
    print("=" * 70)
    print("‚úÖ Preprocessing complete!")
    print(f"üìã Output shape: {preprocessed_df.shape}")
    print(f"üóëÔ∏è  Dropped {len(dropped_cols)} columns: {dropped_cols}")
    print("=" * 70)
    
    return preprocessed_df

if __name__ == "__main__":
    # Example with RF formulas (recommended)
    rf_params = dict(
        n_estimators=1000,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
    )

    # Using RF formulas: thresholds are derived from rf_params
    df_processed = preprocess_rf(
        df,
        rf_hyperparameters=rf_params,
        use_rf_formulas=True,
    )
    #plot_all_data(df_processed)
    # Or using manual thresholds
    # df_processed = preprocess_rf(df, use_rf_formulas=False, bool_thresh=0.95, cat_min_freq=0.02)

üå≤ RANDOM FOREST PREPROCESSING
üìã Input shape: (260601, 39)
‚öôÔ∏è  RF Hyperparameters: {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
üéØ Using RF formulas

Processing: building_id (dtype: int64)

Processing: geo_level_1_id (dtype: int64)

Processing: geo_level_2_id (dtype: int64)

Processing: geo_level_3_id (dtype: int64)

Processing: count_floors_pre_eq (dtype: category)

Processing: age (dtype: int64)

Processing: area_percentage (dtype: int64)

Processing: height_percentage (dtype: int64)

Processing: land_surface_condition (dtype: category)

Processing: foundation_type (dtype: category)

Processing: roof_type (dtype: category)

Processing: ground_floor_type (dtype: category)

Processing: other_floor_type (dtype: category)

Processing: position (dtype: category)

Processing: plan_configuration (dtype: category)

Processing: has_superstructure_adobe_mud (dtype: bool)

Processing: has_superstructure_mud_mortar_stone (dtype: bool)



  elif pd.api.types.is_categorical_dtype(col) or pd.api.types.is_object_dtype(col):
  elif pd.api.types.is_categorical_dtype(col) or pd.api.types.is_object_dtype(col):
  elif pd.api.types.is_categorical_dtype(col) or pd.api.types.is_object_dtype(col):
  elif pd.api.types.is_categorical_dtype(col) or pd.api.types.is_object_dtype(col):
  elif pd.api.types.is_categorical_dtype(col) or pd.api.types.is_object_dtype(col):
  elif pd.api.types.is_categorical_dtype(col) or pd.api.types.is_object_dtype(col):
  elif pd.api.types.is_categorical_dtype(col) or pd.api.types.is_object_dtype(col):
  elif pd.api.types.is_categorical_dtype(col) or pd.api.types.is_object_dtype(col):
  elif pd.api.types.is_categorical_dtype(col) or pd.api.types.is_object_dtype(col):
  elif pd.api.types.is_categorical_dtype(col) or pd.api.types.is_object_dtype(col):


In [None]:

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import classification_report, accuracy_score

# ==========================================
# 1. Data Preparation
# ==========================================
# Work on a copy so the encoded columns do not overwrite df_processed.
X = df_processed.copy()
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# NOTE(review): the encoder is fitted on the full frame before the train/test split.
X[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])
# NOTE(review): assumes the rows of train_labels.csv are in the same order as
# train_values.csv - TODO confirm (no join on an id column is performed here).
y = pd.read_csv("train_labels.csv")["damage_grade"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ==========================================
# 2. Model Initialization
# ==========================================
# n_jobs=-1 uses all available CPU cores
rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    max_depth=30,
    min_samples_leaf=2,
    class_weight=None,
    n_estimators=400,
    )

# ==========================================
# 3. Hyperparameter Grid Definition
# ==========================================
# Every list holds a single value, so the search evaluates exactly one candidate.
param_grid = {
    'n_estimators': [400],              # Number of trees
    'max_depth': [30],        # Max depth of trees
    'max_features': [ 0.5],        # Features to consider at split
    'min_samples_leaf': [2],           # Min samples at leaf node
    'class_weight': [None],      # Handling imbalance
}

# ==========================================
# 4. Grid Search Execution
# ==========================================
gs = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=2,                 # 2-fold cross-validation
    scoring='f1_micro',
    n_jobs=-1,            # Parallel processing
)

print("Starting Hyperparameter Tuning...")
gs.fit(X_train, y_train)

# ==========================================
# 5. Results & Evaluation
# ==========================================
print(f"Best Parameters: {gs.best_params_}")
print(f"Best CV Score: {gs.best_score_:.4f}")

# Predict using the best model found
best_model = gs.best_estimator_
y_pred = best_model.predict(X_test)

print("\n[Test Set Evaluation]")
print(classification_report(y_test, y_pred))

#local best CV Score : 0.7264

Starting Hyperparameter Tuning...
Best Parameters: {'class_weight': None, 'max_depth': 30, 'max_features': 0.5, 'min_samples_leaf': 2, 'n_estimators': 400}
Best CV Score: 0.7264

[Test Set Evaluation]
              precision    recall  f1-score   support

           1       0.68      0.49      0.57      5025
           2       0.74      0.85      0.79     29652
           3       0.76      0.63      0.69     17444

    accuracy                           0.74     52121
   macro avg       0.73      0.66      0.68     52121
weighted avg       0.74      0.74      0.73     52121



In [None]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# ==========================================
# 1. Data Preparation
# ==========================================
X = df_processed.copy() 
y_raw = pd.read_csv("train_labels.csv")["damage_grade"]

# Encode the labels as consecutive integers; `le` maps predictions back afterwards.
le = LabelEncoder()
y = le.fit_transform(y_raw)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ==========================================
# 2. Model Initialization
# ==========================================

model = CatBoostClassifier(
    iterations=10000,          
    learning_rate=0.05,        
    depth=8,                     
    devices='0',
    one_hot_max_size=10,    
    task_type='GPU',           
    early_stopping_rounds=100, 
    verbose=500,
    cat_features=categorical_columns,                
)
# cat_features is already set on the estimator, so it is not repeated in fit().
# NOTE(review): the test split doubles as the early-stopping eval_set, so the score
# reported below is measured on data that also selected the best iteration.
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    use_best_model=True
)

# predict() returned a column-shaped array here (see the column_or_1d warning in the
# recorded output); flatten it so the metric and inverse_transform calls get 1-D labels.
y_pred = model.predict(X_test).ravel()

acc = accuracy_score(y_test, y_pred)
print(f"Final Model Acc: {acc:.5f}")

y_test_org = le.inverse_transform(y_test)
y_pred_org = le.inverse_transform(y_pred)

print("\n[Test Set Evaluation]")
print(classification_report(y_test_org, y_pred_org))

# local CV Score : 0.73982

0:	learn: 1.0690782	test: 1.0687655	best: 1.0687655 (0)	total: 7.73ms	remaining: 1m 17s
500:	learn: 0.6263108	test: 0.6356822	best: 0.6356822 (500)	total: 3.79s	remaining: 1m 11s
1000:	learn: 0.5916342	test: 0.6152120	best: 0.6152120 (1000)	total: 7.55s	remaining: 1m 7s
1500:	learn: 0.5677646	test: 0.6050798	best: 0.6050798 (1500)	total: 11.3s	remaining: 1m 4s
2000:	learn: 0.5483143	test: 0.5995253	best: 0.5995253 (2000)	total: 15.1s	remaining: 1m
2500:	learn: 0.5310903	test: 0.5954759	best: 0.5954759 (2500)	total: 18.9s	remaining: 56.7s
3000:	learn: 0.5154796	test: 0.5928092	best: 0.5928092 (3000)	total: 22.8s	remaining: 53.1s
3500:	learn: 0.5010075	test: 0.5906350	best: 0.5906350 (3500)	total: 26.6s	remaining: 49.3s
4000:	learn: 0.4871891	test: 0.5889651	best: 0.5889463 (3995)	total: 30.5s	remaining: 45.7s
4500:	learn: 0.4744633	test: 0.5880240	best: 0.5880178 (4499)	total: 34.4s	remaining: 42s
5000:	learn: 0.4625485	test: 0.5873229	best: 0.5873229 (5000)	total: 38.3s	remaining: 38.2

  y = column_or_1d(y, warn=True)


In [None]:
import pandas as pd
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

# ==========================================
# 1. Data Preparation
# ==========================================
X = df_processed.copy() 
y_raw = pd.read_csv("train_labels.csv")["damage_grade"]

# Encode the labels as consecutive integers for both estimators.
le = LabelEncoder()
y = le.fit_transform(y_raw)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ==========================================
# 2. Non-Categorical Pipeline
# ==========================================

# For RF Classifier: categorical columns are ordinal-encoded inside the pipeline,
# every other column is passed through unchanged.
rf_preprocessor = ColumnTransformer(
    transformers=[
        # Ordinal Encoding (categories unseen during fit are encoded as -1)
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_columns)
    ],
    remainder='passthrough' 
)
rf_pipe = Pipeline([
    ('preprocessor', rf_preprocessor),
    ('rf', RandomForestClassifier(
        n_estimators=400, 
        min_samples_leaf=2,
        max_depth=30,
        class_weight=None,
        max_features=0.5,
        random_state=42,
        n_jobs=4
    ))
])

# ==========================================
# 3. For CatBoost Classifier
# ==========================================
# NOTE(review): no eval_set is supplied when this estimator is fitted through the
# VotingClassifier, so early_stopping_rounds presumably has no effect - the recorded
# log shows all 4750 iterations running. TODO confirm.
cb_clf = CatBoostClassifier(
    iterations=4750,          
    learning_rate=0.05,        
    depth=8,                     
    devices='0',
    one_hot_max_size=10,    
    task_type='GPU',           
    early_stopping_rounds=100, 
    verbose=500,
    cat_features=categorical_columns,                
)

# ==========================================
# 4. VotingClassifier definition
# ==========================================
# Soft voting averages the class probabilities of both estimators using `weights`
# (RF pipeline first, CatBoost second).
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_pipe),  # name, RF pipeline (encoder + forest)
        ('cb', cb_clf)    # name, CatBoost estimator (no pipeline)
    ],
    voting='soft',        # hard : majority voting, soft : probability based voting
    weights=[44, 56],
    n_jobs=1              
)
'''
params = {
    'weights': [
        [1, 2],   
    ]
}

grid_vote = GridSearchCV(
    estimator=voting_clf,
    param_grid=params,
    cv=2,
    scoring='f1_micro',
    n_jobs=1,  
    verbose=2
)

print("Ensemble Model Training with VotingClassifier...")
grid_vote.fit(X_train, y_train)
best_model = grid_vote.best_estimator_


print(f"Best Weights: {grid_vote.best_params_}")
print(f"Best Ensemble Score: {grid_vote.best_score_:.4f}")

'''
# Fit both estimators of the ensemble on the training split.
voting_clf.fit(X_train, y_train)


# ==========================================
# 5. Results
# ==========================================

# Score the fitted ensemble on the held-out split; labels stay in their
# LabelEncoder-encoded form on both sides of the comparison.
final_model = voting_clf
y_pred = final_model.predict(X_test)
final_score = f1_score(y_test, y_pred, average='micro')
print(f"Final Ensemble Score (Micro F1): {final_score:.5f}")

#local CV Score : 0.74388

0:	learn: 1.0690780	total: 8.59ms	remaining: 40.8s
500:	learn: 0.6263109	total: 4.39s	remaining: 37.3s
1000:	learn: 0.5916342	total: 8.56s	remaining: 32.1s
1500:	learn: 0.5677646	total: 12.7s	remaining: 27.6s
2000:	learn: 0.5483144	total: 16.9s	remaining: 23.2s
2500:	learn: 0.5310904	total: 21.1s	remaining: 19s
3000:	learn: 0.5154796	total: 25.3s	remaining: 14.8s
3500:	learn: 0.5010076	total: 29.6s	remaining: 10.5s
4000:	learn: 0.4871891	total: 33.8s	remaining: 6.33s
4500:	learn: 0.4744632	total: 38.1s	remaining: 2.1s
4749:	learn: 0.4684491	total: 40.2s	remaining: 0us
Final Ensemble Score (Micro F1): 0.74388
