# CatBoost Feature Selection

In [1]:
import pandas as pd
import numpy as np

## Loading Data

In [2]:
# Read the three CSV inputs (the datasets saved after the exploratory data
# analysis, plus the final submission set) in a single pass.
challenge_set_updated, submission_set_updated, submission_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/challenge_set_updated_v20.csv",
        "./data/submission_set_updated_v20.csv",
        "./data/final_submission_set.csv",
    )
)

## Predictive Model Learning

In [3]:
# Candidate categorical feature names, grouped by naming pattern.
# Only the names that are actual columns of the feature frame are used later.
cat_names = [
    # flight / route / aircraft identifiers
    'callsign', 'adep', 'ades', 'aircraft_type', 'wtc', 'airline',
    # offblock_* columns
    'offblock_hour', 'offblock_minute', 'offblock_day_of_week',
    'offblock_weekday_name', 'offblock_month', 'offblock_week_of_year',
    'offblock_season',
    # arrival_* columns
    'arrival_hour', 'arrival_minute', 'arrival_season',
    'arrival_weekday_name',
    # flag / bucket columns
    'is_offblock_weekend', 'is_offblock_rush_hour',
    'flight_duration_category',
    # region and direction columns
    'adep_region', 'ades_region', 'same_country_flight',
    'same_region_flight', 'flight_direction', 'is_intercontinental',
    # aircraft reference columns
    'Manufacturer', 'Model_FAA', 'Physical_Class_Engine', 'FAA_Weight',
    # *_geo_cluster columns
    'adep_geo_cluster', 'ades_geo_cluster',
]

In [4]:
# Stack the challenge rows on top of the submission rows so both go through
# the same preprocessing. ignore_index=True gives the combined frame a fresh
# 0..n-1 index; without it each input keeps its own row labels, so labels are
# duplicated and label-aligned operations on `dataset` become ambiguous.
dataset = pd.concat(
    [challenge_set_updated, submission_set_updated],
    axis=0,
    ignore_index=True,
)

In [5]:
# Print the fraction of missing values per column, highest first.
# option_context lifts the row limit only for this print and restores the
# previous 'display.max_rows' value afterwards, even if the print raises.
with pd.option_context('display.max_rows', None):
    print(dataset.isnull().mean().sort_values(ascending=False))

specific_energy_1                   0.406441
vlof_tas                            0.406441
sqrd_tas_1                          0.406441
sqrd_vlof_tas                       0.406441
vlof_groundspeed                    0.406441
tas_1                               0.406441
tas_2                               0.372550
specific_energy_2                   0.372550
sqrd_tas_2                          0.372550
tas_3                               0.359277
sqrd_tas_3                          0.359277
specific_energy_3                   0.359277
specific_energy_10                  0.356763
sqrd_tas_10                         0.356763
tas_10                              0.356763
sqrd_tas_9                          0.353705
specific_energy_9                   0.353705
tas_9                               0.353705
tas_4                               0.353320
specific_energy_4                   0.353320
sqrd_tas_4                          0.353320
tas_8                               0.351376
specific_e

In [6]:
# Dropping columns with too many NaNs
# A column is kept only if it has at least (1 - threshold) * n_rows non-null
# values, i.e. columns whose missing ratio exceeds `threshold` are removed.
threshold = 0.4
min_non_null = int((1 - threshold) * len(dataset))
# Assign the result back to `dataset`: the following cells impute and slice
# `dataset`, so storing it under another name would silently discard the drop.
dataset = dataset.dropna(thresh=min_non_null, axis=1)

In [7]:
# Imputation of NaNs
# Compute one median per numeric column, then fill each column's NaNs with its
# own median in a single call. Passing a Series to fillna fills only the
# columns named in its index, so non-numeric columns are left untouched.
column_medians = dataset.median(numeric_only=True)
dataset = dataset.fillna(column_medians)

In [8]:
# The challenge rows were stacked first when `dataset` was built, so the
# leading block of that many rows is the challenge portion.
n_challenge_rows = len(challenge_set_updated)
df = dataset.iloc[:n_challenge_rows]

In [9]:
# Separate the target column 'tow' from the predictor columns.
y = df['tow']
X = df.drop(columns=['tow'])

In [10]:
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Create correlation matrix
# corr_matrix = X.corr(numeric_only=True).abs()

# # Select upper triangle of correlation matrix
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# # Find features with correlation greater than 0.98
# to_drop = [column for column in upper.columns if any(upper[column] > 0.98)]
# print(to_drop)

# # Plotting the correlation matrix
# plt.figure(figsize=(12, 8))
# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
# plt.title('Correlation Matrix of Challenge Set')
# plt.show()

# # # Drop features 
# # X.drop(to_drop, axis=1, inplace=True)

In [14]:
from catboost import CatBoostRegressor, Pool, metrics
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a validation split (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Restrict the candidate categorical names to columns that exist in X.
selected_cat_names = [name for name in cat_names if name in X.columns]

# Wrap both splits in CatBoost pools that share the categorical feature list.
train_pool = Pool(X_train, y_train, cat_features=selected_cat_names)
val_pool = Pool(X_val, y_val, cat_features=selected_cat_names)

# Hyper-parameters forwarded to the regressor below.
params = dict(
    learning_rate=0.05,
    reg_lambda=69.07051080443502,
    random_strength=16.348436502754343,
    depth=11,
    min_data_in_leaf=2,
    leaf_estimation_iterations=7,
)

# RMSE is both the training objective and the evaluation metric.
model = CatBoostRegressor(
    **params,
    iterations=10000,
    objective=metrics.RMSE(),
    eval_metric=metrics.RMSE(),
    random_seed=42,
    verbose=False,
    task_type='GPU',
    use_best_model=True,
    od_type='Iter',
    od_wait=20,
)

In [None]:
# Every training column is a candidate; ask the model to keep 80 of them,
# using 20 steps, evaluating on the validation pool and plotting progress.
candidate_features = list(X_train.columns)
result = model.select_features(
    train_pool,
    eval_set=val_pool,
    features_for_select=candidate_features,
    num_features_to_select=80,
    steps=20,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Step #1 out of 20


In [None]:
# Show the names of the features reported as eliminated by the selection run.
eliminated_features = result['eliminated_features_names']
print(eliminated_features)