In [1]:
import pandas as pd
import numpy as np

## Pré-processamento dos dados

In [2]:
# Read the datasets produced after the exploratory data analysis. The "_v8"
# files are the ones in use (earlier runs pointed at the "_v7" files).
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v8.csv")
submission_set = pd.read_csv("./data/submission_set.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v8.csv")

# Working frame: all rows and all columns of the challenge set.
# To smoke-test the pipeline before a full training run, use a small sample:
#   df = challenge_set_updated.sample(frac=0.001)
df = challenge_set_updated.iloc[:, :]

# Target is the 'tow' column; every other column is a feature.
y = df['tow']
X = df.drop(columns='tow')

df.head()

Unnamed: 0,aircraft_type,wtc,airline,taxiout_time,flown_distance,track_variation_ARR_100,track_variation_DEP_100,track_variation_ENR,average_vertical_rate_ARR_100,average_vertical_rate_DEP_100,...,Altitude_adep,Latitude_ades,Longitude_ades,Altitude_ades,altitude_difference,bearing,elevation_gradient,adep_geo_cluster,ades_geo_cluster,tow
0,A320,M,a73f82288988b79be490c6322f4c32ed,18.0,321,124.339907,39.704139,2.102124,-1276.005427,1683.211169,...,25,51.843,-8.492,154,129,277.346455,0.232363,13,0,54748.0
1,B772,H,5543e4dc327359ffaf5b9c0e6faaf0e1,13.0,4193,179.31889,8.910306,76.029652,-858.158167,1682.557205,...,4,25.793,-80.291,3,-1,285.530704,-0.000133,19,12,185441.0
2,A333,H,8be5c854fd664bcb97fb543339f74770,15.0,3770,179.638848,32.391881,85.115541,-791.849866,840.240481,...,38,41.979,-87.904,204,166,305.643136,0.024214,8,15,230396.0
3,B788,H,5543e4dc327359ffaf5b9c0e6faaf0e1,11.0,3607,0.208347,36.998783,101.712676,-690.489814,1851.136442,...,432,39.872,-75.241,12,-420,296.005451,-0.065017,2,1,157615.0
4,A21N,M,a73f82288988b79be490c6322f4c32ed,14.0,305,,174.878799,,,2754.662045,...,74,51.477,-0.461,25,-49,116.445141,-0.10915,0,13,70318.447226


## Modelo (com pré-processamento automático)

In [3]:
# Columns declared to CatBoost as categorical (passed as `cat_features`).
# The entries keep their original order; the group comments are only a reading
# aid based on the column names.
cat_names = [
    # aircraft / operator columns
    'aircraft_type', 'wtc', 'airline',
    # offblock_* columns
    'offblock_hour', 'offblock_minute', 'offblock_day_of_week',
    'offblock_month', 'offblock_week_of_year', 'offblock_season',
    # arrival_* columns
    'arrival_hour', 'arrival_minute',
    # flag / bucket columns
    'is_offblock_weekend', 'is_offblock_rush_hour', 'flight_duration_category',
    # departure / destination relationship columns
    'adep_region', 'ades_region', 'same_country_flight', 'same_region_flight',
    'flight_direction', 'is_intercontinental',
    # aircraft reference columns
    'Manufacturer', 'Model_FAA', 'Physical_Class_Engine', 'FAA_Weight',
]

In [4]:
# Count the missing values in each column (sum of the null mask down the rows)
# and print the per-column totals.
null_value_stats = df.isnull().sum(axis='index')
print(null_value_stats)

aircraft_type         0
wtc                   0
airline               0
taxiout_time          0
flown_distance        0
                     ..
bearing               0
elevation_gradient    0
adep_geo_cluster      0
ades_geo_cluster      0
tow                   0
Length: 76, dtype: int64


In [5]:
# Rebuild the feature matrix and target from the working frame:
# 'tow' is the target, all remaining columns are features.
y = df['tow']
X = df.drop(columns=['tow'])

In [6]:
# Print the dtype of every feature column.
feature_dtypes = X.dtypes
print(feature_dtypes)

aircraft_type           object
wtc                     object
airline                 object
taxiout_time           float64
flown_distance           int64
                        ...   
altitude_difference      int64
bearing                float64
elevation_gradient     float64
adep_geo_cluster         int64
ades_geo_cluster         int64
Length: 75, dtype: object


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state pins the shuffle so the
# split is the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [8]:
from catboost import CatBoostRegressor, Pool, metrics, cv
from sklearn.metrics import r2_score, mean_squared_error

In [11]:
# First CatBoost regressor: RMSE objective, up to 5000 iterations at learning
# rate 0.3, trained on the GPU with logging silenced and use_best_model enabled.
gpu_model_settings = dict(
    objective='RMSE',
    learning_rate=0.3,
    iterations=5000,
    random_seed=42,
    logging_level='Silent',
    use_best_model=True,
    task_type='GPU',
)
model = CatBoostRegressor(**gpu_model_settings)

In [12]:
# Train on the training split. The validation split is supplied as eval_set,
# cat_names marks the categorical columns, and plot=True shows the metric curves.
model.fit(
    X=X_train,
    y=y_train,
    cat_features=cat_names,
    eval_set=(X_val, y_val),
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x7a20f814bf40>

In [11]:
# Baseline CatBoost settings: 500 RMSE iterations at learning rate 0.1,
# with use_best_model switched off (no task_type is set here).
params = dict(
    objective='RMSE',
    iterations=500,
    learning_rate=0.1,
    eval_metric=metrics.RMSE(),
    random_seed=42,
    logging_level='Silent',
    use_best_model=False,
)

# Wrap each split in a Pool so the categorical columns are declared with the data.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_names)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_names)

In [12]:
%%time
from sklearn.metrics import root_mean_squared_error

model = CatBoostRegressor(**params)
model.fit(train_pool, eval_set=val_pool)

best_model_params = params.copy()
best_model_params.update({
    'use_best_model': True
})
best_model = CatBoostRegressor(**best_model_params)
best_model.fit(train_pool, eval_set=val_pool)

print('Simple model validation MSE: {:.4}'.format(
    root_mean_squared_error(y_val, model.predict(X_val))
))
print('')

print('Best model validation MSE: {:.4}'.format(
    root_mean_squared_error(y_val, best_model.predict(X_val))
))

KeyboardInterrupt: 

## Training on GPU

In [13]:
# GPU variant of the settings: 500 RMSE iterations at learning rate 0.1,
# use_best_model off, with task_type set to 'GPU'.
params = dict(
    objective='RMSE',
    iterations=500,
    learning_rate=0.1,
    eval_metric=metrics.RMSE(),
    random_seed=42,
    logging_level='Silent',
    use_best_model=False,
    task_type='GPU',
)

# Pools for the training and validation splits, with categorical columns declared.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_names)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_names)

In [None]:
%%time
from sklearn.metrics import mean_squared_error

model = CatBoostRegressor(**params)
model.fit(train_pool, eval_set=val_pool)

best_model_params = params.copy()
best_model_params.update({
    'use_best_model': True
})
best_model = CatBoostRegressor(**best_model_params)
best_model.fit(train_pool, eval_set=val_pool)

print('Simple model validation MSE: {:.4}'.format(
    root_mean_squared_error(y_val, model.predict(X_val))
))
print('')

print('Best model validation MSE: {:.4}'.format(
    root_mean_squared_error(y_val, best_model.predict(X_val))
))

## Feature Importances

In [None]:
# Fit on the training pool alone (no eval_set), then print every feature with
# its importance score, highest score first.
model = CatBoostRegressor(**params)
model.fit(train_pool)

feature_names = X_train.columns
feature_importances = model.get_feature_importance(train_pool)

ranked = sorted(zip(feature_importances, feature_names), reverse=True)
for score, name in ranked:
    print('{}: {}'.format(name, score))

## Hyperparameter Tuning

In [None]:
!pip install optuna

In [None]:
import optuna

def objective(trial):
    """Score one Optuna trial by cross-validated RMSE.

    Draws a set of CatBoost hyperparameters from ``trial``, runs CatBoost's
    ``cv`` on the notebook-level ``train_pool`` with those settings, and
    returns the smallest value found in the ``'test-RMSE-mean'`` column of the
    cross-validation results.

    Args:
        trial: Optuna trial used to sample the hyperparameter values.

    Returns:
        The minimum of ``cv_data['test-RMSE-mean']`` for this trial.
    """
    # Search space taken from:
    # https://deepnote.com/app/svpino/Tuning-Hyperparameters-with-Optuna-ea1a123d-8d2f-4e20-8f22-95f07470d557
    # ('subsample' is intentionally not sampled.)
    search_space = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 100),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
    }

    # Settings held constant across trials; training runs on the GPU.
    fixed_settings = dict(
        iterations=1000,
        eval_metric=metrics.RMSE(),
        random_seed=42,
        verbose=False,
        objective=metrics.RMSE(),
        task_type='GPU',
        use_best_model=True,
        od_type='Iter',
        od_wait=20,
    )

    # The estimator is built only to obtain its parameter dict for cv().
    candidate = CatBoostRegressor(**fixed_settings, **search_space)
    fold_scores = cv(
        train_pool,
        candidate.get_params(),
        logging_level='Silent',
    )

    return np.min(fold_scores['test-RMSE-mean'])

In [None]:
# Run 50 Optuna trials, minimising the value returned by `objective`.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Report the hyperparameters of the best trial and keep them for the final model.
print(f"Best trial: {study.best_trial.params}")
best_params = study.best_trial.params

# Final model: fixed settings (5000 iterations, GPU, RMSE, overfitting detector
# od_type='Iter' / od_wait=20) combined with the tuned hyperparameters.
final_settings = dict(
    iterations=5000,
    eval_metric=metrics.RMSE(),
    random_seed=42,
    logging_level='Silent',
    objective=metrics.RMSE(),
    task_type='GPU',
    use_best_model=True,
    od_type='Iter',
    od_wait=20,
)
best_model = CatBoostRegressor(**final_settings, **best_params)

# Fit on the training pool with the validation pool as eval_set.
best_model.fit(train_pool, eval_set=val_pool)

In [None]:
# Cross-validate a 3000-iteration GPU model on the full dataset (X, y) using the
# tuned L2 regularisation strength and learning rate.
# `best_params` comes from the Optuna study, which stores the L2 strength under
# 'reg_lambda' (CatBoost's alias for l2_leaf_reg). The value is passed as a
# float: truncating it to int could turn a small coefficient into 0.
model = CatBoostRegressor(
    l2_leaf_reg=best_params['reg_lambda'],
    learning_rate=best_params['learning_rate'],
    iterations=3000,
    eval_metric=metrics.RMSE(),
    random_seed=42,
    verbose=False,
    objective=metrics.RMSE(),
    task_type='GPU',
    use_best_model=True,
)
cv_data = cv(Pool(X, y, cat_features=cat_names), model.get_params(), plot=True)

In [None]:
# Report the lowest mean test RMSE found in the cross-validation results.
best_cv_rmse = np.min(cv_data['test-RMSE-mean'])
print('RMSE score: {}'.format(best_cv_rmse))

In [None]:
# Fit the current model on the training pool, evaluating on the validation pool.
model.fit(X=train_pool, eval_set=val_pool)

In [None]:
import matplotlib.pyplot as plt

# Validation residuals: prediction minus true value.
preds_val = model.predict(X_val)
diff_val = preds_val - y_val

# Histogram of the residuals on a wide, short figure.
fig, ax = plt.subplots(figsize=(8, 2))
ax.hist(diff_val, bins=200)

# Displayed as the cell result: mean and standard deviation of the residuals.
diff_val.mean(), diff_val.std()

In [None]:
from sklearn.metrics import root_mean_squared_error

# The target is not standardised anywhere in this notebook, so y_val and
# preds_val are already on the same scale and are compared directly.
# A fixed '.2f' format avoids the scientific notation that a bare '.4'
# precision produces for values of 1000 and above.
print('Model validation RMSE: {:.2f}'.format(
    root_mean_squared_error(y_val, preds_val)
))

## Submissão

In [15]:
# Load the engineered submission set and preview its first five rows.
df_test = pd.read_csv("./data/submission_set_updated_v8.csv")
df_test.head(5)

Unnamed: 0,aircraft_type,wtc,airline,taxiout_time,flown_distance,track_variation_ARR_100,track_variation_DEP_100,track_variation_ENR,average_vertical_rate_ARR_100,average_vertical_rate_DEP_100,...,Altitude_adep,Latitude_ades,Longitude_ades,Altitude_ades,altitude_difference,bearing,elevation_gradient,adep_geo_cluster,ades_geo_cluster,tow
0,B738,M,6351ec1b849adacc0cbb3b1313d8d39b,15.0,1122,,110.069323,50.723775,,1806.031452,...,312,45.726,5.091,251,-61,293.477205,-0.030154,11,17,
1,A333,H,bdeeef3a675587d530de70a25d7118d2,15.0,3205,98.664855,64.755647,93.023665,-926.465188,1432.940325,...,57,40.64,-73.779,4,-53,291.395141,-0.009004,6,1,
2,B77W,H,5543e4dc327359ffaf5b9c0e6faaf0e1,10.0,3965,26.478102,73.771503,5.682069,-1089.944203,1914.88,...,3,51.477,-0.461,25,22,43.036806,0.003095,12,13,
3,B38M,M,3922524069809ac4326134429751e26f,10.0,986,102.257838,176.128681,48.209089,-988.140379,1576.297901,...,55,38.282,-0.558,44,-11,178.644825,-0.006508,13,19,
4,A320,M,a73f82288988b79be490c6322f4c32ed,15.0,686,5.65319,169.820834,50.924346,-1239.918723,1726.27027,...,74,45.726,5.091,251,177,131.790949,0.149765,0,17,


In [16]:
# Features for prediction: every column of the submission frame except 'tow'.
X_test = df_test.drop(columns='tow')

In [17]:
# Predict with the current `model`; y_test is a second name for the same array.
y_test = y_pred = model.predict(X_test)

In [18]:
# Display the predictions (y_test holds the output of model.predict(X_test)).
y_test

array([ 68946.2642462 , 213064.74788114, 220121.91304719, ...,
        75229.24184834,  61912.40769668,  65203.63880688])

In [20]:
# Attach the predictions to the raw submission rows and write only the
# flight id and predicted 'tow' columns to catboost.csv (no index column).
# NOTE(review): the assignment is positional, so it assumes the rows here are
# in the same order as the frame the predictions were made from -- confirm.
dft0 = pd.read_csv('./data/submission_set.csv').assign(tow=y_test)
dft0.loc[:, ['flight_id', 'tow']].to_csv('catboost.csv', index=False)

In [21]:
# Show the first lines of the catboost.csv submission file.
!head catboost.csv

flight_id,tow
248753821,68946.26424620472
248753822,213064.7478811413
248754498,220121.91304718656
248757623,63407.81450780528
248763603,64560.268099067034
248755068,54698.5422926647
248754229,55945.19529199411
248754894,59733.24085073604
248754751,56641.6751755905


In [None]:
# !mc cp ./mlp.csv dc24/submissions/team_tiny_rainbow_v2_7ec66710-1eb8-478e-8976-584c090b6373.csv