In [1]:
version = 110

In [2]:
import pandas as pd
import numpy as np

## Loading Data

In [3]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v19.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v19.csv")
submission_set = pd.read_csv("./data/final_submission_set.csv")

## Predictive Model Learning

In [17]:
cat_names = ['callsign',
            'adep', 
            'ades', 
            'aircraft_type', 
            'wtc', 
            'airline',
            'offblock_hour',
            'offblock_minute', 
            'offblock_day_of_week',
            'offblock_weekday_name',
            'offblock_month',
            'offblock_week_of_year', 
            'offblock_season', 
            'arrival_hour',
            'arrival_minute',
            'arrival_season',
            'arrival_weekday_name',
            'is_offblock_weekend',
            'is_offblock_rush_hour',
            'flight_duration_category',                       
            'adep_region', 
            'ades_region', 
            'same_country_flight',
            'same_region_flight',                        
            'flight_direction',
            'is_intercontinental',
            'Manufacturer',
            'Model_FAA',
            'Physical_Class_Engine',
            'FAA_Weight',
            'adep_geo_cluster',
            'ades_geo_cluster',
            'od_pair']

In [8]:
dataset = pd.concat([challenge_set_updated, submission_set_updated], axis=0)
dataset.drop('stall_vel_knots', axis=1, inplace=True)
dataset['od_pair'] = dataset['adep'] + '-' + dataset['ades']
dataset.head()
print(dataset['od_pair'].nunique())

4586


In [9]:
pd.set_option('display.max_rows', None)

print(dataset.isnull().mean().sort_values(ascending=False))

pd.reset_option('display.max_rows')

vlof_tas                            0.406441
tas_1                               0.406441
specific_energy_1                   0.406441
vlof_groundspeed                    0.406441
sqrd_tas_1                          0.406441
sqrd_vlof_tas                       0.406441
sqrd_tas_2                          0.372550
tas_2                               0.372550
specific_energy_2                   0.372550
sqrd_tas_3                          0.359277
specific_energy_3                   0.359277
tas_3                               0.359277
sqrd_tas_10                         0.356763
specific_energy_10                  0.356763
tas_10                              0.356763
sqrd_tas_9                          0.353705
specific_energy_9                   0.353705
tas_9                               0.353705
sqrd_tas_4                          0.353320
tas_4                               0.353320
specific_energy_4                   0.353320
specific_energy_8                   0.351376
tas_8     

In [10]:
# Dropping columns with too many NaNs
threshold = 0.4
df = dataset.dropna(thresh=int((1 - threshold) * len(dataset)), axis=1)

In [8]:
eliminated_features = ['groundspeed_airspeed_ratio_ENR', 'temperature_9', 'wind_distance_flown_distance_ENR', 'average_humidity_DEP_40', 'vertical_rate_bins_ARR', 
        'groundspeed_flown_distance_ARR', 'arrival_quarter', 'offblock_year', 'arrival_year', 'offblock_to_arrival_day_diff', 'altitude_9', 'tas_1', 
        'is_arrival_weekend', 'adep_height_6', 'sqrd_vlof_tas', 'average_airspeed_ARR_100', 'adep_height_7', 'wind_distance_ARR_100', 'altitude_4', 
        'adep_height_1', 'groundspeed_airspeed_ratio_ARR', 'tas_8', 'specific_energy_4', 'temperature_bins_DEP', 'temperature_6', 'humidity_bins_DEP', 
        'altitude_5', 'adep_height_5', 'sqrd_tas_8', 'sqrd_tas_7', 'specific_energy_7', 'specific_energy_1', 'adep_height_4', 'sqrd_tas_6', 'tas_2', 
        'sqrd_tas_5', 'specific_energy_3', 'altitude_8', 'specific_energy_6', 'adep_height_8', 'vertical_rate_airspeed_ARR', 'altitude_2', 'sqrd_tas_1', 
        'sqrd_tas_3', 'specific_energy_8', 'sqrd_tas_9', 'temperature_8', 'groundspeed_airspeed_ratio_DEP', 'sqrd_tas_4', 'altitude_6', 
        'specific_energy_5', 'humidity_temperature_DEP', 'adep_height_2', 'altitude_7', 'adep_height_3', 'temperature_1', 'specific_energy_2', 
        'temperature_5', 'wind_distance_flown_distance_ARR', 'arrival_month', 'temperature_4', 'groundspeed_ARR_100', 'tas_4', 'arrival_minute', 
        'adep_height_9', 'altitude_groundspeed_ARR', 'altitude_3', 'temperature_7', 'airspeed_specific_energy_ENR', 'altitude_10', 'sqrd_tas_10', 
        'humidity_bins_ARR', 'specific_energy_9', 'sqrd_tas_2', 'temperature_2', 'tas_10', 'average_humidity_ENR', 'offblock_quarter', 
        'airspeed_specific_energy_DEP', 'wind_distance_flown_distance_DEP', 'tas_6', 'flown_distance_ARR_100', 'vertical_rate_airspeed_ratio_ARR', 
        'average_humidity_ARR_100', 'specific_energy_10', 'first_adep_height', 'tas_3', 'temperature_3', 'track_variation_ARR_100', 
        'is_offblock_rush_hour', 'average_temperature_ENR', 'is_arrival_rush_hour', 'average_altitude_ARR_100', 'specific_energy_ENR', 
        'groundspeed_ENR', 'is_offblock_weekend', 'Num_Engines', 'temperature_bins_ARR', 'average_temperature_ARR_100', 'kpi17_time', 
        'average_airspeed_DEP_40', 'wind_distance_ENR', 'offblock_minute', 'groundspeed_10NM', 'average_vertical_rate_ARR_100', 'vlof_tas', 
        'humidity_temperature_ARR']

In [11]:
# Imputation of NaNs
columns_with_nan = dataset.isna().any()
for col in dataset.columns[columns_with_nan]:
    dataset.loc[:, col] = dataset.fillna(dataset[col].median())

In [21]:
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

X_imputed = dataset.drop('tow', axis=1)
X_imputed = X_imputed.drop(eliminated_features, axis=1)
selected_cat_names = [x for x in cat_names if x in X_imputed.columns]
X_imputed = X_imputed.drop(selected_cat_names, axis=1)

regressor = LinearRegression()

imputer = IterativeImputer(estimator=regressor, missing_values=np.nan, max_iter=5, verbose=2, imputation_order='roman', random_state=42)
# imputer = KNNImputer()
imputer.fit(X_imputed)

X_imputed = imputer.transform(X_imputed)

[IterativeImputer] Completing matrix with shape (527162, 83)
[IterativeImputer] Ending imputation round 1/5, elapsed time 226.46
[IterativeImputer] Change: 567069505.1329292, scaled tolerance: 17221679.503513314 
[IterativeImputer] Ending imputation round 2/5, elapsed time 455.61
[IterativeImputer] Change: 134237606.06333306, scaled tolerance: 17221679.503513314 
[IterativeImputer] Ending imputation round 3/5, elapsed time 676.09
[IterativeImputer] Change: 29579437.94665668, scaled tolerance: 17221679.503513314 
[IterativeImputer] Ending imputation round 4/5, elapsed time 902.09
[IterativeImputer] Change: 10042756.898145434, scaled tolerance: 17221679.503513314 
[IterativeImputer] Early stopping criterion reached.
[IterativeImputer] Completing matrix with shape (527162, 83)
[IterativeImputer] Ending imputation round 1/4, elapsed time 7.80
[IterativeImputer] Ending imputation round 2/4, elapsed time 15.55
[IterativeImputer] Ending imputation round 3/4, elapsed time 23.29
[IterativeImput

In [13]:
pd.set_option('display.max_rows', None)

print(dataset.isnull().mean().sort_values(ascending=False))

pd.reset_option('display.max_rows')

vlof_tas                            0.406441
sqrd_tas_1                          0.406441
tas_1                               0.406441
specific_energy_1                   0.406441
sqrd_vlof_tas                       0.406441
vlof_groundspeed                    0.406441
tas_2                               0.372550
specific_energy_2                   0.372550
sqrd_tas_2                          0.372550
sqrd_tas_3                          0.359277
specific_energy_3                   0.359277
tas_3                               0.359277
sqrd_tas_10                         0.356763
specific_energy_10                  0.356763
tas_10                              0.356763
sqrd_tas_9                          0.353705
tas_9                               0.353705
specific_energy_9                   0.353705
tas_4                               0.353320
specific_energy_4                   0.353320
sqrd_tas_4                          0.353320
tas_8                               0.351376
sqrd_tas_8

In [24]:
pd.set_option('display.max_rows', None)

print(X_imputed)

pd.reset_option('display.max_rows')

[[ 1.80000000e+01  3.21000000e+02  6.92967951e-01 ...  1.29000000e+02
   2.77346455e+02  2.32363394e-01]
 [ 1.30000000e+01  4.19300000e+03  6.27793601e+00 ... -1.00000000e+00
   2.85530704e+02 -1.32540511e-04]
 [ 1.50000000e+01  3.77000000e+03  6.24592398e+00 ...  1.66000000e+02
   3.05643136e+02  2.42136938e-02]
 ...
 [ 1.40000000e+01  3.42600000e+03  1.96558129e+00 ...  0.00000000e+00
   2.96505920e+02  0.00000000e+00]
 [ 1.50000000e+01  3.35000000e+02  8.38468055e-01 ...  1.45000000e+02
   4.36724373e+01  2.55441541e-01]
 [ 1.00000000e+01  1.01400000e+03 -2.24273864e+00 ...  3.21000000e+02
   2.79850550e+02  1.79498509e-01]]


In [12]:
df = dataset.iloc[0:challenge_set_updated.shape[0], :]

In [13]:
X = df.drop('tow', axis=1)
y = df.tow

In [17]:
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Create correlation matrix
# corr_matrix = X.corr(numeric_only=True).abs()

# # Select upper triangle of correlation matrix
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# # Find features with correlation greater than 0.98
# to_drop = [column for column in upper.columns if any(upper[column] > 0.98)]
# print(to_drop)

# # Plotting the correlation matrix
# plt.figure(figsize=(12, 8))
# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
# plt.title('Correlation Matrix of Challenge Set')
# plt.show()

# # # Drop features 
# # X.drop(to_drop, axis=1, inplace=True)

In [14]:
eliminated_features = ['groundspeed_airspeed_ratio_ENR', 'temperature_9', 'wind_distance_flown_distance_ENR', 'average_humidity_DEP_40', 'vertical_rate_bins_ARR', 
        'groundspeed_flown_distance_ARR', 'arrival_quarter', 'offblock_year', 'arrival_year', 'offblock_to_arrival_day_diff', 'altitude_9', 'tas_1', 
        'is_arrival_weekend', 'adep_height_6', 'sqrd_vlof_tas', 'average_airspeed_ARR_100', 'adep_height_7', 'wind_distance_ARR_100', 'altitude_4', 
        'adep_height_1', 'groundspeed_airspeed_ratio_ARR', 'tas_8', 'specific_energy_4', 'temperature_bins_DEP', 'temperature_6', 'humidity_bins_DEP', 
        'altitude_5', 'adep_height_5', 'sqrd_tas_8', 'sqrd_tas_7', 'specific_energy_7', 'specific_energy_1', 'adep_height_4', 'sqrd_tas_6', 'tas_2', 
        'sqrd_tas_5', 'specific_energy_3', 'altitude_8', 'specific_energy_6', 'adep_height_8', 'vertical_rate_airspeed_ARR', 'altitude_2', 'sqrd_tas_1', 
        'sqrd_tas_3', 'specific_energy_8', 'sqrd_tas_9', 'temperature_8', 'groundspeed_airspeed_ratio_DEP', 'sqrd_tas_4', 'altitude_6', 
        'specific_energy_5', 'humidity_temperature_DEP', 'adep_height_2', 'altitude_7', 'adep_height_3', 'temperature_1', 'specific_energy_2', 
        'temperature_5', 'wind_distance_flown_distance_ARR', 'arrival_month', 'temperature_4', 'groundspeed_ARR_100', 'tas_4', 'arrival_minute', 
        'adep_height_9', 'altitude_groundspeed_ARR', 'altitude_3', 'temperature_7', 'airspeed_specific_energy_ENR', 'altitude_10', 'sqrd_tas_10', 
        'humidity_bins_ARR', 'specific_energy_9', 'sqrd_tas_2', 'temperature_2', 'tas_10', 'average_humidity_ENR', 'offblock_quarter', 
        'airspeed_specific_energy_DEP', 'wind_distance_flown_distance_DEP', 'tas_6', 'flown_distance_ARR_100', 'vertical_rate_airspeed_ratio_ARR', 
        'average_humidity_ARR_100', 'specific_energy_10', 'first_adep_height', 'tas_3', 'temperature_3', 'track_variation_ARR_100', 
        'is_offblock_rush_hour', 'average_temperature_ENR', 'is_arrival_rush_hour', 'average_altitude_ARR_100', 'specific_energy_ENR', 
        'groundspeed_ENR', 'is_offblock_weekend', 'Num_Engines', 'temperature_bins_ARR', 'average_temperature_ARR_100', 'kpi17_time', 
        'average_airspeed_DEP_40', 'wind_distance_ENR', 'offblock_minute', 'groundspeed_10NM', 'average_vertical_rate_ARR_100', 'vlof_tas', 
        'humidity_temperature_ARR']

In [15]:
X.drop(eliminated_features, axis=1, inplace=True)

In [18]:
from catboost import CatBoostRegressor, Pool, metrics
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

params = {
    'learning_rate': 0.01, 
    'reg_lambda': 0.05357182104973179, 
    'random_strength': 20.10951792232919, 
    'depth': 9, 
    'min_data_in_leaf': 11, 
    'leaf_estimation_iterations': 12
}

selected_cat_names = [x for x in cat_names if x in X.columns]

train_pool = Pool(X_train, y_train, cat_features=selected_cat_names)
val_pool = Pool(X_val, y_val, cat_features=selected_cat_names)

model = CatBoostRegressor(
    iterations=100000,
    objective=metrics.RMSE(),
    eval_metric=metrics.RMSE(),
    random_seed=42,
    verbose=False,
    task_type='GPU',
    use_best_model=True,
    od_type='Iter',
    od_wait=50,
    **params,
)

In [19]:
model.fit(
    train_pool, eval_set=val_pool,
    verbose=100,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 52819.9356156	test: 52395.9552310	best: 52395.9552310 (0)	total: 183ms	remaining: 5h 4m 16s
100:	learn: 20391.7857673	test: 20238.2572989	best: 20238.2572989 (100)	total: 14.5s	remaining: 3h 59m 38s
200:	learn: 8859.4788780	test: 8807.4805689	best: 8807.4805689 (200)	total: 29.1s	remaining: 4h 43s
300:	learn: 5231.1910360	test: 5219.2913225	best: 5219.2913225 (300)	total: 44.5s	remaining: 4h 5m 39s
400:	learn: 4232.9382960	test: 4238.8712567	best: 4238.8712567 (400)	total: 59.6s	remaining: 4h 6m 40s
500:	learn: 3892.8845191	test: 3908.6677558	best: 3908.6677558 (500)	total: 1m 14s	remaining: 4h 7m 41s
600:	learn: 3705.1837379	test: 3729.1644679	best: 3729.1644679 (600)	total: 1m 30s	remaining: 4h 8m 39s
700:	learn: 3569.8409732	test: 3600.5029664	best: 3600.5029664 (700)	total: 1m 45s	remaining: 4h 8m 29s
800:	learn: 3469.6616316	test: 3506.8495596	best: 3506.8495596 (800)	total: 2m	remaining: 4h 7m 44s
900:	learn: 3379.5654306	test: 3423.1692545	best: 3423.1692545 (900)	tota

<catboost.core.CatBoostRegressor at 0x17b1d905be0>

In [None]:
model.save_model('catboost_train_v%d.cbm' % version, 'cbm')

In [None]:
y_pred = model.predict(X_val)
pd.DataFrame(data={'tow': y_pred}).to_csv('catboost_val_v%d.csv' % version, index=False)

In [None]:
def classify_aircraft(row):
    if row['Physical_Class_Engine'] == 'Turboprop' and row['wtc'] == 'M':
        return 'Medium Turbo Prop'
    elif row['Physical_Class_Engine'] == 'Jet' and row['wtc'] == 'M':
        return 'Medium Jet'
    elif row['Physical_Class_Engine'] == 'Jet' and row['wtc'] == 'H':
        return 'Heavy Jet'
    else:
        return None  # If no classification applies, return None

In [None]:
from sklearn.metrics import root_mean_squared_error

X_val_groups = X_val.copy()

X_val_groups['Aircraft_Class'] = X_val.apply(classify_aircraft, axis=1)

X_val_groups = X_val_groups.reset_index()

indices_m_prop = X_val_groups[X_val_groups['Aircraft_Class'] == 'Medium Turbo Prop'].index
indices_m_jet = X_val_groups[X_val_groups['Aircraft_Class'] == 'Medium Jet'].index
indices_h_jet = X_val_groups[X_val_groups['Aircraft_Class'] == 'Heavy Jet'].index

print('RMSE Medium Turbo Prop:', root_mean_squared_error(y_val.iloc[indices_m_prop], y_pred[indices_m_prop]))
print('RMSE Medium Jet:', root_mean_squared_error(y_val.iloc[indices_m_jet], y_pred[indices_m_jet]))
print('RMSE Heavy Jet:', root_mean_squared_error(y_val.iloc[indices_h_jet], y_pred[indices_h_jet]))

In [None]:
best_iteration = model.get_best_iteration()
best_iteration = 68973
model = CatBoostRegressor(
    iterations=best_iteration,
    objective=metrics.RMSE(),
    random_seed=42,
    verbose=False,
    task_type='GPU',
    **params,
)
model.fit(
    X, y,
    cat_features=selected_cat_names, 
    plot=True,
    verbose=100
)

In [None]:
model.save_model('catboost_all_v%d.cbm' % version, 'cbm')

In [None]:
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

In [None]:
import matplotlib

feature_im_df = pd.DataFrame({
    "feature": X.columns,
    "importance": feature_importances
})

feature_im_df = feature_im_df.sort_values(by="importance", ascending=False)

sns.set(rc={'text.usetex' : True, 'text.latex.preamble': '\\usepackage{libertine}'})
plt.figure(figsize=(10, 6))
plt.figure()
sns.barplot(data = feature_im_df[:20], x='importance', y='feature', palette="viridis")

plt.title("CatBoost feature importance")
plt.xlabel("Importance")
plt.ylabel("feature")
plt.tight_layout()
plt.savefig('catboost_feature_importance.pdf', bbox_inches='tight')
plt.show()

In [None]:
model = CatBoostRegressor(**params).fit(train_pool)
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

## Generating Submission

In [20]:
df_test = dataset.iloc[challenge_set_updated.shape[0]:, :]
df_test.head()

Unnamed: 0,callsign,adep,ades,aircraft_type,wtc,airline,taxiout_time,flown_distance,track_variation_ARR_100,track_variation_DEP_40,...,Longitude_ades,Altitude_ades,actual_distance,altitude_difference,bearing,elevation_gradient,adep_geo_cluster,ades_geo_cluster,tow,od_pair
0,3b3de0f3ad0ee192513995c02f7bf7cf,LTFJ,LFLL,B738,M,6351ec1b849adacc0cbb3b1313d8d39b,15.0,1122,1.668989,1.079187,...,5.091,251,2022.915548,-61,293.477205,-0.030154,11,17,63852.0,LTFJ-LFLL
1,e06dd03d4a879ca37d9e18c1bd7cad16,EBBR,KJFK,A333,H,bdeeef3a675587d530de70a25d7118d2,15.0,3205,1.766098,1.147364,...,-73.779,4,5886.43037,-53,291.395141,-0.009004,6,1,63852.0,EBBR-KJFK
2,2d3b1c962c78c4ebeef11bcd51b9e94c,KMIA,EGLL,B77W,H,5543e4dc327359ffaf5b9c0e6faaf0e1,10.0,3965,6.253309,1.292737,...,-0.461,25,7108.920003,22,43.036806,0.003095,12,13,63852.0,KMIA-EGLL
3,35f7721f68bf85128195547ae38b0f04,EBBR,LEAL,B738,M,f53c55b5cf0cbb3be755bf50df6fa52d,9.0,802,1.775667,0.905718,...,-0.558,44,1458.405355,-13,197.753476,-0.008914,6,19,63852.0,EBBR-LEAL
4,eb56918bee9bc5204624186b9bcc4391,LSZH,LFPG,BCS3,M,2d5def0a5a844b343ba1b7cc9cb28fa9,11.0,292,1.200644,1.204058,...,2.55,120,476.291487,-312,293.398537,-0.655061,2,6,63852.0,LSZH-LFPG


In [21]:
X_test = df_test.drop('tow', axis=1)

In [22]:
X_test = X_test.loc[:, X_test.columns.isin(X.columns)]

In [23]:
y_pred = model.predict(X_test)
y_test = y_pred
print(y_test)

[ 70449.18692723 213750.6234686  220596.47258476 ... 195446.41940041
  42989.78501552  62987.66009545]


In [24]:
dft0 = pd.read_csv('./data/final_submission_set.csv')
dft0['tow'] = y_test
dft0[['flight_id', 'tow']].to_csv('catboost.csv', index=False)

In [25]:
print((df_test['aircraft_type'] == submission_set_updated['aircraft_type']).mean())

1.0


In [26]:
submission_dataset = df_test
submission_dataset['tow'] = y_pred
pd.set_option('display.max_rows', None)
pd.reset_option('display.max_rows')
print((submission_dataset['tow'] > submission_dataset['MTOW_kg']).sum())
mask = (submission_dataset['tow'] > submission_dataset['MTOW_kg'])
submission_dataset.loc[mask, 'tow'] = submission_dataset.loc[mask, 'MTOW_kg']
print((submission_dataset['tow'] > submission_dataset['MTOW_kg']).sum())

75
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_dataset['tow'] = y_pred


In [27]:
dft0 = pd.read_csv('./data/final_submission_set.csv')
dft0['tow'] = submission_dataset['tow']
dft0[['flight_id', 'tow']].to_csv('catboost_submission_v%d.csv' % version, index=False)