In [2]:
import pandas as pd
import numpy as np 


In [3]:
# Load the dataset to examine its structure
file_path = 'challenge_set.csv'
df = pd.read_csv(file_path)

# Show the first few rows of the dataset to understand its structure
df.head()

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,arrival_time,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow
0,248763780,2022-01-01,3840d84f25d3f5fcc0a1be3076bb4039,EGLL,London Heathrow,GB,EICK,Cork,IE,2022-01-01T13:46:00Z,2022-01-01T15:04:56Z,A320,M,a73f82288988b79be490c6322f4c32ed,61,18,321,54748.0
1,248760618,2022-01-01,f6f610e73002b8892a239a81321f7f1d,LEBL,Barcelona,ES,KMIA,Miami,US,2022-01-01T09:55:00Z,2022-01-01T19:37:56Z,B772,H,5543e4dc327359ffaf5b9c0e6faaf0e1,570,13,4193,185441.0
2,248753824,2022-01-01,139670936660762c230ca92556ba842b,ESSA,Stockholm Arlanda,SE,KORD,Chicago O'Hare,US,2022-01-01T09:39:00Z,2022-01-01T19:08:13Z,A333,H,8be5c854fd664bcb97fb543339f74770,554,15,3770,230396.0
3,248753852,2022-01-01,509dc61bb54fbab0e5406067c95603e2,LSZH,Zurich,CH,KPHL,Philadelphia,US,2022-01-01T11:04:00Z,2022-01-01T19:32:13Z,B788,H,5543e4dc327359ffaf5b9c0e6faaf0e1,497,11,3607,157615.0
4,248755934,2022-01-01,d0610d000dcf26b1d7bba8103ecc393d,EIDW,Dublin,IE,EGLL,London Heathrow,GB,2022-01-01T12:36:00Z,2022-01-01T13:44:32Z,A21N,M,a73f82288988b79be490c6322f4c32ed,55,14,305,70318.447226


In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [5]:
# Dropping unnecessary columns
df_cleaned = df.drop(columns=['flight_id', 'callsign', 'date', 'actual_offblock_time', 'arrival_time', 'name_adep', 'name_ades'])

# Label encoding categorical columns
categorical_cols = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'country_code_adep', 'country_code_ades']
encoder = LabelEncoder()

for col in categorical_cols:
    df_cleaned[col] = encoder.fit_transform(df_cleaned[col])

# Check for missing values and fill/remove them
df_cleaned = df_cleaned.fillna(df_cleaned.mean())  # Filling missing values with column mean

df_cleaned.head()

Unnamed: 0,adep,country_code_adep,ades,country_code_ades,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow
0,67,36,76,33,4,1,20,61,18,321,54748.0
1,210,32,161,76,18,0,10,570,13,4193,185441.0
2,133,86,164,76,7,0,18,554,15,3770,230396.0
3,328,19,165,76,21,0,10,497,11,3607,157615.0
4,89,43,55,26,1,1,20,55,14,305,70318.447226


In [6]:
# Split the dataset into features (X) and target variable (y)
X = df_cleaned.drop(columns=['tow'])  # Features
y = df_cleaned['tow']  # Target

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the cleaned and prepared dataset for features and target
X_train.head(), y_train.head()

(        adep  country_code_adep  ades  country_code_ades  aircraft_type  wtc  \
 21657    133                 86   112                 66              4    1   
 260497    57                 36   247                 30             11    1   
 112414    16                 11    13                 18              4    1   
 353828    22                 26   301                 13              0    1   
 20450     89                 43    20                 18              4    1   
 
         airline  flight_duration  taxiout_time  flown_distance  
 21657        18               55            15             291  
 260497        6              178            10            1375  
 112414       22               60            10             401  
 353828        3               62             6             361  
 20450        20              105            15             804  ,
 21657     54774.000000
 260497    66835.000000
 112414    61046.219978
 353828    64882.000000
 20450     64360.96

In [7]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [8]:
# LightGBM dataset creation
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# LightGBM parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'verbose': -1
}

# Train the model
lgbm_model = lgb.train(params, train_data, valid_sets=[train_data, test_data])

# Predict on test data
y_pred = lgbm_model.predict(X_test)

# Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')

# Feature importance
importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': lgbm_model.feature_importance()})
print(importance_df.sort_values(by='Importance', ascending=False))

RMSE: 4362.085203821082
             Feature  Importance
4      aircraft_type         852
9     flown_distance         501
0               adep         418
2               ades         353
6            airline         309
7    flight_duration         198
3  country_code_ades         111
1  country_code_adep         102
8       taxiout_time          90
5                wtc          66




In [10]:

import xgboost as xgb
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the data into DMatrix format required by XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set XGBoost parameters
xgb_params = {
    'objective': 'reg:squarederror',  # for regression tasks
    'eval_metric': 'rmse',
    'learning_rate': 0.1,
    'max_depth': 6,
  #  'n_estimators': 100
}

# Train the XGBoost model
xgb_model = xgb.train(xgb_params, dtrain, evals=[(dtest, 'test')], early_stopping_rounds=10, verbose_eval=False)

# Predict on test data
y_pred = xgb_model.predict(dtest)

# Evaluate the model using RMSE
rmse_xgb = mean_squared_error(y_test, y_pred, squared=False)

rmse_xgb



19406.553172535816