In [15]:
#Importing all the libraries

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer
from sklearn.feature_selection import SelectKBest, f_regression

In [16]:
data = pd.read_csv('/Users/skirar/Downloads/Projects/train.csv')
print(data.shape)
print(data.isnull().sum())
print(data.info())
print(data.describe())

(45000, 19)
Flight_ID                0
Airline               3573
Departure_City         340
Arrival_City           186
Distance                91
Departure_Time           0
Arrival_Time             0
Duration                 0
Aircraft_Type           43
Number_of_Stops          0
Day_of_Week            225
Month_of_Travel        267
Holiday_Season           0
Demand                 317
Weather_Conditions     302
Passenger_Count          0
Promotion_Type         403
Fuel_Price              90
Flight_Price             0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Flight_ID           45000 non-null  object 
 1   Airline             41427 non-null  object 
 2   Departure_City      44660 non-null  object 
 3   Arrival_City        44814 non-null  object 
 4   Distance            44909 non-null  float64
 5   Depar

In [17]:
# Handling missing values
categorical_columns = ['Airline', 'Departure_City', 'Arrival_City', 'Demand', 'Aircraft_Type', 'Day_of_Week', 'Month_of_Travel','Holiday_Season','Weather_Conditions']
for column in categorical_columns:
    data[column].fillna('Missing', inplace=True)

numerical_columns = ['Distance', 'Fuel_Price']

for column in numerical_columns:
    data[column].fillna(data[column].median(), inplace=True)

data['Promotion_Type_Missing'] = data['Promotion_Type'].isnull().astype(int)

data['Fuel_Price'].fillna(data['Fuel_Price'].median(), inplace=True)

data.isnull().sum()

Flight_ID                   0
Airline                     0
Departure_City              0
Arrival_City                0
Distance                    0
Departure_Time              0
Arrival_Time                0
Duration                    0
Aircraft_Type               0
Number_of_Stops             0
Day_of_Week                 0
Month_of_Travel             0
Holiday_Season              0
Demand                      0
Weather_Conditions          0
Passenger_Count             0
Promotion_Type            403
Fuel_Price                  0
Flight_Price                0
Promotion_Type_Missing      0
dtype: int64

In [18]:
# Combine 'Departure_City' and 'Arrival_City' to create a set of unique cities
unique_cities = set(data['Departure_City'].unique()) | set(data['Arrival_City'].unique())

In [19]:
# Create a mapping dictionary where each unique city is assigned a unique integer label
city_mapping = {city: label for label, city in enumerate(unique_cities)}
data

Unnamed: 0,Flight_ID,Airline,Departure_City,Arrival_City,Distance,Departure_Time,Arrival_Time,Duration,Aircraft_Type,Number_of_Stops,Day_of_Week,Month_of_Travel,Holiday_Season,Demand,Weather_Conditions,Passenger_Count,Promotion_Type,Fuel_Price,Flight_Price,Promotion_Type_Missing
0,F1,Airline B,Missing,Greenshire,8286.0,8:23,20:19,11.94,Boeing 787,0,Wednesday,December,Summer,Low,Rain,240,Special Offer,0.91,643.93,0
1,F2,Airline C,Leonardland,New Stephen,2942.0,20:28,1:45,5.29,Airbus A320,0,Wednesday,March,Spring,Low,Rain,107,,1.08,423.13,0
2,F3,Airline B,South Dylanville,Port Ambermouth,2468.0,11:30,15:54,4.41,Boeing 787,1,Sunday,September,Summer,High,Cloudy,131,,0.52,442.17,1
3,F4,Missing,Blakefort,Crosbyberg,3145.0,20:24,1:21,4.96,Boeing 787,0,Sunday,February,Fall,Low,Cloudy,170,Discount,0.71,394.42,0
4,F5,Airline B,Michaelport,Onealborough,5558.0,21:59,6:04,8.09,Boeing 737,1,Thursday,January,,Missing,Clear,181,,1.09,804.35,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,F44996,Airline B,Port Donaldland,Maryshire,3884.0,21:49,3:31,5.70,Boeing 787,1,Sunday,May,Fall,Low,Clear,263,Discount,0.69,417.18,0
44996,F44997,Airline A,East Patriciafurt,East Vickiberg,9191.0,22:50,11:48,12.97,Airbus A320,1,Friday,December,Fall,Low,Snow,168,Discount,0.74,687.03,0
44997,F44998,Airline C,Monicaton,West Johnburgh,6757.0,2:33,11:21,8.81,Boeing 777,1,Wednesday,October,Fall,Low,Snow,244,Special Offer,0.83,547.39,0
44998,F44999,Airline B,Turnerburgh,Gabrielville,3587.0,13:25,19:37,6.21,Boeing 737,0,Saturday,June,,Medium,Cloudy,121,Special Offer,0.60,487.67,0


In [20]:
# Apply the mapping to both 'Departure_City' and 'Arrival_City'
data['Departure_City'] = data['Departure_City'].map(city_mapping)
data['Arrival_City'] = data['Arrival_City'].map(city_mapping)

data['Flight_ID'] = data['Flight_ID'].str.extract('(\d+)').astype(int)

data.head()

Unnamed: 0,Flight_ID,Airline,Departure_City,Arrival_City,Distance,Departure_Time,Arrival_Time,Duration,Aircraft_Type,Number_of_Stops,Day_of_Week,Month_of_Travel,Holiday_Season,Demand,Weather_Conditions,Passenger_Count,Promotion_Type,Fuel_Price,Flight_Price,Promotion_Type_Missing
0,1,Airline B,5515,25601,8286.0,8:23,20:19,11.94,Boeing 787,0,Wednesday,December,Summer,Low,Rain,240,Special Offer,0.91,643.93,0
1,2,Airline C,23810,22822,2942.0,20:28,1:45,5.29,Airbus A320,0,Wednesday,March,Spring,Low,Rain,107,,1.08,423.13,0
2,3,Airline B,11768,22646,2468.0,11:30,15:54,4.41,Boeing 787,1,Sunday,September,Summer,High,Cloudy,131,,0.52,442.17,1
3,4,Missing,32366,17998,3145.0,20:24,1:21,4.96,Boeing 787,0,Sunday,February,Fall,Low,Cloudy,170,Discount,0.71,394.42,0
4,5,Airline B,14736,33158,5558.0,21:59,6:04,8.09,Boeing 737,1,Thursday,January,,Missing,Clear,181,,1.09,804.35,0


In [21]:
# Extract hour of departure and arrival from 'Departure_Time' and 'Arrival_Time'
# Convert 'Departure_Time' and 'Arrival_Time' to datetime objects
data['Departure_Time'] = pd.to_datetime(data['Departure_Time'])
data['Arrival_Time'] = pd.to_datetime(data['Arrival_Time'])

In [22]:
# Extract hour and minute features from 'Departure_Time' and 'Arrival_Time'
data['Departure_Hour'] = data['Departure_Time'].dt.hour
data['Departure_Minute'] = data['Departure_Time'].dt.minute
data['Arrival_Hour'] = data['Arrival_Time'].dt.hour
data['Arrival_Minute'] = data['Arrival_Time'].dt.minute

In [23]:
# Drop the original 'Departure_Time' and 'Arrival_Time' columns
data.drop(['Departure_Time', 'Arrival_Time'], axis=1, inplace=True)

In [24]:
# Convert categorical variables to numerical using label encoding
label_encoders = {}
categorical_columns = ['Aircraft_Type', 'Day_of_Week', 'Month_of_Travel', 'Demand','Holiday_Season', 'Weather_Conditions', 'Promotion_Type']

In [25]:
data = pd.get_dummies(data, columns=['Airline'], drop_first=True)

for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

In [26]:
# Scaling/Normalizing numerical features
scaler = StandardScaler()
numeric_columns = ['Distance', 'Duration', 'Number_of_Stops', 'Fuel_Price', 'Passenger_Count']
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Flight_ID               45000 non-null  int64  
 1   Departure_City          45000 non-null  int64  
 2   Arrival_City            45000 non-null  int64  
 3   Distance                45000 non-null  float64
 4   Duration                45000 non-null  float64
 5   Aircraft_Type           45000 non-null  int64  
 6   Number_of_Stops         45000 non-null  float64
 7   Day_of_Week             45000 non-null  int64  
 8   Month_of_Travel         45000 non-null  int64  
 9   Holiday_Season          45000 non-null  int64  
 10  Demand                  45000 non-null  int64  
 11  Weather_Conditions      45000 non-null  int64  
 12  Passenger_Count         45000 non-null  float64
 13  Promotion_Type          45000 non-null  int64  
 14  Fuel_Price              45000 non-null

In [27]:
# Feature Selection using SelectKBest and f_regression
X = data.drop(['Flight_Price'], axis=1)
y = data['Flight_Price']

selector = SelectKBest(score_func=f_regression, k=10)  # You can adjust 'k' as needed
X_selected = selector.fit_transform(X, y)
selected_feature_indices = selector.get_support(indices=True)

In [28]:
# Get the names of selected features
selected_features = X.columns[selected_feature_indices]

In [29]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size=0.3, random_state=42)

In [30]:
# Initialize and train the XGBoost Regressor model
xgb_regressor = XGBRegressor()
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 3],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}
scorer = make_scorer(mean_absolute_error, greater_is_better=False)
random_search = RandomizedSearchCV(
    xgb_regressor,
    param_distributions=param_grid,
    scoring=scorer,
    cv=5,  # You can adjust the number of cross-validation folds
    n_iter=50,  # Adjust the number of iterations as needed
    verbose=0,  # Increase verbosity for progress updates
    n_jobs=-1,  # Utilize all available CPU cores
    random_state=42,  # Set a random seed for reproducibility
)

In [31]:
# Fit the random search to the data
random_search.fit(X_train, y_train)

best_xgb_model = random_search.best_estimator_
best_hyperparameters = random_search.best_params_

print(best_xgb_model)
print(best_hyperparameters)

y_pred = best_xgb_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Best Model MAE: {mae}')
print(f'Best Model RMSE: {rmse}')

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0.1, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=200, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
{'subsample': 1.0, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.8}
Best Model MAE: 9.809768711841725
Best Model RMSE: 13.186545801721415


In [32]:
test = pd.read_csv('/Users/skirar/Downloads/Projects/test.csv')

In [17]:
# Handle missing values
categorical_columns = ['Airline', 'Departure_City', 'Arrival_City', 'Demand', 'Aircraft_Type', 'Day_of_Week', 'Month_of_Travel','Holiday_Season','Weather_Conditions']

for column in categorical_columns:
    test[column].fillna('Missing', inplace=True)

numerical_columns = ['Distance', 'Fuel_Price']

for column in numerical_columns:
    test[column].fillna(test[column].median(), inplace=True)

data['Promotion_Type_Missing'] = test['Promotion_Type'].isnull().astype(int)

data['Fuel_Price'].fillna(test['Fuel_Price'].median(), inplace=True)

unique_cities = set(test['Departure_City'].unique()) | set(test['Arrival_City'].unique())
city_mapping = {city: label for label, city in enumerate(unique_cities)}
test['Departure_City'] = test['Departure_City'].map(city_mapping)
test['Arrival_City'] = test['Arrival_City'].map(city_mapping)
test['Flight_ID'] = test['Flight_ID'].str.extract('(\d+)').astype(int)

In [33]:
# Extract hour of departure and arrival from 'Departure_Time' and 'Arrival_Time'
# Convert 'Departure_Time' and 'Arrival_Time' to datetime objects
test['Departure_Time'] = pd.to_datetime(test['Departure_Time'])
test['Arrival_Time'] = pd.to_datetime(test['Arrival_Time'])

In [34]:
# Extract hour and minute features from 'Departure_Time' and 'Arrival_Time'
test['Departure_Hour'] = test['Departure_Time'].dt.hour
test['Departure_Minute'] = test['Departure_Time'].dt.minute
test['Arrival_Hour'] = test['Arrival_Time'].dt.hour
test['Arrival_Minute'] = test['Arrival_Time'].dt.minute

In [35]:
# Drop the original 'Departure_Time' and 'Arrival_Time' columns
test.drop(['Departure_Time', 'Arrival_Time'], axis=1, inplace=True)

In [36]:
# Convert categorical variables to numerical using label encoding
label_encoders = {}
categorical_columns = ['Aircraft_Type', 'Day_of_Week', 'Month_of_Travel', 'Demand','Holiday_Season', 'Weather_Conditions', 'Promotion_Type']

test = pd.get_dummies(test, columns=['Airline'], drop_first=True)

for column in categorical_columns:
    le = LabelEncoder()
    test[column] = le.fit_transform(test[column])
    label_encoders[column] = le

In [37]:
# Scaling/Normalizing numerical features
scaler = StandardScaler()
numeric_columns = ['Distance', 'Duration', 'Number_of_Stops', 'Fuel_Price', 'Passenger_Count']
test[numeric_columns] = scaler.fit_transform(test[numeric_columns])
test = test[['Distance', 'Duration', 'Aircraft_Type', 'Number_of_Stops',
       'Day_of_Week', 'Month_of_Travel', 'Holiday_Season', 'Demand',
       'Weather_Conditions', 'Fuel_Price']]
test.head()

Unnamed: 0,Distance,Duration,Aircraft_Type,Number_of_Stops,Day_of_Week,Month_of_Travel,Holiday_Season,Demand,Weather_Conditions,Fuel_Price
0,-0.904088,-0.695995,2,0.448898,2,1,3,2,0,0.485562
1,1.261435,1.195769,1,0.448898,4,0,1,0,0,0.977653
2,0.345752,0.530037,3,0.448898,3,4,1,1,2,-1.089127
3,0.908161,0.716803,1,-0.901578,4,7,1,1,2,0.141099
4,-1.656133,-1.79852,0,0.448898,2,1,3,0,1,1.272907


In [38]:
y_pred = best_xgb_model.predict(test)

In [40]:
# Create a DataFrame with 'Flight_ID' and 'Predicted_Price' columns
submission_df = pd.DataFrame({'Flight_ID': test.index, 'Flight_Price': y_pred})

# Save the DataFrame to a CSV file
submission_df.to_csv('/Users/skirar/Downloads/Projects/submission.csv', index=False)