In [10]:
# Importing the necessary libraries.

import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import joblib

In [5]:
# Load the cleaned flight-fare dataset from its project-relative CSV file.
csv_path = 'flight_fare_dataset/Cleaned_dataset.csv'
df = pd.read_csv(csv_path)

In [108]:
# Inspect which distinct categories occur in each categorical column
# before any encoding is applied.
days = df['Journey_day'].unique()
classes = df['Class'].unique()
stops = df['Total_stops'].unique()
arrivals = df['Arrival'].unique()
departures = df['Departure'].unique()

# Print every label/value pair from one table instead of repeated print calls.
for label, values in [
    ("Unique Journey Days:", days),
    ("Unique Classes:", classes),
    ("Unique Total Stops:", stops),
    ("Unique arrivals:", arrivals),
    ("Unique Departure:", departures),
]:
    print(label, values)

Unique Journey Days: ['Monday' 'Tuesday' 'Wednesday' 'Thursday' 'Friday' 'Saturday' 'Sunday']
Unique Classes: ['Economy' 'Premium Economy' 'Business' 'First']
Unique Total Stops: ['non-stop' '1-stop' '2+-stop']
Unique arrivals: ['After 6 PM' 'Before 6 AM' '6 AM - 12 PM' '12 PM - 6 PM']
Unique Departure: ['After 6 PM' 'Before 6 AM' '12 PM - 6 PM' '6 AM - 12 PM']


In [6]:
# Keep only the columns needed for model training: the predictors plus the
# `Fare` target.
#
# `.copy()` makes the selection an independent DataFrame. Without it, pandas
# marks the result as a potential copy of the original frame, and the column
# assignments made when encoding the categoricals can emit
# SettingWithCopyWarning.
df = df[[
    "Journey_day",
    "Class",
    "Total_stops",
    "Duration_in_hours", "Arrival",
    "Departure", "Fare"]].copy()

df.head(10)

Unnamed: 0,Journey_day,Class,Total_stops,Duration_in_hours,Arrival,Departure,Fare
0,Monday,Economy,non-stop,2.0833,After 6 PM,After 6 PM,5335
1,Monday,Economy,non-stop,2.3333,Before 6 AM,After 6 PM,5899
2,Monday,Economy,non-stop,2.1667,Before 6 AM,After 6 PM,5801
3,Monday,Economy,non-stop,2.0833,After 6 PM,After 6 PM,5794
4,Monday,Economy,non-stop,2.1667,After 6 PM,After 6 PM,5955
5,Monday,Economy,non-stop,2.25,After 6 PM,After 6 PM,5955
6,Monday,Economy,non-stop,2.25,Before 6 AM,After 6 PM,5955
7,Monday,Economy,non-stop,2.25,After 6 PM,After 6 PM,5899
8,Monday,Economy,1-stop,14.3333,6 AM - 12 PM,After 6 PM,5829
9,Monday,Economy,non-stop,2.0833,After 6 PM,After 6 PM,5899


In [None]:
# Dataset information: prints the index range, each column's non-null count
# and dtype, and the frame's memory usage. Handy for confirming that no column
# has missing values before the categorical columns are encoded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452088 entries, 0 to 452087
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Journey_day        452088 non-null  object 
 1   Class              452088 non-null  object 
 2   Total_stops        452088 non-null  object 
 3   Duration_in_hours  452088 non-null  float64
 4   Arrival            452088 non-null  object 
 5   Departure          452088 non-null  object 
 6   Fare               452088 non-null  int64  
dtypes: float64(1), int64(1), object(5)
memory usage: 24.1+ MB


In [7]:
# Manual ordinal encoding (prioritisation) of the categorical columns.
#
# The integer codes below are the author's chosen rankings. They are kept
# exactly as they were so that anything relying on the same encoding keeps
# working.


def _encode_ordinal(column, mapping):
    """Map the categories of ``column`` to integer codes, failing loudly on gaps.

    ``Series.map`` returns NaN for every value that is not a key of
    ``mapping`` (an unexpected category, or a column that has already been
    encoded because this cell was run twice). Those NaNs would otherwise flow
    silently into model training, so they are turned into an explicit error.

    Parameters
    ----------
    column : pandas.Series
        Categorical column to encode.
    mapping : dict
        Category label -> integer code.

    Returns
    -------
    pandas.Series
        Integer-coded column (dtype int64) sharing ``column``'s index.

    Raises
    ------
    ValueError
        If ``column`` holds a value (including a missing value) that has no
        entry in ``mapping``.
    """
    encoded = column.map(mapping)
    unmapped = column[encoded.isna()].unique().tolist()
    if unmapped:
        raise ValueError(
            f"Column {column.name!r} has values with no encoding: {unmapped}. "
            "If this cell was already executed, reload the raw data first."
        )
    return encoded.astype('int64')


stops_priority = {
    'non-stop': 2,      # Best - most desirable
    '1-stop': 1,        # Medium
    '2+-stop': 0        # Worst - least desirable
}

# Class encoding, Economy lowest.
# NOTE(review): 'First' is ranked below 'Business' here; confirm this matches
# the fare ordering actually observed in the data.
class_priority = {
    'Economy': 1,
    'Premium Economy': 2,
    'First': 3,
    'Business': 4
}

# Time-of-day slots, shared by the Departure and Arrival columns.
# The cheap -> expensive ranking is the author's assumption.
priority = {
    'Before 6 AM': 0,        # Cheapest (very early morning)
    'After 6 PM': 1,         # Second cheapest (evening/red-eye)
    '6 AM - 12 PM': 2,       # Third cheapest (morning/mid-morning)
    '12 PM - 6 PM': 3        # Most expensive (afternoon/peak hours)
}

# Weekdays grouped into four price tiers (several days share a code).
days_priority = {
    'Tuesday': 0,        # Cheapest
    'Wednesday': 0,      # Cheapest
    'Saturday': 1,       # Second cheapest
    'Monday': 2,         # Medium price
    'Thursday': 2,       # Medium price
    'Friday': 3,         # Most expensive
    'Sunday': 3          # Most expensive
}

# All columns are validated and encoded before `df` is rebound, so a failure
# leaves the frame untouched rather than half-encoded. `assign` overwrites the
# existing columns in place, which preserves the column order.
df = df.assign(
    Total_stops=_encode_ordinal(df['Total_stops'], stops_priority),
    Class=_encode_ordinal(df['Class'], class_priority),
    Departure=_encode_ordinal(df['Departure'], priority),
    Arrival=_encode_ordinal(df['Arrival'], priority),
    Journey_day=_encode_ordinal(df['Journey_day'], days_priority),
)


In [112]:
# After changing the categorical columns to numerical, checking the unique values again:
days = df['Journey_day'].unique()
print("Unique Journey Days:", days)

classes = df['Class'].unique()
print("Unique Classes:", classes)  

stops = df['Total_stops'].unique()
print("Unique Total Stops:", stops)

arrivals = df['Arrival'].unique()
print("Unique arrivals:", arrivals) 

departures = df['Departure'].unique()
print("Unique Departure:", departures) 

Unique Journey Days: [2 0 3 1]
Unique Classes: [1 2 4 3]
Unique Total Stops: [2 1 0]
Unique arrivals: [1 0 2 3]
Unique Departure: [1 0 3 2]


In [8]:
# Separate the target (`Fare`) from the predictor columns.
target_column = 'Fare'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Fit an XGBoost regressor on the training split, then predict fares for the
# held-out test rows.

# Hyperparameters are collected in one dict so they can be read (or logged)
# as a unit before being handed to the estimator.
xgb_params = {
    "n_estimators": 600,        # number of boosted trees
    "learning_rate": 0.08,      # shrinkage applied to each tree's contribution
    "max_depth": 9,             # maximum depth of each tree
    "min_child_weight": 3,      # minimum instance weight required in a child node
    "subsample": 0.85,          # fraction of rows sampled per tree
    "colsample_bytree": 0.85,   # fraction of features sampled per tree
    "reg_alpha": 5,             # L1 regularization
    "reg_lambda": 10,           # L2 regularization
    "random_state": 42,         # fixed seed for reproducibility
    "n_jobs": -1,               # use all available cores
}

model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

y_pred_xgb = model.predict(X_test)

In [127]:
# Report how the fitted model performs on the held-out test split.
print("Final model trained.")

r2_xgb = r2_score(y_test, y_pred_xgb)
print("XGBoost Model R²:", r2_xgb)

mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
print("XGBoost Model MAE:", mae_xgb)

Final model trained.
XGBoost Model R²: 0.874138593673706
XGBoost Model MAE: 4525.5849609375


In [None]:
# Persist the trained estimator to disk with joblib so it can be reloaded
# later without retraining.
model_path = "model.pkl"
joblib.dump(model, model_path)
print("✅ Model saved successfully!")

✅ Model saved successfully!


In [11]:
# Persist the ordered list of training feature names next to the model, so
# the column order seen during training can be recovered later.
feature_columns = list(X_train.columns)
joblib.dump(feature_columns, "feature_columns.pkl")

print("✅ Feature columns saved successfully as 'feature_columns.pkl'")

✅ Feature columns saved successfully as 'feature_columns.pkl'
