In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
from warnings import filterwarnings
filterwarnings('ignore')

# Scikit-learn and other ML libraries
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
import itertools

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split

In [2]:
data=pd.read_csv("final_data.csv")
data.head()

Unnamed: 0,Total_Stops,Price,Journey_day,Journey_month,Dep_hour,Dep_min,Arrival_hour,Arrival_min,Total_Duration_minutes,Airline_Air Asia,...,Source_Banglore,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Banglore,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata
0,0,3897,24,3,22,20,1,10,170,False,...,True,False,False,False,False,False,False,True,False,False
1,2,7662,1,5,5,50,13,15,445,False,...,False,False,False,True,False,True,False,False,False,False
2,2,13882,9,6,9,25,4,25,1140,False,...,False,False,True,False,False,False,True,False,False,False
3,1,6218,12,5,18,5,23,30,325,False,...,False,False,False,True,False,True,False,False,False,False
4,1,13302,1,3,16,50,21,35,285,False,...,True,False,False,False,False,False,False,True,False,False


**Separating Independent Variables(X) and Dependent Variable(y)**

In [3]:
X = data.drop(columns = ["Price"])
y = np.log(data["Price"])

In [4]:
X.head()

Unnamed: 0,Total_Stops,Journey_day,Journey_month,Dep_hour,Dep_min,Arrival_hour,Arrival_min,Total_Duration_minutes,Airline_Air Asia,Airline_Air India,...,Source_Banglore,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Banglore,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata
0,0,24,3,22,20,1,10,170,False,False,...,True,False,False,False,False,False,False,True,False,False
1,2,1,5,5,50,13,15,445,False,True,...,False,False,False,True,False,True,False,False,False,False
2,2,9,6,9,25,4,25,1140,False,False,...,False,False,True,False,False,False,True,False,False,False
3,1,12,5,18,5,23,30,325,False,False,...,False,False,False,True,False,True,False,False,False,False
4,1,1,3,16,50,21,35,285,False,False,...,True,False,False,False,False,False,False,True,False,False


In [5]:
y

0        8.267962
1        8.944028
2        9.538348
3        8.735204
4        9.495670
           ...   
10458    8.320448
10459    8.329658
10460    8.885856
10461    9.445254
10462    9.371864
Name: Price, Length: 10463, dtype: float64

In [6]:
X.columns

Index(['Total_Stops', 'Journey_day', 'Journey_month', 'Dep_hour', 'Dep_min',
       'Arrival_hour', 'Arrival_min', 'Total_Duration_minutes',
       'Airline_Air Asia', 'Airline_Air India', 'Airline_GoAir',
       'Airline_IndiGo', 'Airline_Jet Airways', 'Airline_Jet Airways Business',
       'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
       'Source_Banglore', 'Source_Chennai', 'Source_Delhi', 'Source_Kolkata',
       'Source_Mumbai', 'Destination_Banglore', 'Destination_Cochin',
       'Destination_Delhi', 'Destination_Hyderabad', 'Destination_Kolkata'],
      dtype='object')

In [7]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the data
X = scaler.fit_transform(X)

# Check the scaled data
print(X)

[[-1.21467398  1.24453948 -1.46216991 ...  1.94980612 -0.26715175
  -0.19439681]
 [ 1.81311252 -1.47188969  0.25649366 ... -0.5128715  -0.26715175
  -0.19439681]
 [ 1.81311252 -0.52704476  1.11582544 ... -0.5128715  -0.26715175
  -0.19439681]
 ...
 [-1.21467398  1.59885633 -0.60283812 ...  1.94980612 -0.26715175
  -0.19439681]
 [-1.21467398 -1.47188969 -1.46216991 ...  1.94980612 -0.26715175
  -0.19439681]
 [ 1.81311252 -0.52704476  0.25649366 ... -0.5128715  -0.26715175
  -0.19439681]]


In [8]:
# Splitting the data into train test 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [9]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """
    Evaluate a model's performance on the given training and testing data.
    
    Parameters:
    model: A machine learning model instance (e.g., LinearRegression, RandomForest, etc.)
    X_train: Training features (numpy array, DataFrame, or similar structure)
    y_train: Training target values
    X_test: Testing features
    y_test: Testing target values

    Returns:
    tuple: Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R¬≤) score.
    """
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the test data
    predictions = model.predict(X_test)
    
    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(y_test, predictions)
    
    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_test, predictions)
    
    # Calculate R-squared score (R¬≤)
    r2 = r2_score(y_test, predictions)
    
    return mse, mae, r2

In [10]:
# Example usage with different models:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "KNeighbors Regressor": KNeighborsRegressor(n_neighbors=3),
    "DecisionTree Regressor": DecisionTreeRegressor(max_depth=8),
    "RandomForest Regressor": RandomForestRegressor(),
}

# Assuming X_train, y_train, X_test, y_test are already defined
for model_name, model in models.items():
    mse, mae, r2 = evaluate_model(model, X_train, y_train, X_test, y_test)
    print(f"{model_name}:")
    print(f"  Mean Squared Error: {mse:.4f}")
    print(f"  Mean Absolute Error: {mae:.4f}")
    print(f"  R-squared Score: {r2:.4f}")
    print("-" * 40)

Linear Regression:
  Mean Squared Error: 11166008645733251144482816.0000
  Mean Absolute Error: 59642154310.7078
  R-squared Score: -42698715107248907086725120.0000
----------------------------------------
Ridge Regression:
  Mean Squared Error: 0.0770
  Mean Absolute Error: 0.2081
  R-squared Score: 0.7056
----------------------------------------
Lasso Regression:
  Mean Squared Error: 0.2616
  Mean Absolute Error: 0.4299
  R-squared Score: -0.0005
----------------------------------------
KNeighbors Regressor:
  Mean Squared Error: 0.0482
  Mean Absolute Error: 0.1492
  R-squared Score: 0.8159
----------------------------------------
DecisionTree Regressor:
  Mean Squared Error: 0.0435
  Mean Absolute Error: 0.1549
  R-squared Score: 0.8337
----------------------------------------
RandomForest Regressor:
  Mean Squared Error: 0.0360
  Mean Absolute Error: 0.1255
  R-squared Score: 0.8624
----------------------------------------


# Ada Boost

In [11]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Create AdaBoost regressor
ada_reg = AdaBoostRegressor()

# Fit AdaBoost regressor to the training data
ada_reg.fit(X_train_imputed, y_train)

# Make predictions
y_pred = ada_reg.predict(X_test_imputed)

# Calculate R-squared score
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('MSE',mean_squared_error(y_test,y_pred))

R2 score 0.6695605541813572
MAE 0.2444278810155481
MSE 0.08641219529989733


# Xg Boost

In [12]:
# Importing necessary libraries
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Creating an XGBRegressor model
model = xgb.XGBRegressor()

# Fitting the model to the training data with early stopping
model.fit(X_train, y_train)

# Making predictions on the testing data
y_pred = model.predict(X_test)

# Calculating evaluation metrics
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

r2 = r2_score(y_test, y_pred)
print("R-squared Score:", r2)


Mean Squared Error: 0.03093563946795036
Mean Absolute Error: 0.12369214975369949
R-squared Score: 0.8817023971401523


# GradientBoostingRegressor

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

# Creating a Gradient Boosting regressor
regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)

# Training the regressor on the training data
regressor.fit(X_train, y_train)

# Making predictions on the testing data
y_pred = regressor.predict(X_test)

# Calculating the mean squared error of the regressor
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Calculating the mean absolute error of the model
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

# Calculate R-squared score
r2 = r2_score(y_test, y_pred)
print("R-squared Score:", r2)

Mean Squared Error: 0.03406855771414054
Mean Absolute Error: 0.13976824881094183
R-squared Score: 0.8697221463726148


# XGBoost with Hyperparameter Tuning


In [14]:
# Importing necessary libraries
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Converting training and testing datasets to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Setting initial parameters for XGBoost regression
params = {
    'objective': 'reg:squarederror',  # objective function for regression
    'eta': 0.05,  # reduced learning rate for better performance
    'max_depth': 6,  # slightly increased depth for more complex patterns
    'min_child_weight': 3,  # adjusted for reducing overfitting
    'subsample': 0.8,  # higher subsampling ratio
    'colsample_bytree': 0.8,  # higher feature sampling ratio
    'seed': 42  # random seed
}

# Setting up early stopping and cross-validation
num_rounds = 1000  # set a high number for boosting rounds
early_stopping_rounds = 50  # stops training if no improvement for 50 rounds

# Cross-validation for finding the best number of boosting rounds
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_rounds,
    nfold=5,  # 5-fold cross-validation
    metrics="rmse",
    early_stopping_rounds=early_stopping_rounds,
    seed=42
)

# Optimal number of boosting rounds
optimal_rounds = len(cv_results)
print("Optimal Boosting Rounds:", optimal_rounds)

# Train the model with the optimal number of boosting rounds
model = xgb.train(params, dtrain, num_boost_round=optimal_rounds)

# Making predictions on the testing data
y_pred = model.predict(dtest)

# Calculating evaluation metrics
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

r2 = r2_score(y_test, y_pred)
print("R-squared Score:", r2)

Optimal Boosting Rounds: 382
Mean Squared Error: 0.029334917111118405
Mean Absolute Error: 0.122927390850675
R-squared Score: 0.8878235448168819


In [15]:
# Calculating the R-squared score for the training set
y_train_pred = model.predict(dtrain)
train_r2 = r2_score(y_train, y_train_pred)
print(f'The R-squared score on the training set is: {train_r2 * 100:.2f}%')

# The R-squared score for the test set has already been calculated
print(f'The R-squared score on the test set is: {r2 * 100:.2f}%')

The R-squared score on the training set is: 93.80%
The R-squared score on the test set is: 88.78%


In [16]:
import pickle

# Save the trained model to a pickle file
with open('Best_Model.pkl', 'wb') as file:
    pickle.dump(model, file)

print("Model saved to 'Best_Model.pkl'")

Model saved to 'Best_Model.pkl'


In [17]:
# Save the scaler and encoder
pickle.dump(scaler, open("scaler.pkl", "wb"))

## üèÜ Best Model Selection

Among all the models evaluated, the **XGBoost Regressor** performed the best with the following hyperparameters:

| **Hyperparameter**       | **Value** |
|---------------------------|-----------|
| **Objective**            | `reg:squarederror` |
| **Learning Rate (`eta`)** | `0.05`    |
| **Maximum Depth (`max_depth`)** | `6`    |
| **Minimum Child Weight (`min_child_weight`)** | `3`    |
| **Subsample Ratio (`subsample`)** | `0.8`  |
| **Column Subsample Ratio (`colsample_bytree`)** | `0.8`  |
| **Random Seed**           | `42`      |

### üéØ Performance Metrics
- **R-squared Score**: `88.78%` (on the test set)

This configuration provided an **optimal balance between bias and variance**, making it the most suitable model for this dataset.

---

### ‚öôÔ∏è Model Fine-Tuning

- The model was **fine-tuned** using **cross-validation** to determine the optimal number of boosting rounds.
- After tuning, the **optimal number of boosting rounds** was set to **382**, based on **early stopping criteria** to prevent overfitting.

---

By leveraging these hyperparameters and fine-tuning techniques, the **XGBoost Regressor** achieved excellent performance for the dataset.

 dataset.
