In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor  
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [2]:
#pip install pandas

In [3]:
#pip install scikit-learn==1.5.2 xgboost==1.7.6

In [4]:
import xgboost
import sklearn
print("XGBoost version:", xgboost.__version__)
print("Scikit-learn version:", sklearn.__version__)


XGBoost version: 1.7.6
Scikit-learn version: 1.5.2


In [5]:
#pip install openpyxl


In [6]:
## Step 1: loading and cleaning Functions
def load_and_clean_data(train_path, test_path):
    """Load and clean training and testing data."""
    def clean_data(df):
        df = df.drop_duplicates()
        df = df.fillna(method='ffill')  # Forward fill for missing values
        return df

    train_data = pd.read_excel(train_path)
    test_data = pd.read_excel(test_path)
    


    train_data = clean_data(train_data)
    test_data = clean_data(test_data)
    
    return train_data, test_data

In [7]:
train_data_path = r'C:\Users\user\Downloads\archive\Data_Train.xlsx'
test_data_path = r'C:\Users\user\Downloads\archive\Test_set.xlsx'
data_train, data_test = load_and_clean_data(train_data_path, test_data_path)


  df = df.fillna(method='ffill')  # Forward fill for missing values


In [8]:
data_train

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302
...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,non-stop,No info,4107
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,non-stop,No info,4145
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info,7229
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,12648


In [9]:
data_test.duplicated().sum()

0

In [10]:
data_test.isnull().values.any() 
#True : at least one null value is present.

False

In [11]:
data_train.duplicated().sum()

0

In [12]:
data_train.isnull().values.any()

False

In [13]:
# -------------------------------------------
# Step 2: Preprocess Functions
# -------------------------------------------
def preprocess_total_stops(value):
    value = str(value)
    if 'non-stop' in value:
        return 0
    else:
        try:
            return int(value.split()[0])
        except (ValueError, IndexError):
            return 0  

In [14]:
def preprocess_duration(value):
    value = str(value)
    hours = minutes = 0
    if 'h' in value:
        hours = int(value.split('h')[0].strip())
    if 'm' in value:
        minutes = int(value.split('h')[-1].replace('m', '').strip())
    return hours * 60 + minutes

In [15]:
def feature_engineering(df):
    # Apply preprocessing steps on the DataFrame
    # Total Stops and Duration
    df['Total_Stops'] = df['Total_Stops'].apply(preprocess_total_stops)
    df['Duration'] = df['Duration'].apply(preprocess_duration)
    # Extract Date Features
    df['Date_of_Journey'] = pd.to_datetime(df['Date_of_Journey'], dayfirst=True)
    df['Day'] = df['Date_of_Journey'].dt.day
    df['Month'] = df['Date_of_Journey'].dt.month
    # Extract Time Features
    df['Dep_Hour'] = pd.to_datetime(df['Dep_Time'], format='%H:%M', errors='coerce').dt.hour
    df['Dep_Minute'] = pd.to_datetime(df['Dep_Time'], format='%H:%M', errors='coerce').dt.minute
    df['Arrival_Hour'] = pd.to_datetime(df['Arrival_Time'], format='%H:%M', errors='coerce').dt.hour
    df['Arrival_Minute'] = pd.to_datetime(df['Arrival_Time'], format='%H:%M', errors='coerce').dt.minute
    # Drop unnecessary columns
    drop_cols = ['Route', 'Date_of_Journey', 'Dep_Time', 'Arrival_Time']
    df = df.drop(columns=drop_cols)
    return df

In [16]:
# Feature Engineering
data_train = feature_engineering(data_train)
data_test = feature_engineering(data_test)


In [17]:
data_train

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Day,Month,Dep_Hour,Dep_Minute,Arrival_Hour,Arrival_Minute
0,IndiGo,Banglore,New Delhi,170,0,No info,3897,24,3,22,20,,
1,Air India,Kolkata,Banglore,445,2,No info,7662,1,5,5,50,13.0,15.0
2,Jet Airways,Delhi,Cochin,1140,2,No info,13882,9,6,9,25,,
3,IndiGo,Kolkata,Banglore,325,1,No info,6218,12,5,18,5,23.0,30.0
4,IndiGo,Banglore,New Delhi,285,1,No info,13302,1,3,16,50,21.0,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,Kolkata,Banglore,150,0,No info,4107,9,4,19,55,22.0,25.0
10679,Air India,Kolkata,Banglore,155,0,No info,4145,27,4,20,45,23.0,20.0
10680,Jet Airways,Banglore,Delhi,180,0,No info,7229,27,4,8,20,11.0,20.0
10681,Vistara,Banglore,New Delhi,160,0,No info,12648,1,3,11,30,14.0,10.0


In [18]:
data_test

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Day,Month,Dep_Hour,Dep_Minute,Arrival_Hour,Arrival_Minute
0,Jet Airways,Delhi,Cochin,655,1,No info,6,6,17,30,,
1,IndiGo,Kolkata,Banglore,240,1,No info,12,5,6,20,10.0,20.0
2,Jet Airways,Delhi,Cochin,1425,1,In-flight meal not included,21,5,19,15,,
3,Multiple carriers,Delhi,Cochin,780,1,No info,21,5,8,0,21.0,0.0
4,Air Asia,Banglore,Delhi,170,0,No info,24,6,23,55,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2666,Air India,Kolkata,Banglore,1435,1,No info,6,6,20,30,,
2667,IndiGo,Kolkata,Banglore,155,0,No info,27,3,14,20,16.0,55.0
2668,Jet Airways,Delhi,Cochin,395,1,No info,6,3,21,50,,
2669,Air India,Delhi,Cochin,915,1,No info,6,3,4,0,19.0,15.0


In [19]:
X = data_train.drop(columns=['Price'])
y = data_train['Price']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
X_train

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Day,Month,Dep_Hour,Dep_Minute,Arrival_Hour,Arrival_Minute
6615,Air Asia,Delhi,Cochin,660,1,No info,9,3,20,10,,
6729,Jet Airways,Kolkata,Banglore,395,1,No info,18,5,17,0,23.0,35.0
3475,Jet Airways,Kolkata,Banglore,850,1,In-flight meal not included,18,5,8,25,22.0,35.0
8977,IndiGo,Delhi,Cochin,860,1,No info,15,5,6,40,21.0,0.0
3494,Jet Airways,Banglore,Delhi,180,0,In-flight meal not included,6,5,8,20,11.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5799,IndiGo,Banglore,Delhi,170,0,No info,3,6,8,30,11.0,20.0
5247,Jet Airways,Banglore,New Delhi,860,1,In-flight meal not included,21,3,7,0,21.0,20.0
5452,IndiGo,Delhi,Cochin,300,1,No info,3,4,10,35,15.0,35.0
861,IndiGo,Delhi,Cochin,195,0,No info,27,4,5,35,8.0,50.0


In [21]:
y_train

6615     8327
6729    14151
3475    10844
8977     7191
3494     4544
        ...  
5799     4823
5247     7832
5452     5073
861      6015
7367     3543
Name: Price, Length: 8370, dtype: int64

In [22]:
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

In [23]:
categorical_cols

Index(['Airline', 'Source', 'Destination', 'Additional_Info'], dtype='object')

In [24]:
numerical_cols

Index(['Duration', 'Total_Stops', 'Day', 'Month', 'Dep_Hour', 'Dep_Minute',
       'Arrival_Hour', 'Arrival_Minute'],
      dtype='object')

In [25]:
# Build Pipeline
def build_pipeline(numerical_cols, categorical_cols):
    preprocessor = ColumnTransformer(transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols)
    ])
    return preprocessor

In [26]:
preprocessor = build_pipeline(numerical_cols, categorical_cols)


In [27]:
print(preprocessor)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 Index(['Duration', 'Total_Stops', 'Day', 'Month', 'Dep_Hour', 'Dep_Minute',
       'Arrival_Hour', 'Arrival_Minute'],
      dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('encoder',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 Index(['Airline', 'Source', 'Destination', 'Additional_Info'], dtype='object'))])


In [28]:
# Model Training and Fine-Tuning

def train_and_evaluate_models(X_train, y_train, X_valid, y_valid, preprocessor):
    """Train models, perform fine-tuning, and evaluate performance."""
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=42),
        'XGBoost': XGBRegressor(random_state=42)
    }

    param_grid_rf = {
        'model__n_estimators': [100, 200],
        'model__max_depth': [10, 20],
    }

    param_grid_xgb = {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1],
        'model__max_depth': [6, 10],
    }

    model_performance = {}
    best_model = None
    best_score = -np.inf

    for name, model in models.items():
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        # Fine-tune Random Forest and XGBoost
        if name == 'Random Forest':
            grid_search = GridSearchCV(pipeline, param_grid_rf, cv=3, scoring='r2', n_jobs=-1)
        elif name == 'XGBoost':
            grid_search = GridSearchCV(pipeline, param_grid_xgb, cv=3, scoring='r2', n_jobs=-1)
        else:
            grid_search = pipeline

        # Train the model
        if name in ['Random Forest', 'XGBoost']:
            grid_search.fit(X_train, y_train)
            best_pipeline = grid_search.best_estimator_
        else:
            pipeline.fit(X_train, y_train)
            best_pipeline = pipeline

        # Predict and Evaluate
        y_pred = best_pipeline.predict(X_valid)
        mae = mean_absolute_error(y_valid, y_pred)
        mse = mean_squared_error(y_valid, y_pred)
        r2 = r2_score(y_valid, y_pred)

        model_performance[name] = {'MAE': mae, 'MSE': mse, 'R2': r2}
        print(f"{name} Performance:")
        print(f"  MAE: {mae:.2f}, MSE: {mse:.2f}, R2: {r2:.4f}")

        if r2 > best_score:
            best_score = r2
            best_model = best_pipeline

    return best_model, model_performance
 

In [29]:
best_model, performance = train_and_evaluate_models(X_train, y_train, X_valid, y_valid, preprocessor)

Linear Regression Performance:
  MAE: 1749.86, MSE: 6766243.42, R2: 0.6757
Random Forest Performance:
  MAE: 663.34, MSE: 2241617.97, R2: 0.8926
XGBoost Performance:
  MAE: 686.94, MSE: 2010889.47, R2: 0.9036


In [30]:
performance

{'Linear Regression': {'MAE': 1749.8632154721267,
  'MSE': 6766243.419808761,
  'R2': 0.675708468969877},
 'Random Forest': {'MAE': 663.3431823932001,
  'MSE': 2241617.96612966,
  'R2': 0.8925640599785925},
 'XGBoost': {'MAE': 686.938941631144,
  'MSE': 2010889.4689073535,
  'R2': 0.9036223888397217}}

In [31]:
best_model

In [32]:
pipeline_path = r'C:\Users\user\Downloads\archive\flight_fare_prediction_pipeline.pkl'
joblib.dump(best_model, pipeline_path)

['C:\\Users\\user\\Downloads\\archive\\flight_fare_prediction_pipeline.pkl']

In [33]:
pipeline_path

'C:\\Users\\user\\Downloads\\archive\\flight_fare_prediction_pipeline.pkl'

In [34]:
best_model
