## Importing all the necessary libraries

In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score




## Loading data set

In [None]:
data = pd.read_csv("flight.csv")

# Preprocessing

### removing columns Route,Dept_Time and Arrival_Time and also Additional_info since we dont need them

## 1. check for null values and handle them by replacing with mean,meadian or mode
## 2. encoding all categorical columns
## 3. scaling

In [None]:
data.isnull().sum()

Unnamed: 0,0
Airline,0
Date_of_Journey,0
Source,0
Destination,0
Route,1
Dep_Time,0
Arrival_Time,0
Duration,0
Total_Stops,1
Additional_Info,0


In [None]:
data['Route'].fillna(data['Route'].mode()[0], inplace=True)
data['Total_Stops'].fillna(data['Total_Stops'].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Route'].fillna(data['Route'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Total_Stops'].fillna(data['Total_Stops'].mode()[0],inplace=True)


In [None]:
data.isnull().sum()

Unnamed: 0,0
Airline,0
Date_of_Journey,0
Source,0
Destination,0
Route,0
Dep_Time,0
Arrival_Time,0
Duration,0
Total_Stops,0
Additional_Info,0


In [None]:
data = data.iloc[0:, [0,1, 2, 3,7,8,10]]  # Select first 5 rows and columns 0, 2, and 4
print(data)


           Airline Date_of_Journey    Source Destination Duration Total_Stops  \
0           IndiGo      24/03/2019  Banglore   New Delhi   2h 50m    non-stop   
1        Air India       1/05/2019   Kolkata    Banglore   7h 25m     2 stops   
2      Jet Airways       9/06/2019     Delhi      Cochin      19h     2 stops   
3           IndiGo      12/05/2019   Kolkata    Banglore   5h 25m      1 stop   
4           IndiGo      01/03/2019  Banglore   New Delhi   4h 45m      1 stop   
...            ...             ...       ...         ...      ...         ...   
10678     Air Asia       9/04/2019   Kolkata    Banglore   2h 30m    non-stop   
10679    Air India      27/04/2019   Kolkata    Banglore   2h 35m    non-stop   
10680  Jet Airways      27/04/2019  Banglore       Delhi       3h    non-stop   
10681      Vistara      01/03/2019  Banglore   New Delhi   2h 40m    non-stop   
10682    Air India       9/05/2019     Delhi      Cochin   8h 20m     2 stops   

       Price  
0       3897

In [None]:
data=pd.DataFrame(data)

In [None]:
data

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Duration,Total_Stops,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,2h 50m,non-stop,3897
1,Air India,1/05/2019,Kolkata,Banglore,7h 25m,2 stops,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,19h,2 stops,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,5h 25m,1 stop,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,4h 45m,1 stop,13302
...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,2h 30m,non-stop,4107
10679,Air India,27/04/2019,Kolkata,Banglore,2h 35m,non-stop,4145
10680,Jet Airways,27/04/2019,Banglore,Delhi,3h,non-stop,7229
10681,Vistara,01/03/2019,Banglore,New Delhi,2h 40m,non-stop,12648


In [None]:
le=LabelEncoder()
data['Airline']=le.fit_transform(data['Airline'])
data['Date_of_Journey']=le.fit_transform(data['Date_of_Journey'])
data['Source']=le.fit_transform(data['Source'])
data['Destination']=le.fit_transform(data['Destination'])
data['Duration']=le.fit_transform(data['Duration'])
data['Total_Stops']=le.fit_transform(data['Total_Stops'])

In [None]:
sc=StandardScaler()
data['Duration']=sc.fit_transform(data[['Duration']])

## splitting data

In [None]:
x=data.iloc[:,0:-1]
y=data.iloc[:,-1]
y=pd.DataFrame(y)
x

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Duration,Total_Stops
0,3,24,0,5,0.422875,4
1,1,6,3,0,1.306727,1
2,4,43,2,1,-0.810835,1
3,3,10,3,0,1.076557,0
4,3,0,0,5,1.002903,0
...,...,...,...,...,...,...
10678,0,41,3,0,0.386047,4
10679,1,29,3,0,0.395254,4
10680,4,29,0,2,0.791146,4
10681,10,0,0,5,0.404461,4


In [None]:
y

Unnamed: 0,Price
0,3897
1,7662
2,13882
3,6218
4,13302
...,...
10678,4107
10679,4145
10680,7229
10681,12648


## splitting data into train and test

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.8,random_state=16)

## tarining algorithms

### Linear Regression

In [None]:
lr=LinearRegression()
lr.fit(x_train,y_train)
pred=lr.predict(x_test)
r2_score(y_test,pred)

0.33262741334261403

### KNN

In [None]:
knn=KNeighborsRegressor()
knn.fit(x_train,y_train)
pred1=knn.predict(x_test)
r2_score(y_test,pred1)

0.6478538168036121

### Support Vector Regressor

In [None]:
svr=SVR()
svr.fit(x_train,y_train)
pred2=svr.predict(x_test)
r2_score(y_test,pred2)

  y = column_or_1d(y, warn=True)


-0.016005217995448184

### Naive Bayes Regressor

In [None]:
nb=GaussianNB()
nb.fit(x_train,y_train)
pred3=nb.predict(x_test)
r2_score(y_test,pred3)

  y = column_or_1d(y, warn=True)


0.4728760504858234

### Decision Tree Regressor

In [None]:
dtr=DecisionTreeRegressor()
dtr.fit(x_train,y_train)
pred4=dtr.predict(x_test)
r2_score(y_test,pred4)

0.5866286324917129

### Random Forest Regressor

In [None]:
rfr=RandomForestRegressor()
rfr.fit(x_train,y_train)
pred5=rfr.predict(x_test)
r2_score(y_test,pred5)

  return fit_method(estimator, *args, **kwargs)


0.6467789056764215

### Extreme Gradient Boosting

In [None]:
model = XGBRegressor()
model.fit(x_train, y_train)
pred5 = model.predict(x_test)
r2_score(y_test,pred5)

0.714992344379425

# From the above trained models we can say that Extreme Gradient Boosting is the best model for out data
## so we will create a simple ui using Gradio tool

In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

# Load dataset
df = pd.read_csv("flight.csv")  # Replace with your actual dataset

# Encode categorical features
airline_encoder = LabelEncoder()
df["Airline"] = airline_encoder.fit_transform(df["Airline"])
joblib.dump(airline_encoder, "airline_encoder.pkl")  # Save encoder

source_encoder = LabelEncoder()
df["Source"] = source_encoder.fit_transform(df["Source"])
joblib.dump(source_encoder, "source_encoder.pkl")  # Save encoder

destination_encoder = LabelEncoder()
df["Destination"] = destination_encoder.fit_transform(df["Destination"])
joblib.dump(destination_encoder, "destination_encoder.pkl")  # Save encoder

stops_encoder = LabelEncoder()
df["Total_Stops"] = stops_encoder.fit_transform(df["Total_Stops"])
joblib.dump(stops_encoder, "stops_encoder.pkl")  # Save encoder

# Convert "Duration" (e.g., "2h 30m") to float
def convert_duration(duration):
    hours, minutes = 0, 0
    parts = duration.split()
    for part in parts:
        if 'h' in part:
            hours = int(part.replace("h", ""))
        elif 'm' in part:
            minutes = int(part.replace("m", ""))
    return hours + (minutes / 60)  # Convert to decimal hours

df["Duration"] = df["Duration"].apply(convert_duration)

# Train XGBoost model
X = df[['Airline', 'Source', 'Destination', 'Duration', 'Total_Stops']]
y = df['Price']

model = XGBRegressor()
model.fit(X, y)

# Save trained model
joblib.dump(model, "xgboost_flight_price.pkl")
print("✅ Model and encoders saved successfully!")

✅ Model and encoders saved successfully!


In [41]:
import gradio as gr
import numpy as np
import pandas as pd
import joblib  # For saving/loading models

# Load pre-trained XGBoost model
try:
    model = joblib.load("xgboost_flight_price.pkl")
except FileNotFoundError:
    raise FileNotFoundError("Model file not found! Train and save the model first.")

# Load Label Encoders (Saved during training)
try:
    airline_encoder = joblib.load("airline_encoder.pkl")
    source_encoder = joblib.load("source_encoder.pkl")
    destination_encoder = joblib.load("destination_encoder.pkl")
except FileNotFoundError:
    raise FileNotFoundError("Label Encoder files not found! Save them during training.")

# Define available cities
source_cities = ["Delhi", "Mumbai", "Kolkata", "Banglore", "Chennai"]
destination_cities = ["Hyderabad", "Cochin", "Kolkata", "Banglore", "New Delhi", "Delhi"]

def update_destination(source):
    return [city for city in destination_cities if city != source]

# Function to preprocess input and predict price
def predict_price(Airline, Source, Destination, Duration, Total_Stops):
    try:
        # Convert categorical values using trained LabelEncoders
        Airline = airline_encoder.transform([Airline])[0]
        Source = source_encoder.transform([Source])[0]
        Destination = destination_encoder.transform([Destination])[0]

        # Convert Duration to minutes (assuming model was trained with minutes)
        Duration = float(Duration) * 60

        # Create feature array for prediction
        input_data = np.array([[Airline, Source, Destination, Duration, Total_Stops]])

        # Predict price
        predicted_price = model.predict(input_data)[0]
        return f"Predicted Flight Price: ₹{predicted_price:.2f}"

    except Exception as e:
        return f"Error in prediction: {str(e)}"

# Create Gradio Interface
with gr.Blocks() as iface:
    gr.Markdown("## ✈ Flight Price Predictor")
    gr.Markdown("Enter flight details to predict the price using the XGBoost model.")

    airline = gr.Dropdown(["IndiGo", "Air India", "Jet Airways", "SpiceJet", "Vistara"], label="Airline")
    source = gr.Dropdown(source_cities, label="Source")
    destination = gr.Dropdown(destination_cities, label="Destination")
    duration = gr.Number(label="Duration (in hours)")
    total_stops = gr.Slider(0, 3, step=1, label="Total Stops")
    output = gr.Textbox(label="Predicted Price")

    source.change(fn=update_destination, inputs=[source], outputs=[destination])
    predict_button = gr.Button("Predict Price")
    predict_button.click(fn=predict_price, inputs=[airline, source, destination, duration, total_stops], outputs=[output])

iface.launch(debug=True)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://3df8fca8563eb27797.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://3df8fca8563eb27797.gradio.live


