In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import joblib
import tkinter as tk
from tkinter import ttk
from PIL import Image, ImageTk
from tkcalendar import DateEntry

In [51]:
# Load the flight dataset from "Cleaned.csv" (path is relative to the working directory).
df = pd.read_csv("Cleaned.csv")

In [52]:
# Lift the column cap so wide frames render every column, then preview the first rows.
pd.options.display.max_columns = None
df.head(n=5)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day,Month,Year,Dep_Hr,Dep_Min,Arr_Hr,Arr_Min,Duration_Hr,Duration_Min
0,IndiGo,24-03-2019,Banglore,New Delhi,BLR → DEL,22:20,01:10,02:50,0,No Info,3897,24,3,2019,22,20,1,10,2,50.0
1,Air India,01-05-2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,07:25,2,No Info,7662,1,5,2019,5,50,13,15,7,25.0
2,Jet Airways,09-06-2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25,19h,2,No Info,13882,9,6,2019,9,25,4,25,19h,
3,IndiGo,12-05-2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,05:25,1,No Info,6218,12,5,2019,18,5,23,30,5,25.0
4,IndiGo,01-03-2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,04:45,1,No Info,13302,1,3,2019,16,50,21,35,4,45.0


In [53]:
# (rows, columns) of the loaded frame.
df.shape

(10682, 20)

In [54]:
# Number of rows per airline.
df['Airline'].value_counts()

Airline
Jet Airways                          3849
IndiGo                               2053
Air India                            1751
Multiple carriers                    1196
SpiceJet                              818
Vistara                               479
Air Asia                              319
GoAir                                 194
Multiple carriers Premium economy      13
Jet Airways Business                    6
Vistara Premium economy                 3
Trujet                                  1
Name: count, dtype: int64

In [55]:
# Number of rows per departure city.
df['Source'].value_counts()

Source
Delhi       4536
Kolkata     2871
Banglore    2197
Mumbai       697
Chennai      381
Name: count, dtype: int64

In [57]:
# Number of rows per arrival city.
df['Destination'].value_counts()

Destination
Cochin       4536
Banglore     2871
Delhi        1265
New Delhi     932
Hyderabad     697
Kolkata       381
Name: count, dtype: int64

In [58]:
# Number of rows per stop count.
df['Total_Stops'].value_counts()

Total_Stops
1    5625
0    3491
2    1520
3      45
4       1
Name: count, dtype: int64

In [44]:
# Drop, in place, every row that has a missing value in ANY column.
# NOTE(review): this also discards rows whose only gap is in a column the model
# never uses (the preview shows a NaN Duration_Min on a row with a valid Price).
# Consider `subset=` limited to the feature/target columns - confirm that no
# later cell relies on the other columns being complete before changing it.
df.dropna(inplace = True)

In [45]:
# Count missing values per column.
df.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
Day                0
Month              0
Year               0
Dep_Hr             0
Dep_Min            0
Arr_Hr             0
Arr_Min            0
Duration_Hr        0
Duration_Min       0
dtype: int64

In [46]:
# Number of rows per airline in the current state of `df`.
df['Airline'].value_counts()

Airline
Jet Airways                          3367
IndiGo                               1932
Air India                            1628
Multiple carriers                     945
SpiceJet                              813
Vistara                               462
Air Asia                              306
GoAir                                 177
Multiple carriers Premium economy      11
Jet Airways Business                    5
Vistara Premium economy                 3
Trujet                                  1
Name: count, dtype: int64

In [7]:
# Convert categorical data to numerical data using LabelEncoder.
#
# Every column gets its OWN fitted encoder, stored in `encoders`, so the
# original-value <-> integer-code mapping of each feature remains available
# after this cell (e.g. `encoders['Airline'].transform([...])` when encoding
# new inputs for prediction, or `.inverse_transform([...])` to decode).
# Re-fitting a single shared encoder would leave only the last column's
# mapping on the encoder object.
#
# Run once per freshly loaded `df`: re-running on already-encoded columns
# would replace the class arrays below with plain integer codes.
encoders = {}
for column in ['Airline', 'Source', 'Destination', 'Total_Stops']:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Sorted unique original values of each column; the value at position i is the
# one that was encoded as integer i.
airline_classes = encoders['Airline'].classes_
source_classes = encoders['Source'].classes_
destination_classes = encoders['Destination'].classes_
stopage_classes = encoders['Total_Stops'].classes_

# Backward-compatible alias: as before, `label_encoder` ends up being the
# encoder fitted on Total_Stops.
label_encoder = encoders['Total_Stops']

In [8]:
# Model inputs (column order is the order the estimators are trained on)
# and the regression target.
features = [
    'Airline',
    'Day',
    'Month',
    'Year',
    'Source',
    'Destination',
    'Total_Stops',
]
X = df.loc[:, features]
y = df.loc[:, 'Price']

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Candidate regressors keyed by the display name used when reporting results.
# The pair order below is the dict's insertion order, i.e. the order in which
# the models are later iterated.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Support Vector Regression', SVR()),
    ('K-Nearest Neighbors', KNeighborsRegressor()),
    ('XGBoost', xgb.XGBRegressor(n_estimators=100, random_state=42)),
    ('AdaBoost', AdaBoostRegressor(n_estimators=100, random_state=42)),
])

In [11]:
# Define a function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    The estimator is fitted in place, so the object passed in is trained
    when this function returns.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : training features and targets.
    X_test, y_test : held-out features and targets used for scoring.

    Returns
    -------
    tuple
        ``(mae, mse, r2)``: mean absolute error, mean squared error and
        R2 score of the test-set predictions, in that order.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    scorers = (mean_absolute_error, mean_squared_error, r2_score)
    return tuple(score(y_test, predictions) for score in scorers)

In [12]:
# Fit and score every candidate, filing its metrics under the model's display name.
results = {}
for name, model in models.items():
    mae, mse, r2 = evaluate_model(model, X_train, X_test, y_train, y_test)
    results[name] = dict(MAE=mae, MSE=mse, R2=r2)

In [13]:
# Report each model's metrics as a four-line block followed by a separator rule.
for name, metrics in results.items():
    print(
        f"Model: {name}",
        f"Mean Absolute Error: {metrics['MAE']}",
        f"Mean Squared Error: {metrics['MSE']}",
        f"R2 Score: {metrics['R2']}",
        "="*30,
        sep="\n",
    )

Model: Linear Regression
Mean Absolute Error: 2519.3601220393207
Mean Squared Error: 11552746.70784508
R2 Score: 0.4461337331916272
Model: Random Forest
Mean Absolute Error: 1399.3951546197825
Mean Squared Error: 4367189.427772019
R2 Score: 0.7906265093510161
Model: Gradient Boosting
Mean Absolute Error: 1539.4132412842064
Mean Squared Error: 4671368.754301711
R2 Score: 0.776043426012845
Model: Decision Tree
Mean Absolute Error: 1416.3728077179733
Mean Squared Error: 4613783.173254769
R2 Score: 0.7788042162909529
Model: Support Vector Regression
Mean Absolute Error: 3660.904133870433
Mean Squared Error: 21476770.809421062
R2 Score: -0.029647682246457308
Model: K-Nearest Neighbors
Mean Absolute Error: 1529.1825906735753
Mean Squared Error: 5538760.924683938
R2 Score: 0.7344585739064469
Model: XGBoost
Mean Absolute Error: 1382.3007398219925
Mean Squared Error: 4318414.926136808
R2 Score: 0.7929648756980896
Model: AdaBoost
Mean Absolute Error: 2475.8634511732607
Mean Squared Error: 121660

In [14]:
# Pick the model with the lowest mean absolute error
# (the first one in evaluation order wins a tie).
best_model_name = min(
    results.keys(),
    key=lambda candidate: results[candidate]['MAE'],
)
best_model = models[best_model_name]
print(f"Best model based on MAE: {best_model_name}")

Best model based on MAE: XGBoost


In [15]:
# Persist the selected estimator to disk; the call's return value
# (the list of written file names) is shown as the cell output.
joblib.dump(value=best_model, filename='best_flight_fare_model.pkl')

['best_flight_fare_model.pkl']