In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random


In [2]:
airlines = ["IndiGo", "Air India", "Vistara", "SpiceJet", "GoAir", "AirAsia"]
cities = ["Delhi", "Mumbai", "Bangalore", "Chennai", "Kolkata", "Hyderabad"]

rows = 20000
SEED = 42  # fixed seed so a fresh-kernel re-run regenerates the exact same dataset

COLUMNS = [
    "airline", "source", "destination", "date",
    "duration_mins", "total_stops", "price"
]


def generate_flights(n_rows, seed=SEED):
    """Build a synthetic flight-price table with ``n_rows`` rows.

    Every row is drawn independently from a private ``random.Random(seed)``
    generator, so the same ``(n_rows, seed)`` pair always yields the same
    frame and the global ``random`` state is left untouched.

    Parameters
    ----------
    n_rows : int
        Number of flights to generate (0 gives an empty frame with the
        expected columns).
    seed : int, optional
        Seed for the private random generator. Defaults to ``SEED``.

    Returns
    -------
    pandas.DataFrame
        Columns: airline, source, destination, date, duration_mins,
        total_stops, price.
    """
    rng = random.Random(seed)
    records = []

    for _ in range(n_rows):
        airline = rng.choice(airlines)
        source = rng.choice(cities)
        # destination is always a different city than the source
        destination = rng.choice([c for c in cities if c != source])

        # random date in 2025 (offsets 0..364 cover Jan 1 through Dec 31)
        date = datetime(2025, 1, 1) + timedelta(days=rng.randint(0, 364))

        # duration & stops
        duration_mins = rng.randint(60, 360)
        total_stops = rng.randint(0, 2)   # 0, 1, 2

        # ---- KEY LOGIC: 0 stops > 1 stop > 2 stops ----
        # non-stop flights get +4000, one stop +2000, two stops +0
        stops_effect = (2 - total_stops) * 2000

        # base price is drawn uniformly; it does NOT depend on the route,
        # airline, date or duration, so only total_stops carries signal
        base_price = rng.randint(3000, 7000)

        # final price with some noise
        price = base_price + stops_effect + rng.randint(-500, 500)

        records.append([
            airline, source, destination, date,
            duration_mins, total_stops, price
        ])

    return pd.DataFrame(records, columns=COLUMNS)


df = generate_flights(rows)

df.head()


Unnamed: 0,airline,source,destination,date,duration_mins,total_stops,price
0,AirAsia,Hyderabad,Chennai,2025-12-19,325,2,4384
1,Air India,Mumbai,Hyderabad,2025-04-09,170,0,10946
2,Air India,Delhi,Kolkata,2025-12-25,73,0,10423
3,Vistara,Mumbai,Delhi,2025-01-08,137,1,8699
4,IndiGo,Kolkata,Delhi,2025-03-27,140,2,3833


In [3]:
# Persist the generated dataset (without the index column) so it can be
# reused outside this notebook.
csv_path = "flight_prices_predict.csv"
df.to_csv(csv_path, index=False)

In [4]:
# Derive the model features from the raw columns:
#   - day / month / year: calendar parts of the flight date
#   - route: "<source>_<destination>" label
#   - duration_hours: duration_mins expressed in hours
# The `date` column itself is dropped once its parts are extracted.
# NOTE: this rebinds `df`, so re-running the cell on its own fails because
# `date` is already gone -- re-run the data generation cell first.
flight_dates = pd.to_datetime(df['date'])

df = (
    df
    .assign(
        day=flight_dates.dt.day,
        month=flight_dates.dt.month,
        year=flight_dates.dt.year,
        route=df['source'] + "_" + df['destination'],
        duration_hours=df['duration_mins'] / 60,
    )
    .drop(columns=['date'])
)
df.head()

Unnamed: 0,airline,source,destination,duration_mins,total_stops,price,day,month,year,route,duration_hours
0,AirAsia,Hyderabad,Chennai,325,2,4384,19,12,2025,Hyderabad_Chennai,5.416667
1,Air India,Mumbai,Hyderabad,170,0,10946,9,4,2025,Mumbai_Hyderabad,2.833333
2,Air India,Delhi,Kolkata,73,0,10423,25,12,2025,Delhi_Kolkata,1.216667
3,Vistara,Mumbai,Delhi,137,1,8699,8,1,2025,Mumbai_Delhi,2.283333
4,IndiGo,Kolkata,Delhi,140,2,3833,27,3,2025,Kolkata_Delhi,2.333333


In [5]:
# One-hot encode the categorical columns. With drop_first=True the first
# level of each column gets no indicator column and acts as the baseline.
categorical_cols = ['airline', 'source', 'destination', 'route']

df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

df_encoded.head()


Unnamed: 0,duration_mins,total_stops,price,day,month,year,duration_hours,airline_AirAsia,airline_GoAir,airline_IndiGo,...,route_Kolkata_Bangalore,route_Kolkata_Chennai,route_Kolkata_Delhi,route_Kolkata_Hyderabad,route_Kolkata_Mumbai,route_Mumbai_Bangalore,route_Mumbai_Chennai,route_Mumbai_Delhi,route_Mumbai_Hyderabad,route_Mumbai_Kolkata
0,325,2,4384,19,12,2025,5.416667,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,170,0,10946,9,4,2025,2.833333,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,73,0,10423,25,12,2025,1.216667,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,137,1,8699,8,1,2025,2.283333,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,140,2,3833,27,3,2025,2.333333,False,False,True,...,False,False,True,False,False,False,False,False,False,False


In [6]:
from sklearn.model_selection import train_test_split

# Separate the target (`price`) from the predictor columns.
target = "price"
y = df_encoded[target]
X = df_encoded.drop(columns=[target])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
from sklearn.ensemble import RandomForestRegressor

# Hyperparameters gathered in one place: 200 trees capped at depth 6,
# a fixed seed for repeatable fits, and all available cores (n_jobs=-1).
rf_params = dict(
    n_estimators=200,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
)
model = RandomForestRegressor(**rf_params)

model.fit(X_train, y_train)


In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the fitted model on the held-out test split.
pred = model.predict(X_test)

mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
r2 = r2_score(y_test, pred)

# Report each metric on its own line.
for label, value in (("MAE:", mae), ("RMSE:", rmse), ("R2:", r2)):
    print(label, value)


MAE: 1017.5732937596885
RMSE: 1191.292645330015
R2: 0.6453384047701878


In [9]:
import joblib

# Persist the fitted estimator.
joblib.dump(model, "flight_model.pkl")

# Persist the training column names (in order) alongside the model so the
# same feature layout can be rebuilt when the model is loaded elsewhere.
feature_names = X.columns.tolist()
joblib.dump(feature_names, "model_columns.pkl")


['model_columns.pkl']