## 1.Import Packages

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import r2_score
from sklearn.model_selection import learning_curve

## 2.Display Settings

In [27]:
pd.set_option("display.max_columns", None)
sklearn.set_config(transform_output="pandas")
# sklearn.set_config(transform_output="default")

## 3. Data Gathering

In [28]:
data = pd.read_csv('data.csv')
# data = data.iloc[:2500]

In [29]:
data = data.drop(data.columns[0], axis=1)
data = data.drop(columns=["flight","stops"])
data

Unnamed: 0,airline,source_city,departure_time,arrival_time,destination_city,category,duration,days_left,price
0,SpiceJet,Delhi,Evening,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,Delhi,Early_Morning,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,Delhi,Early_Morning,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,Delhi,Morning,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,Delhi,Morning,Morning,Mumbai,Economy,2.33,1,5955
...,...,...,...,...,...,...,...,...,...
300148,Vistara,Chennai,Morning,Evening,Hyderabad,Business,10.08,49,69265
300149,Vistara,Chennai,Afternoon,Night,Hyderabad,Business,10.42,49,77105
300150,Vistara,Chennai,Early_Morning,Night,Hyderabad,Business,13.83,49,79099
300151,Vistara,Chennai,Early_Morning,Evening,Hyderabad,Business,10.00,49,81585


In [30]:
# columns_to_rename = {
#     'class': 'category'
# }
# # Renaming the columns
# data.rename(columns=columns_to_rename, inplace=True)
# data

In [31]:
X_train,X_test,y_train,y_test = train_test_split(
    data.drop(columns=["price"]),
    data["price"],
    test_size=0.2,
    random_state=42
)

In [32]:
X_train

Unnamed: 0,airline,source_city,departure_time,arrival_time,destination_city,category,duration,days_left
148417,Air_India,Kolkata,Afternoon,Morning,Chennai,Economy,19.42,6
36879,Indigo,Delhi,Afternoon,Night,Chennai,Economy,7.00,13
274531,Air_India,Kolkata,Afternoon,Afternoon,Chennai,Business,21.17,44
166397,Vistara,Hyderabad,Night,Early_Morning,Bangalore,Economy,10.25,11
272722,Vistara,Kolkata,Night,Night,Chennai,Business,26.50,5
...,...,...,...,...,...,...,...,...
119879,Vistara,Kolkata,Evening,Afternoon,Delhi,Economy,20.50,2
259178,Vistara,Bangalore,Night,Night,Chennai,Business,25.42,7
131932,Air_India,Kolkata,Night,Morning,Mumbai,Economy,13.67,29
146867,GO_FIRST,Kolkata,Afternoon,Night,Hyderabad,Economy,8.33,39


In [33]:
y_test

27131      7366
266857    64831
141228     6195
288329    60160
97334      6578
          ...  
5234       5026
5591       3001
168314     6734
175191     5082
287693    66465
Name: price, Length: 60031, dtype: int64

## 4. Data Preprocessing

In [34]:
# data.info()
X_train

Unnamed: 0,airline,source_city,departure_time,arrival_time,destination_city,category,duration,days_left
148417,Air_India,Kolkata,Afternoon,Morning,Chennai,Economy,19.42,6
36879,Indigo,Delhi,Afternoon,Night,Chennai,Economy,7.00,13
274531,Air_India,Kolkata,Afternoon,Afternoon,Chennai,Business,21.17,44
166397,Vistara,Hyderabad,Night,Early_Morning,Bangalore,Economy,10.25,11
272722,Vistara,Kolkata,Night,Night,Chennai,Business,26.50,5
...,...,...,...,...,...,...,...,...
119879,Vistara,Kolkata,Evening,Afternoon,Delhi,Economy,20.50,2
259178,Vistara,Bangalore,Night,Night,Chennai,Business,25.42,7
131932,Air_India,Kolkata,Night,Morning,Mumbai,Economy,13.67,29
146867,GO_FIRST,Kolkata,Afternoon,Night,Hyderabad,Economy,8.33,39


In [35]:
cat_cols = ["airline","source_city","departure_time","arrival_time","destination_city","category"]
num_cols = ["duration","days_left"]

In [36]:
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

In [37]:
preprocessor = ColumnTransformer(transformers=[
    ("cat", cat_transformer, cat_cols),
    ("num", num_transformer, num_cols)
])

In [38]:
preprocessor.fit_transform(X_train)

Unnamed: 0,cat__airline_AirAsia,cat__airline_Air_India,cat__airline_GO_FIRST,cat__airline_Indigo,cat__airline_SpiceJet,cat__airline_Vistara,cat__source_city_Bangalore,cat__source_city_Chennai,cat__source_city_Delhi,cat__source_city_Hyderabad,cat__source_city_Kolkata,cat__source_city_Mumbai,cat__departure_time_Afternoon,cat__departure_time_Early_Morning,cat__departure_time_Evening,cat__departure_time_Late_Night,cat__departure_time_Morning,cat__departure_time_Night,cat__arrival_time_Afternoon,cat__arrival_time_Early_Morning,cat__arrival_time_Evening,cat__arrival_time_Late_Night,cat__arrival_time_Morning,cat__arrival_time_Night,cat__destination_city_Bangalore,cat__destination_city_Chennai,cat__destination_city_Delhi,cat__destination_city_Hyderabad,cat__destination_city_Kolkata,cat__destination_city_Mumbai,cat__category_Business,cat__category_Economy,num__duration,num__days_left
148417,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.001730,-1.474883
36879,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.724634,-0.958973
274531,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.244978,1.325771
166397,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.272888,-1.106376
272722,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.985842,-1.548584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.151849,-1.769688
259178,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.835723,-1.401181
131932,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.202487,0.220250
146867,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.539766,0.957264


## 5. Model Selection

In [39]:
algorithms = {
    "Linear Regression": LinearRegression(),
    "Support Vector Machine": SVR(),
    "Random Forest": RandomForestRegressor(n_estimators=10),
    "XGBoost": XGBRegressor(n_estimators=10),
}

In [40]:
X_data=data.drop(columns=['price'])
y_data=data['price']
print(X_data.shape, y_data.shape)

(300153, 8) (300153,)


In [41]:
def plot_curves(size, mean_scores, std_scores, label, ax):
    ax.plot(
        size,
        mean_scores,
        marker='o',
        label= label
    )
    ax.fill_between(
        x=size,
        y1= mean_scores - std_scores,
        y2=mean_scores + std_scores,
        alpha=0.4
    )

In [42]:
def plot_learning_curves(name, algorithm, figsize=(12, 4)):
	model = Pipeline(steps=[
		("pre", preprocessor),
		("alg", algorithm)
	])

	train_sizes, train_scores, test_scores = learning_curve(
		estimator=model,
		X=X_data,
		y=y_data,
		cv=3,
		scoring="r2",
		n_jobs=-1,
		random_state=42
	)
	
	mean_train_scores = np.mean(train_scores, axis=1)
	std_train_scores = np.std(train_scores, axis=1)
	train_score = f"{mean_train_scores[-1]:.2f} +/- {std_train_scores[-1]:.2f}"

	mean_test_scores = np.mean(test_scores, axis=1)
	std_test_scores = np.std(test_scores, axis=1)
	test_score = f"{mean_test_scores[-1]:.2f} +/- {std_test_scores[-1]:.2f}"

	fig, ax = plt.subplots(figsize=figsize)

	# training curve
	plot_curves(
		train_sizes,
		mean_train_scores,
		std_train_scores,
		f"Train ({train_score})",
		ax
	)

	# test curve
	plot_curves(
		train_sizes,
		mean_test_scores,
		std_test_scores,
		f"Test ({test_score})",
		ax
	)

	ax.set(xlabel="Training Set Size", ylabel="R-square", title=name)

	ax.legend(loc="lower right")

	plt.show()

In [143]:
for name, algorithm in algorithms.items():
    plot_learning_curves(name, algorithm)

## 6. Model Training

In [44]:
model = Pipeline(steps=[
    ("pre", preprocessor),
    ("rfr", RandomForestRegressor(n_estimators=10))
])
model.fit(X_data, y_data)

## 7. Model Evaluation

In [45]:
def evaluate_model(X, y):
	y_pred = model.predict(X)
	return r2_score(y, y_pred)

In [46]:
print(f"R2 score on Training data is = {evaluate_model(X_data, y_data)}")
print(f"R2 score on Test data is = {evaluate_model(X_test, y_test)}")

R2 score on Training data is = 0.9965747363020203
R2 score on Test data is = 0.9963997466633607


## 8. Model Persistence

In [47]:
import pickle
pickle.dump(model,open('model.pkl','wb'))

In [48]:
saved_model = pickle.load(open('model.pkl','rb'))
saved_model

In [49]:
saved_model.score(X_test, y_test)

0.9963997466633607