## 1. Importing Libraries

In [850]:
import os
import matplotlib.pyplot as plt
import pickle
import json
from functools import lru_cache
import warnings
import catboost as cb
from geopy.distance import geodesic as GD
from geopy.geocoders import Nominatim
import numpy as np

import pandas as pd

import xgboost as xgb
from sklearn.linear_model import LinearRegression
import sklearn
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

## 2. Display Settings

In [851]:
# Show every column when a DataFrame is rendered.
pd.set_option("display.max_columns", None)
# Have scikit-learn transformers return pandas DataFrames instead of NumPy arrays.
sklearn.set_config(transform_output="pandas")
# NOTE(review): this silences every warning category for the whole notebook — consider a narrower filter.
warnings.filterwarnings("ignore")
# Nominatim geocoder client; it is not referenced again in the visible part of this notebook.
geolocator = Nominatim(user_agent="MyApp")

## 3. Read Datasets

In [852]:
# NOTE(review): machine-specific absolute path; every CSV read and write below is resolved against it.
file_dir = r"C:\Users\yashg\OneDrive\Desktop\flight-sagemaker\data"

In [853]:
# Lookup table that return_distance_df uses to map a "source_destination" route key to a distance.
# NOTE(review): machine-specific absolute path; the distance unit is not stated in this notebook.
# JSON text is UTF-8 by specification, so do not rely on the platform's default text encoding.
with open(r"C:\Users\yashg\OneDrive\Desktop\flight-sagemaker\dictionary_distance", "r", encoding="utf-8") as json_file:
    distance_dictionary = json.load(json_file)

In [854]:
# Load the raw training split; the bare name on the last line renders it as the cell output.
train = pd.read_csv(os.path.join(file_dir, "train.csv"))
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,price
0,Jet Airways,2019-05-27,Delhi,Cochin,20:55:00,12:35:00,940,1,12898
1,Jet Airways,2019-06-12,Kolkata,Banglore,18:55:00,16:20:00,1285,1,13044
2,Air India,2019-05-18,Delhi,Cochin,09:45:00,09:25:00,1420,2,10975
3,IndiGo,2019-06-03,Mumbai,Hyderabad,21:20:00,22:50:00,90,0,2227
4,Jet Airways,2019-04-01,Mumbai,Hyderabad,02:55:00,04:20:00,85,0,5678
...,...,...,...,...,...,...,...,...,...
6689,SpiceJet,2019-06-09,Kolkata,Banglore,11:35:00,18:50:00,435,1,8479
6690,Multiple carriers,2019-05-09,Delhi,Cochin,10:00:00,01:30:00,930,1,15078
6691,Air India,2019-05-18,Delhi,Cochin,12:00:00,07:40:00,1180,2,8603
6692,Air Asia,2019-05-18,Delhi,Cochin,07:55:00,13:25:00,330,1,8759


In [855]:
# Load the raw validation split and display it.
val = pd.read_csv(os.path.join(file_dir, "val.csv"))
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,price
0,Jet Airways,2019-05-27,Delhi,Cochin,09:00:00,19:00:00,600,1,10675
1,Jet Airways,2019-05-24,Kolkata,Banglore,18:55:00,10:05:00,910,1,8586
2,Jet Airways,2019-03-18,Banglore,Delhi,21:25:00,09:30:00,725,1,13555
3,SpiceJet,2019-06-27,Chennai,Kolkata,17:45:00,20:05:00,140,0,3543
4,Air Asia,2019-05-15,Kolkata,Banglore,07:35:00,19:25:00,710,1,5192
...,...,...,...,...,...,...,...,...,...
1669,Vistara,2019-05-06,Kolkata,Banglore,07:10:00,22:40:00,930,1,8452
1670,IndiGo,2019-04-03,Delhi,Cochin,21:05:00,00:20:00,195,0,5021
1671,Air India,2019-03-01,Banglore,Delhi,17:00:00,19:45:00,165,0,25913
1672,Air India,2019-06-18,Mumbai,Hyderabad,06:20:00,07:40:00,80,0,3100


In [856]:
# Load the raw test split and display it.
test = pd.read_csv(os.path.join(file_dir, "test.csv"))
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,price
0,Jet Airways,2019-03-06,Banglore,Delhi,08:00:00,08:15:00,1455,1,17996
1,SpiceJet,2019-06-06,Kolkata,Banglore,22:20:00,00:40:00,140,0,3873
2,IndiGo,2019-03-18,Kolkata,Banglore,05:30:00,08:20:00,170,0,4462
3,Jet Airways,2019-03-24,Mumbai,Hyderabad,15:50:00,17:20:00,90,0,2228
4,SpiceJet,2019-04-27,Banglore,Delhi,09:30:00,12:20:00,170,0,4991
...,...,...,...,...,...,...,...,...,...
2088,Jet Airways,2019-05-27,Delhi,Cochin,19:15:00,12:35:00,1040,1,12898
2089,Jet Airways,2019-05-27,Delhi,Cochin,02:15:00,19:00:00,1005,1,12898
2090,Jet Airways,2019-06-03,Delhi,Cochin,02:15:00,04:25:00,1570,1,11627
2091,Multiple carriers,2019-06-06,Delhi,Cochin,15:15:00,01:30:00,615,1,6795


## 4. Preprocessing Operations

In [857]:
# airline
# airline_transformer = Pipeline(steps = [
#                                     ("grouper", RareLabelEncoder(tol=  0.1, replace_with = "Other", n_categories=2)),
#                                     ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
    
# ])


# Airline: pool labels whose share is below 5% into "Other", replace each label
# with the mean of the target for that label, then apply a power transform.
airline_transformer = Pipeline(
    steps=[
        ("grouper", RareLabelEncoder(tol=0.05, replace_with="Other", n_categories=2)),
        ("mean_encoder", MeanEncoder()),
        ("scaler", PowerTransformer()),
    ]
)

# -------------------------------------------------------------------------------------------


# #doj
# feature_to_extract = ["month", "week", "day_of_week" ]

# doj_transformer = Pipeline(steps=[
# 	("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
# 	("scaler", MinMaxScaler())
# ])

# Date of journey: extract week number and day of week, then rescale with MinMaxScaler.
feature_to_extract = ["week", "day_of_week"]

doj_transformer = Pipeline(
    steps=[
        (
            "dt",
            DatetimeFeatures(
                features_to_extract=feature_to_extract,
                yearfirst=True,
                format="mixed",
            ),
        ),
        ("scaler", MinMaxScaler()),
    ]
)



# -------------------------------------------------------------------------------------------

# source & destination

def source_destination(train):
    """Build a single lower-cased route key from the source and destination columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with string columns "source" and "destination".

    Returns
    -------
    pandas.DataFrame
        One column, "source_destination", holding "<source>_<destination>" in
        lower case, on the same index as the input. The input is not modified.
    """
    # Work on derived Series instead of assigning into a column slice, which
    # avoids pandas' chained-assignment (SettingWithCopy) path entirely.
    source = train["source"].str.lower()
    destination = train["destination"].str.lower()
    route = source.astype(str) + "_" + destination
    return route.to_frame(name="source_destination")


def return_distance_df(train):
    """Look up the distance for every route key in ``train["source_destination"]``.

    Each key is mapped through the module-level ``distance_dictionary``; keys
    missing from that dictionary come back as NaN. Returns a one-column
    DataFrame named "distance_between_cities" on the input's index.
    """
    routes = train["source_destination"]
    return routes.map(distance_dictionary).to_frame(name="distance_between_cities")

    
# Route frequency: build the "source_destination" key, pool routes whose share is
# below 10% into "Other", encode each route with feature_engine's
# CountFrequencyEncoder, then apply a power transform.
sd_count_transformer = Pipeline(
    steps=[
        ("source_destination", FunctionTransformer(source_destination)),
        ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
        ("count_encoder", CountFrequencyEncoder()),
        ("scaler", PowerTransformer()),
    ]
)

# Route distance: build the "source_destination" key, look its distance up in
# distance_dictionary, then standardize the result.
location_pipe2 = Pipeline(
    steps=[
        ("source_destination", FunctionTransformer(source_destination)),
        ("encoder", FunctionTransformer(func=return_distance_df)),
        ("StandardScaler", StandardScaler()),
    ]
)


# Concatenate the two location features (route frequency and route distance) side by side.
location_transformer = FeatureUnion(
    transformer_list=[
        ("part1", sd_count_transformer),
        ("part2", location_pipe2),
    ]
)

# -------------------------------------------------------------------------------------------



# dep_time & arrival_time

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    """Bucket every time column of ``X`` into a part-of-day label.

    Each column is parsed with ``pd.to_datetime`` and reduced to its hour.
    Hours in [morning, noon) become "morning", [noon, eve) "afternoon",
    [eve, night) "evening"; everything else is "night".

    Returns a DataFrame on ``X``'s index with one "<col>_part_of_day" column
    per input column; the original columns are not included.
    """
    labels = ["morning", "afternoon", "evening"]
    windows = [(morning, noon), (noon, eve), (eve, night)]

    buckets = {}
    for col in X.columns.to_list():
        hours = pd.to_datetime(X[col]).dt.hour
        # Left-inclusive windows: the lower bound belongs to the bucket, the upper one does not.
        in_window = [hours.between(start, stop, inclusive="left") for start, stop in windows]
        buckets[f"{col}_part_of_day"] = np.select(in_window, labels, default="night")

    return pd.DataFrame(buckets, index=X.index)


def dep_arrival(train):
    """Combine the departure and arrival part-of-day labels into one key.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with string columns "dep_time_part_of_day" and
        "arrival_time_part_of_day".

    Returns
    -------
    pandas.DataFrame
        One column, "dept_arrival", holding "<departure>_<arrival>" in lower
        case, on the same index as the input. The input is not modified.
    """
    # Derive new Series rather than assigning into a column slice, which
    # avoids pandas' chained-assignment (SettingWithCopy) path entirely.
    departure = train["dep_time_part_of_day"].str.lower()
    arrival = train["arrival_time_part_of_day"].str.lower()
    combined = departure.astype(str) + "_" + arrival
    return combined.to_frame(name="dept_arrival")


# Departure/arrival time: bucket both times into parts of the day, join the two
# labels into one key, encode it with CountFrequencyEncoder, then power-transform.
time_transformer = Pipeline(
    steps=[
        ("part", FunctionTransformer(func=part_of_day)),
        ("label_encoder", FunctionTransformer(func=dep_arrival)),
        ("count_encoder", CountFrequencyEncoder()),
        ("scaler", PowerTransformer()),
    ]
)


# --------------------------------------------------------------------------------------      

# duration    

# Standardize the numeric columns "duration" and "total_stops".
stand = Pipeline(steps=[("scaling", StandardScaler())])


# ----------------------------------------------------------------------------------


# Column transformer: route each raw column (or column pair) to its dedicated pipeline.
column_transformer = ColumnTransformer(
    transformers=[
        ("air", airline_transformer, ["airline"]),
        ("doj", doj_transformer, ["date_of_journey"]),
        ("location", location_transformer, ["source", "destination"]),
        ("time", time_transformer, ["dep_time", "arrival_time"]),
        ("stand", stand, ["duration", "total_stops"]),
    ],
    # Any column not listed above is passed through unchanged.
    remainder="passthrough",
)


# Feature selector: score every transformed feature on its own with a small
# random forest (R²) and keep those that clear the 0.1 threshold.
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
    estimator=estimator,
    scoring="r2",
    threshold=0.1,
)

# Full preprocessor: column-wise transformations followed by feature selection.
preprocessor = Pipeline(
    steps=[
        ("ct", column_transformer),
        ("selector", selector),
    ]
)

In [858]:
# Fit the preprocessing pipeline on the training split only; "price" is passed
# separately as the target.
preprocessor.fit(
    train.drop(columns="price"),
    train.price.copy()
)

In [859]:
preprocessor.transform(train.drop(columns="price"))

Unnamed: 0,air__airline,doj__date_of_journey_week,location__source_destination,location__distance_between_cities,time__dept_arrival,stand__duration,stand__total_stops
0,1.027531,0.764706,1.071620,0.877331,-1.987895,0.599507,0.284658
1,1.027531,0.882353,-0.253821,-0.465397,-0.818215,1.283545,0.284658
2,0.095270,0.647059,1.071620,0.877331,1.541732,1.551211,1.804199
3,-1.243225,0.823529,-1.771796,-2.959122,-0.095486,-1.085803,-1.234882
4,1.027531,0.294118,-1.771796,-2.959122,-0.526121,-1.095717,-1.234882
...,...,...,...,...,...,...,...
6689,-1.573432,0.823529,-0.253821,-0.465397,0.743214,-0.401765,0.284658
6690,0.620932,0.588235,1.071620,0.877331,-0.049347,0.579680,0.284658
6691,0.095270,0.647059,1.071620,0.877331,-1.764013,1.075359,1.804199
6692,-0.942493,0.647059,1.071620,0.877331,0.359861,-0.609951,0.284658


## 4. Preprocess & Save data

In [860]:
def get_file_name(name):
    """Return the path of ``<name>.csv`` inside the notebook-level ``file_dir`` directory."""
    csv_name = "{}.csv".format(name)
    return os.path.join(file_dir, csv_name)

In [861]:
def export_data(data, name, pre):
    """Preprocess one data split and write it to ``<name>.csv``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw split containing the feature columns and the "price" target.
    name : str
        Base name of the output file (resolved by ``get_file_name``).
    pre : fitted transformer
        Object exposing ``transform``; it is applied to the feature columns.

    The written CSV has "price" as its first column followed by the
    transformed features; the index is not written.
    """
    # Split into features and target.
    features = data.drop(columns="price")
    target = data.price.copy()

    # Transform with the preprocessor supplied by the caller, not a notebook global.
    features_pre = pre.transform(features)

    # Join on the index so each target value stays next to its feature row.
    file_name = get_file_name(name)
    target.to_frame().join(features_pre).to_csv(file_name, index=False)

In [862]:
# Transform all three splits with the preprocessor fitted on train and write
# the results as CSV files into file_dir.
export_data(train, "train_pre", preprocessor)
export_data(val, "val_pre", preprocessor)
export_data(test, "test_pre", preprocessor)

In [863]:
train.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,price
0,Jet Airways,2019-05-27,Delhi,Cochin,20:55:00,12:35:00,940,1,12898
1,Jet Airways,2019-06-12,Kolkata,Banglore,18:55:00,16:20:00,1285,1,13044
2,Air India,2019-05-18,Delhi,Cochin,09:45:00,09:25:00,1420,2,10975
3,IndiGo,2019-06-03,Mumbai,Hyderabad,21:20:00,22:50:00,90,0,2227
4,Jet Airways,2019-04-01,Mumbai,Hyderabad,02:55:00,04:20:00,85,0,5678


## 5. Model and Hyperparameter Tuning Set-up

In [864]:
# Reload the preprocessed splits written by export_data.
train_pre = pd.read_csv(os.path.join(file_dir, "train_pre.csv"))
val_pre = pd.read_csv(os.path.join(file_dir, "val_pre.csv"))
test_pre = pd.read_csv(os.path.join(file_dir, "test_pre.csv"))

In [865]:
train_pre.head()

Unnamed: 0,price,air__airline,doj__date_of_journey_week,location__source_destination,location__distance_between_cities,time__dept_arrival,stand__duration,stand__total_stops
0,12898,1.027531,0.764706,1.07162,0.877331,-1.987895,0.599507,0.284658
1,13044,1.027531,0.882353,-0.253821,-0.465397,-0.818215,1.283545,0.284658
2,10975,0.09527,0.647059,1.07162,0.877331,1.541732,1.551211,1.804199
3,2227,-1.243225,0.823529,-1.771796,-2.959122,-0.095486,-1.085803,-1.234882
4,5678,1.027531,0.294118,-1.771796,-2.959122,-0.526121,-1.095717,-1.234882


In [866]:
# Separate features from the target. y_train_model is the same Series as
# y_train_original: no target transform is applied before fitting.
X_train = train_pre.drop("price", axis=1)
y_train_original = train_pre.price
y_train_model = y_train_original

In [867]:
# Same split for the validation data; y_val_model is just another name for y_val_original.
X_val= val_pre.drop("price", axis=1)
y_val_original = val_pre.price
y_val_model = y_val_original

In [868]:
# Calculating Mean Absolute Percentage Error
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_true``, in percent.

    Both inputs are converted to NumPy arrays. The error of each element is
    taken relative to the true value, so zeros in ``y_true`` divide by zero.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100


In [869]:
def results(y_train, y_train_pred, model_name, split="Train"):
    """Print RMSE, MAPE (rounded to a whole percent) and R² for one set of predictions.

    Parameters
    ----------
    y_train : array-like
        True target values of the split being evaluated.
    y_train_pred : array-like
        Predictions for the same rows.
    model_name : str
        Name shown in the printed header.
    split : str, default "Train"
        Label of the evaluated split shown in the header (for example
        "Validation" or "Test"). The default keeps the header unchanged for
        callers that do not pass it.
    """
    # Imported locally so the function does not depend on a later notebook cell
    # having been executed first.
    from sklearn.metrics import mean_squared_error

    print(f"{split} Results for {model_name}:")
    print("Root Mean Squared Error: ", np.sqrt(mean_squared_error(y_train, y_train_pred)))
    print("Mean Absolute % Error: ", round(mean_absolute_percentage_error(y_train, y_train_pred)))
    print("R-Squared: ", r2_score(y_train, y_train_pred))
    

In [870]:
X_train.shape

(6694, 7)

## Linear Regression

In [871]:
from sklearn.metrics import mean_squared_error, r2_score

In [872]:
# Baseline model: linear regression with default settings.
lr_model = LinearRegression()

In [873]:
# Fit on the preprocessed training features against the untransformed price.
lr_model.fit(X_train,y_train_model)

In [874]:
# Predictions on the training and validation data.
# (A variant that applied np.exp to the predictions is kept disabled below.)
# y_train_pred = np.exp(lr_model.predict(X_train))
# y_val_pred = np.exp(lr_model.predict(X_val))
y_train_pred = lr_model.predict(X_train)
y_val_pred = lr_model.predict(X_val)

In [875]:
results(y_train_original,y_train_pred, "Linear Regression Model")

Train Results for Linear Regression Model:
Root Mean Squared Error:  3225.1931563646863
Mean Absolute % Error:  25
R-Squared:  0.5230733469219044


In [876]:
# Validation-set metrics; the printed header still reads "Train Results" for this call.
results(y_val_original,y_val_pred, "Linear Regression Model")

Train Results for Linear Regression Model:
Root Mean Squared Error:  3041.8915592454473
Mean Absolute % Error:  26
R-Squared:  0.5484255839084176


## Ridge Regression

In [877]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [878]:
# Tune Ridge's regularisation strength (alpha) with 5-fold cross-validation,
# scored by negated mean absolute error.
params = {"alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}
ridge_regressor = GridSearchCV(
    estimator=Ridge(),
    param_grid=params,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
)
ridge_regressor.fit(X_train, y_train_model)

In [879]:
# Predict on train and validation with the fitted Ridge grid search.
y_train_pred = ridge_regressor.predict(X_train)
y_val_pred = ridge_regressor.predict(X_val)

In [880]:
results(y_train_original,y_train_pred, "Ridge regression Model")

Train Results for Ridge regression Model:
Root Mean Squared Error:  3227.838570895249
Mean Absolute % Error:  25
R-Squared:  0.5222906427123044


In [881]:
# Validation-set metrics; the printed header still reads "Train Results" for this call.
results(y_val_original,y_val_pred, "Ridge Regression Model")

Train Results for Ridge Regression Model:
Root Mean Squared Error:  3038.588377566659
Mean Absolute % Error:  25
R-Squared:  0.5494057782572828


## Lasso Regression

In [882]:
from sklearn.linear_model import Lasso

In [883]:
# Tune Lasso's regularisation strength (alpha) with 15-fold cross-validation,
# scored by negated mean absolute error.
params = {"alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}
lasso_regressor = GridSearchCV(
    estimator=Lasso(),
    param_grid=params,
    cv=15,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
)
lasso_regressor.fit(X_train, y_train_model)

In [884]:
# Predict on train and validation with the fitted Lasso grid search.
y_train_pred = lasso_regressor.predict(X_train)
y_val_pred = lasso_regressor.predict(X_val)

In [885]:
results(y_train_original,y_train_pred, "Lasso regression Model")

Train Results for Lasso regression Model:
Root Mean Squared Error:  3258.4545991076698
Mean Absolute % Error:  25
R-Squared:  0.5131855251397582


In [886]:
# Validation-set metrics; the printed header still reads "Train Results" for this call.
results(y_val_original,y_val_pred, "Lasso Regression Model")

Train Results for Lasso Regression Model:
Root Mean Squared Error:  3061.85697772087
Mean Absolute % Error:  25
R-Squared:  0.5424783239236667


## Decision Tree Regressor

In [887]:
from sklearn.tree import DecisionTreeRegressor

In [888]:
# Tune the decision tree's max_depth over 3..29 with 10-fold cross-validation
# (no `scoring` is given, so the estimator's own score method is used).
depth = list(range(3, 30))
param_grid = dict(max_depth=depth)
# Fixed seed, matching the other seeded estimators in this notebook, so that
# re-running the cell reproduces the same trees and metrics.
tree = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid, cv=10)
tree.fit(X_train, y_train_model)

In [889]:
# Predicting train and validation results
y_train_pred = tree.predict(X_train)
y_val_pred = tree.predict(X_val)

In [890]:
results(y_train_original,y_train_pred, "Decision trees Regressor Model")

Train Results for Decision trees Regressor Model:
Root Mean Squared Error:  2063.7673288551955
Mean Absolute % Error:  15
R-Squared:  0.804718276052443


In [891]:
# Validation-set metrics; the printed header still reads "Train Results" for this call.
results(y_val_original,y_val_pred, "Decision trees Regressor Model")

Train Results for Decision trees Regressor Model:
Root Mean Squared Error:  2327.7357830515593
Mean Absolute % Error:  17
R-Squared:  0.7355709047128087


## Random Forest Regressor

In [892]:
from sklearn.ensemble import RandomForestClassifier

In [893]:
# Grid over tree depth (3, 6, ..., 24) and forest size.
depth = list(range(3, 25, 3))
n_estimators = [50, 100, 150, 200]
param_grid = {"max_depth": depth, "n_estimators": n_estimators}

# 10-fold grid search over RandomForestRegressor. The seed is fixed, matching
# the other seeded estimators in this notebook, so re-running the cell
# reproduces the same forests and metrics.
rf_tree = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=10, n_jobs=-1)

# Fit the model on training data
rf_tree.fit(X_train, y_train_model)

In [894]:
# Predicting train and validation results
y_train_pred =rf_tree.predict(X_train)
y_val_pred = rf_tree.predict(X_val)

In [895]:
results(y_train_original,y_train_pred, "Random Forest trees Regressor Model")

Train Results for Random Forest trees Regressor Model:
Root Mean Squared Error:  1553.605615956538
Mean Absolute % Error:  11
R-Squared:  0.8893320922456934


In [896]:
# Validation-set metrics; the printed header still reads "Train Results" for this call.
results(y_val_original,y_val_pred, "Random Forest trees Regressor Model")

Train Results for Random Forest trees Regressor Model:
Root Mean Squared Error:  2320.056632270169
Mean Absolute % Error:  15
R-Squared:  0.7373127187712583


## XGBoost Regressor

In [897]:
# XGBoost regressor with a squared-error objective and a fixed seed.
xgboost_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# Hyperparameter grid explored by the search below.
param_grid = {
    "learning_rate": [0.01, 0.1, 0.2],  # learning rate (eta)
    "max_depth": [3, 5, 7],             # maximum depth of the trees
    "n_estimators": [50, 100, 200],     # number of boosting rounds
    "subsample": [0.8, 1.0],            # fraction of samples used per boosting round
}

# Exhaustive search with 5-fold cross-validation, scored by negated MSE.
grid_search = GridSearchCV(
    estimator=xgboost_model,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [898]:
# Run the grid search on the training data.
grid_search.fit(X_train, y_train_model)

# Report the best parameters; the score is negated MSE, so values closer to zero are better.
print("Best parameters found: ", grid_search.best_params_)
print("Best score (neg MSE): ", grid_search.best_score_)


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters found:  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}
Best score (neg MSE):  -5708773.771678989


In [899]:
# Keep the best XGBoost estimator found by the grid search; it is reused for test evaluation and saving.
best_model = grid_search.best_estimator_

In [900]:
# Predicting train and validation results
y_train_pred =best_model.predict(X_train)
y_val_pred = best_model.predict(X_val)

In [901]:
results(y_train_original,y_train_pred, "XGboost Regressor Model")

Train Results for XGboost Regressor Model:
Root Mean Squared Error:  1833.065667449916
Mean Absolute % Error:  14
R-Squared:  0.8459377884864807


In [902]:
# Validation-set metrics; the printed header still reads "Train Results" for this call.
results(y_val_original,y_val_pred, "XGboost Regressor Model")

Train Results for XGboost Regressor Model:
Root Mean Squared Error:  2165.178614060815
Mean Absolute % Error:  15
R-Squared:  0.7712140679359436


## CatBoost Regressor

In [903]:
# catboost_model = cb.CatBoostRegressor(verbose=0, random_state=42)

# # Step 4: Define the parameter grid for GridSearchCV
# param_grid = {
#     'depth': [4, 6, 8],             # Tree depth
#     'learning_rate': [0.01, 0.1, 0.2],  # Learning rate (eta)
#     'iterations': [100, 200, 500],   # Number of boosting iterations
#     'l2_leaf_reg': [1, 3, 5]         # L2 regularization coefficient
# }

# # Step 5: Set up GridSearchCV
# grid_search = GridSearchCV(estimator=catboost_model,
#                            param_grid=param_grid,
#                            scoring='neg_mean_squared_error',
#                            cv=3,  # 3-fold cross-validation
#                            verbose=1, n_jobs=-1)

In [904]:
# # Step 6: Fit the model using the grid search
# grid_search.fit(X_train, y_train_model)

# # Step 6: Print the best parameters and best score
# print("Best parameters found: ", grid_search.best_params_)
# print("Best score (neg MSE): ", grid_search.best_score_)


In [905]:
# best_model = grid_search.best_estimator_

In [906]:
# # Predicting train and test results
# y_train_pred =best_model.predict(X_train)
# y_val_pred = best_model.predict(X_val)

In [907]:
# results(y_train_original,y_train_pred, "XGboost Regressor Model")

In [908]:
# results(y_val_original,y_val_pred, "XGboost Regressor Model")

## Test Set Evaluation

In [909]:
# Separate features from the target for the held-out test split;
# y_test_model is just another name for y_test_original.
X_test= test_pre.drop("price", axis=1)
y_test_original = test_pre.price
y_test_model = y_test_original

In [910]:
# best_model is still the tuned XGBoost estimator: the CatBoost cells that would rebind it are commented out.
y_test_pred = best_model.predict(X_test)

In [911]:
# Test-set metrics; the printed header still reads "Train Results" for this call.
results(y_test_original,y_test_pred, "XGboost Regressor Model")

Train Results for XGboost Regressor Model:
Root Mean Squared Error:  1932.381572753738
Mean Absolute % Error:  16
R-Squared:  0.8194934129714966


## Saving Best model

In [912]:
# NOTE(review): machine-specific absolute path.
save_path = r"C:\Users\yashg\OneDrive\Desktop\flight-sagemaker\xgboost_model.pkl"

# Serialize the tuned model to disk with pickle.
with open(save_path, "wb") as model_file:
    pickle.dump(best_model, model_file)

In [913]:
# Reload the model from the path it was saved to, reusing save_path so the
# save and load locations cannot drift apart.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open(save_path, "rb") as f:
    model = pickle.load(f)