In [1]:
import pandas as pd
import numpy as np
import sklearn
import joblib
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer

from sklearn.pipeline import (Pipeline,
                              FeatureUnion)

from feature_engine.encoding import (RareLabelEncoder,
                                     MeanEncoder,
                                     CountFrequencyEncoder
                                    )

from sklearn.preprocessing import (OneHotEncoder,
                                   StandardScaler,
                                   MinMaxScaler,
                                   PowerTransformer,
                                   FunctionTransformer
                                  )

from feature_engine.datetime import DatetimeFeatures
from feature_engine.outliers import Winsorizer
from feature_engine.selection import SelectBySingleFeaturePerformance

from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestRegressor

# Use scikit-learn's default transform() output container (no forced pandas output).
sklearn.set_config(transform_output='default')
# Do not truncate columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

## 2. IMPORTING DATASETS

In [2]:
# Load the three pre-split datasets; the paths are relative to the notebook's working directory.
train = pd.read_csv("../Data/Train.csv")
test = pd.read_csv("../Data/Test.csv")
val = pd.read_csv("../Data/Validation.csv")

In [3]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,class_,price
0,Indigo,2019-05-09,Delhi,Cochin,06:50:00,16:10:00,560,1.0,No Info,Economy,6442
1,Air India,2019-04-03,Mumbai,Hyderabad,06:20:00,07:40:00,80,0.0,No Info,Economy,3100
2,Spicejet,2019-05-15,Kolkata,Banglore,11:15:00,18:30:00,435,1.0,No Info,Economy,8844
3,Jet Airways,2019-06-09,Delhi,Cochin,19:45:00,12:35:00,1010,1.0,In-flight meal not included,Economy,10577
4,Jet Airways,2019-06-18,Delhi,Cochin,09:50:00,13:05:00,195,0.0,No Info,Economy,9564


In [4]:
# Preview the first rows of the test split.
test.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,class_,price
0,Spicejet,2019-03-09,Chennai,Kolkata,08:20:00,10:35:00,135,0.0,No Info,Economy,6300
1,Jet Airways,2019-05-27,Delhi,Cochin,13:25:00,04:25:00,900,2.0,No Info,Economy,16704
2,Multiple Carriers,2019-05-09,Delhi,Cochin,11:30:00,01:30:00,840,1.0,No Info,Economy,15078
3,Indigo,2019-06-18,Kolkata,Banglore,21:25:00,00:05:00,160,0.0,No Info,Economy,4804
4,Multiple Carriers,2019-05-21,Delhi,Cochin,14:00:00,21:00:00,420,1.0,No Info,Economy,14067


In [5]:
# Preview the first rows of the validation split.
val.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,class_,price
0,Spicejet,2019-06-01,Banglore,Delhi,05:55:00,08:35:00,160,0.0,No Info,Economy,3625
1,Jet Airways,2019-06-21,Banglore,Delhi,17:45:00,20:45:00,180,0.0,In-flight meal not included,Economy,7754
2,Jet Airways,2019-03-12,Banglore,New Delhi,05:45:00,20:20:00,875,1.0,No Info,Economy,13817
3,Jet Airways,2019-06-12,Mumbai,Hyderabad,19:35:00,21:05:00,90,0.0,No Info,Economy,5678
4,Jet Airways,2019-06-27,Delhi,Cochin,15:00:00,12:35:00,1295,1.0,In-flight meal not included,Economy,10262


In [6]:
# Parse the date/time columns of the training frame; errors='coerce' turns unparseable values into NaT.
# NOTE(review): only `train` is converted here; `test` and `val` keep these columns exactly as read from CSV.
train['date_of_journey'] = pd.to_datetime(train['date_of_journey'], errors='coerce')
train['dep_time'] = pd.to_datetime(train['dep_time'], errors='coerce')
train['arrival_time'] = pd.to_datetime(train['arrival_time'], errors='coerce')


## 3. PREPROCESSING OPERATIONS

In [7]:
# Column dtypes and non-null counts for the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   airline          6695 non-null   object        
 1   date_of_journey  6695 non-null   datetime64[ns]
 2   source           6695 non-null   object        
 3   destination      6695 non-null   object        
 4   dep_time         6695 non-null   datetime64[ns]
 5   arrival_time     6695 non-null   datetime64[ns]
 6   duration         6695 non-null   int64         
 7   total_stops      6695 non-null   float64       
 8   additional_info  6695 non-null   object        
 9   class_           6695 non-null   object        
 10  price            6695 non-null   int64         
dtypes: datetime64[ns](3), float64(1), int64(2), object(5)
memory usage: 575.5+ KB


In [8]:
# Count missing values per column of the training frame.
train.isnull().sum()

airline            0
date_of_journey    0
source             0
destination        0
dep_time           0
arrival_time       0
duration           0
total_stops        0
additional_info    0
class_             0
price              0
dtype: int64

In [9]:
# AIRLINE PREPROCESSING
# Fill gaps with the most frequent airline, fold infrequent airlines into
# "Other", then one-hot encode (unknown categories are ignored at transform time).
Airline_Transformer = Pipeline(steps=[
    ("Imputer", SimpleImputer(strategy="most_frequent")),
    ("Grouper", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("OneHotEncoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# DATE OF JOURNEY PREPROCESSING
# Expand the journey date into calendar features, then rescale them with MinMaxScaler.
DOJ_Transformer = Pipeline(steps=[
    (
        "DateTime",
        DatetimeFeatures(
            features_to_extract=['month', 'week', 'day_of_week', 'day_of_month'],
            yearfirst=True,
            format='mixed',
        ),
    ),
    ("Scalar", MinMaxScaler()),
])

# SOURCE AND DESTINATION PREPROCESSING
# Group infrequent cities into "Other", mean-encode against the target,
# then apply PowerTransformer with its default settings.
SourceDestination_Pipe = Pipeline(steps=[
    ("Grouper", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("MeanEncoder", MeanEncoder()),
    ("Scalar", PowerTransformer()),
])

def is_north(X):
    """Replace every column of X with a 0/1 flag column named '<col>_is_north'.

    A value is flagged 1 when it appears in the hard-coded `north_cities`
    list and 0 otherwise. The original columns are dropped, so the result
    contains only the flag columns (one per input column, same index as X).
    """
    north_cities = ['Delhi', 'New Delhi', 'Kolkata']
    original_columns = X.columns.to_list()

    flags = {}
    for col in original_columns:
        flags[f"{col}_is_north"] = X.loc[:, col].isin(north_cities).astype(int)

    return X.assign(**flags).drop(columns=original_columns)

# Concatenate the encoded source/destination features with the is_north flags.
SourceDestination_Transformer = FeatureUnion(
    transformer_list=[
        ("Part-1", SourceDestination_Pipe),
        ("Part-2", FunctionTransformer(func=is_north)),
    ]
)


# ARRIVAL TIME DEPARTURE TIME PREPROCESSING
# Numeric view of the clock time: hour and minute, rescaled with MinMaxScaler.
time_pipe_1 = Pipeline(
    steps=[
        ("DT", DatetimeFeatures(features_to_extract=['hour', 'minute'])),
        ("Scalar", MinMaxScaler()),
    ]
)

def part_of_day(X, morning=4, noon=12, evening=16, night=20):
    """Bucket each datetime-like column of X into a part-of-day label.

    Every column is parsed with pd.to_datetime and reduced to its hour, then
    labelled using left-inclusive ranges:
        [morning, noon)   -> 'morning'
        [noon, evening)   -> 'afternoon'
        [evening, night)  -> 'evening'
        anything else     -> 'night'

    Returns a frame holding only the '<col>_part_of_day' columns; the input
    columns are dropped.
    """
    source_cols = X.columns.to_list()

    # Overwrite each column with its hour-of-day.
    hours = X.assign(**{
        col: pd.to_datetime(X.loc[:, col]).dt.hour
        for col in source_cols
    })

    labels = {}
    for col in source_cols:
        hour = hours.loc[:, col]
        conditions = [
            hour.between(morning, noon, inclusive='left'),
            hour.between(noon, evening, inclusive='left'),
            hour.between(evening, night, inclusive='left'),
        ]
        labels[f"{col}_part_of_day"] = np.select(
            conditions,
            ['morning', 'afternoon', 'evening'],
            default='night'
        )

    return hours.assign(**labels).drop(columns=X.columns)

# Categorical view of the clock time: part-of-day label -> count/frequency
# encoding -> MinMaxScaler.
time_pipe_2 = Pipeline(
    steps=[
        ("Part_of_Day", FunctionTransformer(func=part_of_day)),
        ("Encoder", CountFrequencyEncoder()),
        ("Scalar", MinMaxScaler()),
    ]
)

# Stack the numeric and the categorical time features side by side.
ArrivalDepartureTime_Transformer = FeatureUnion(
    transformer_list=[
        ("Part_1", time_pipe_1),
        ("Part_2", time_pipe_2),
    ]
)


# DURATION PREPROCESSING

Duration_Transformer = Pipeline(steps=[
    # Impute first: feature_engine's Winsorizer raises on missing values by
    # default (missing_values='raise'), so an imputer placed after it would
    # never get the chance to repair the rows it exists for.
    ('Imputer', SimpleImputer(strategy='median')),
    # Cap extreme durations using the IQR rule with a 1.5 fold.
    ("Outlier", Winsorizer(capping_method='iqr', fold=1.5)),
    # Reshape the capped durations with PowerTransformer's default settings.
    ("scalar", PowerTransformer())
])


# TOTAL STOP PREPROCESSING

def is_direct(X):
    """Append an `is_direct_flight` 0/1 column (1 when total_stops equals 0).

    Accepts either a DataFrame with a `total_stops` column or a NumPy array,
    which is wrapped in a DataFrame whose single column is named
    'total_stops'.
    """
    # A NumPy array carries no column names, so give it the expected one.
    frame = pd.DataFrame(X, columns=['total_stops']) if isinstance(X, np.ndarray) else X
    direct_flag = frame.total_stops.eq(0).astype(int)
    return frame.assign(is_direct_flight=direct_flag)

# Impute missing stop counts with the most frequent value, then append the
# is_direct_flight flag.
TotalStop_Transformer = Pipeline(
    steps=[
        ("Imputer", SimpleImputer(strategy='most_frequent')),
        ("isDirectFlight", FunctionTransformer(func=is_direct)),
    ]
)


# ADDITIONAL INFO PREPROCESSING
# Group infrequent notes under 'Info', then one-hot encode.
AdditionalInfo_Transformation = Pipeline(
    steps=[
        ("Grouper", RareLabelEncoder(tol=0.2, n_categories=2, replace_with='Info')),
        ("Encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)


# CLASS PREPROCESSING
Class_Transformer = Pipeline(
    steps=[
        ("Encoder", OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ]
)


# COLUMNS TRANSFORMER
# Route each raw column (or column pair) to its dedicated transformer;
# columns not listed here are passed through unchanged.
Column_Transformer = ColumnTransformer(
    transformers=[
        ("Airline", Airline_Transformer, ['airline']),
        ("DOJ", DOJ_Transformer, ['date_of_journey']),
        ("Location", SourceDestination_Transformer, ["source", 'destination']),
        ("Time", ArrivalDepartureTime_Transformer, ["dep_time", "arrival_time"]),
        ("Duration", Duration_Transformer, ['duration']),
        ("Stops", TotalStop_Transformer, ['total_stops']),
        ("Info", AdditionalInfo_Transformation, ['additional_info']),
        ("Class", Class_Transformer, ['class_']),
    ],
    remainder='passthrough',
)


# Small, seeded forest used by the selector to score features one at a time.
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

# Feature selection by single-feature performance, scored with r2 against a 0.1 threshold.
selector = SelectBySingleFeaturePerformance(
    estimator=estimator,
    scoring="r2",
    threshold=0.1,
)

# preprocessor: column-wise feature engineering followed by feature selection
preprocessor = Pipeline(
    steps=[
        ("ct", Column_Transformer),
        ("selector", selector),
    ]
)



In [10]:
# Separate the feature columns from the 'price' target for the training split.
X_Train = train.drop(['price'], axis = 1)
Y_Train = train['price']

In [11]:
# Fit the preprocessing pipeline on the training features, using price as the target.
preprocessor.fit(
    train.drop(columns='price'),
    train.price.copy()
)

# Display the transformed training features produced by the fitted pipeline.
preprocessor.transform(train.drop(columns="price"))

Unnamed: 0,x1,x2,x4,x6,x9,x10,x19,x20,x21
0,1.0,0.0,0.0,0.588235,1.040285,1.039038,0.242156,1.0,0.0
1,0.0,0.0,0.0,0.294118,-1.878057,-0.848449,-1.777309,0.0,1.0
2,0.0,0.0,1.0,0.647059,-0.205899,-0.211326,-0.043462,1.0,0.0
3,0.0,1.0,0.0,0.823529,1.040285,1.039038,0.938128,1.0,0.0
4,0.0,1.0,0.0,0.941176,1.040285,1.039038,-0.903335,0.0,1.0
...,...,...,...,...,...,...,...,...,...
6690,0.0,0.0,0.0,0.000000,1.040285,1.039038,0.785363,1.0,0.0
6691,0.0,0.0,1.0,0.764706,-0.904774,-1.831073,-1.043363,0.0,1.0
6692,0.0,1.0,0.0,0.823529,1.040285,1.039038,1.272623,1.0,0.0
6693,1.0,0.0,0.0,0.411765,-0.904774,-1.831073,-0.985271,0.0,1.0


In [16]:
# Persist the fitted preprocessing pipeline to disk (relative to the working directory).
joblib.dump(preprocessor, "preprocessorMain.joblib")

['preprocessorMain.joblib']

In [12]:
# Run the column transformer on its own to inspect every engineered feature before selection.
# NOTE: this re-fits the same Column_Transformer object that `preprocessor` holds as its "ct" step.
transformed_data = Column_Transformer.fit_transform(X_Train, Y_Train)
transformed_df = pd.DataFrame(transformed_data)
transformed_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
0,0.0,1.0,0.0,0.0,0.0,0.666667,0.588235,0.5,0.307692,1.040285,1.039038,1.0,0.0,0.26087,0.909091,0.695652,0.181818,1.0,0.654008,0.242156,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.333333,0.294118,0.333333,0.076923,-1.878057,-0.848449,0.0,0.0,0.26087,0.363636,0.304348,0.727273,1.0,0.855485,-1.777309,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.666667,0.647059,0.333333,0.538462,-0.205899,-0.211326,1.0,0.0,0.478261,0.272727,0.782609,0.545455,1.0,0.654008,-0.043462,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.823529,1.0,0.307692,1.040285,1.039038,1.0,0.0,0.826087,0.818182,0.521739,0.636364,0.211834,0.0,0.938128,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.941176,0.166667,0.653846,1.040285,1.039038,1.0,0.0,0.391304,0.909091,0.565217,0.090909,1.0,0.0,-0.903335,0.0,1.0,0.0,1.0,0.0,1.0,0.0


## 4. MODEL SELECTION

#### IMPORTING ALL LIBRARIES WHICH WE WILL USE FOR MODEL SELECTION

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import r2_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import RandomizedSearchCV

#### MAKING A DICTIONARY OF ALL ML ALGORITHMS THAT WE WILL TRY

In [14]:
# Candidate regressors keyed by a display name.
# NOTE(review): no cell shown in this notebook reads this dict, and it is
# redefined with differently-cased keys further down.
algorithm = {
    "LINEAR REGRESSION": LinearRegression(),
    "SUPPORT VECTOR MACHINE": SVR(),
    "RANDOM FOREST": RandomForestRegressor(n_estimators=10),
    "XG BOOST": XGBRegressor(n_estimators = 10),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "GRADIENT BOOSTING REGRESSOR": GradientBoostingRegressor(),
    "DECISION TREE REGRESSOR": DecisionTreeRegressor()
    
}

#### SPLITTING DATA

In [78]:

def split_data(data):
    """Return (features, target): `data` without 'price', and its 'price' column."""
    return data.drop(columns='price'), data['price']

# NOTE(review): `data` (train + val stacked) is built here but not used by any
# cell shown; the training features/target below come from `train` alone.
data = pd.concat([train, val], axis=0)
# Feature/target pairs for fitting (train) and for scoring (validation, test).
X_data, y_data = split_data(train)
x_val, y_val = split_data(val)
x_test, y_test = split_data(test)

#### EVALUATING MODELS TO CHECK AND COMPARE SCORES

In [79]:
def evaluate_model(X, y, estimator=None):
    """Return the R^2 score of a fitted estimator's predictions on (X, y).

    Parameters
    ----------
    X : features passed to ``predict``.
    y : true target values.
    estimator : optional fitted object exposing ``predict``. When omitted, the
        notebook-level ``model`` variable is used, which keeps existing
        ``evaluate_model(X, y)`` calls working unchanged.

    Returns
    -------
    The value returned by ``r2_score(y, predictions)``.
    """
    # Prefer the explicit estimator so the score does not silently depend on
    # whichever cell last reassigned the global `model`.
    fitted = model if estimator is None else estimator
    return r2_score(y, fitted.predict(X))

In [80]:

# Baseline: shared preprocessor followed by a small XGBoost regressor.
model = Pipeline(steps=[
    ("pre", preprocessor),
    ("XG Boost", XGBRegressor(n_estimators = 10))
])

# Fit on the training split only.
model.fit(X_data, y_data)

# evaluate_model reads the notebook-level `model` assigned above.
print("TRAINING SCORE :- ",evaluate_model(X_data, y_data))
print("TESTING SCORE :- ",evaluate_model(x_test, y_test))
print("VALIDATION SCORE :-",evaluate_model(x_val, y_val))

TRAINING SCORE :-  0.7924318313598633
TESTING SCORE :-  0.7224295139312744
VALIDATION SCORE :- 0.7287412881851196


In [81]:
# Shared preprocessor followed by a GradientBoostingRegressor with default settings.
# The step name matches the estimator it holds, so the pipeline repr and
# named_steps identify the model correctly.
model = Pipeline(steps=[
    ("pre", preprocessor),
    ("Gradient Boosting", GradientBoostingRegressor())
])

# Fit on the training split only.
model.fit(X_data, y_data)

# evaluate_model reads the notebook-level `model` assigned above.
print("TRAINING SCORE :- ",evaluate_model(X_data, y_data))
print("TESTING SCORE :- ",evaluate_model(x_test, y_test))
print("VALIDATION SCORE :-",evaluate_model(x_val, y_val))


TRAINING SCORE :-  0.7467180579711981
TESTING SCORE :-  0.6885594774553522
VALIDATION SCORE :- 0.7083795988572368


In [82]:
# Candidate regressors keyed by a display name.
# NOTE(review): this repeats an earlier `algorithm` dict with the same
# estimators under differently-cased keys, and no cell shown reads it.
algorithm = {
    "Linear Regression": LinearRegression(),
    "Support Vector Machine": SVR(),
    "Random Forest": RandomForestRegressor(n_estimators=10),
    "XG Boost": XGBRegressor(n_estimators = 10),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "GradientBoosting": GradientBoostingRegressor(),
    "DecisionTree": DecisionTreeRegressor()
    
}

In [83]:
# Shared preprocessor followed by a 5-nearest-neighbours regressor.
# The step name matches the estimator it holds, so the pipeline repr and
# named_steps identify the model correctly.
model = Pipeline(steps=[
    ("pre", preprocessor),
    ("KNN", KNeighborsRegressor(n_neighbors=5))
])

# Fit on the training split only.
model.fit(X_data, y_data)

# evaluate_model reads the notebook-level `model` assigned above.
print("TRAINING SCORE :- ",evaluate_model(X_data, y_data))
print("TESTING SCORE :- ",evaluate_model(x_test, y_test))
print("VALIDATION SCORE :-",evaluate_model(x_val, y_val))


TRAINING SCORE :-  0.8010396747755794
TESTING SCORE :-  0.6669371225248105
VALIDATION SCORE :- 0.6887589646839729


In [84]:
# Shared preprocessor followed by a DecisionTreeRegressor with default settings.
# The step name matches the estimator it holds, so the pipeline repr and
# named_steps identify the model correctly.
model = Pipeline(steps=[
    ("pre", preprocessor),
    ("Decision Tree", DecisionTreeRegressor())
])

# Fit on the training split only.
model.fit(X_data, y_data)

# evaluate_model reads the notebook-level `model` assigned above.
print("TRAINING SCORE :- ",evaluate_model(X_data, y_data))
print("TESTING SCORE :- ",evaluate_model(x_test, y_test))
print("VALIDATION SCORE :-",evaluate_model(x_val, y_val))


TRAINING SCORE :-  0.9216479011209151
TESTING SCORE :-  0.6103335281655127
VALIDATION SCORE :- 0.6075856719493617


####  HYPERPARAMETER TUNING USING RANDOM SEARCH

In [None]:

# import xgboost as xgb

# xgboost_model = xgb.XGBRegressor()

# param_dist = {
#     'n_estimators': [100, 200, 300, 400],
#     'learning_rate': [0.01, 0.05, 0.1, 0.2],
#     'max_depth': [3, 4, 5, 6],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0],
#     'gamma': [0, 0.1, 0.2],
#     'reg_alpha': [0, 0.1, 1],
#     'reg_lambda': [1, 1.5, 2]
# }

# random_search = RandomizedSearchCV(
#     xgboost_model,
#     param_distributions=param_dist,
#     n_iter=50,  
#     scoring='neg_mean_squared_error',  
#     cv=5,  
#     verbose=3,
#     n_jobs=-1,
#     random_state=42
# )

# random_search.fit(transformed_df, y_data)

# print("Best Hyperparameters: ", random_search.best_params_)


In [85]:
# Hyperparameters passed to the final XGBRegressor.
# NOTE(review): presumably copied from the output of the commented-out
# RandomizedSearchCV cell above; that search is not executed in this notebook.
best_param = {'subsample': 0.8,
              'reg_lambda': 2,
              'reg_alpha': 1,
              'n_estimators': 300,
              'max_depth': 6,
              'learning_rate': 0.1,
              'gamma': 0,
              'colsample_bytree': 0.6
}

#### Finally, train the model with the best hyperparameters found by the random search

In [86]:

# Final model: shared preprocessor followed by XGBoost configured with best_param.
model = Pipeline(steps=[
    ("pre", preprocessor),
    ("XG Boost", XGBRegressor(**best_param))
])

# Fit on the training split only.
model.fit(X_data, y_data)

# evaluate_model reads the notebook-level `model` assigned above.
print("TRAINING SCORE :- ",evaluate_model(X_data, y_data))
print("TESTING SCORE :- ",evaluate_model(x_test, y_test))
print("VALIDATION SCORE :-",evaluate_model(x_val, y_val))

# Persist the fitted end-to-end pipeline (preprocessing + regressor).
joblib.dump(model, "model.joblib")

TRAINING SCORE :-  0.8570930361747742
TESTING SCORE :-  0.7372660040855408
VALIDATION SCORE :- 0.7355228662490845


In [89]:
# Persist the fitted pipeline; this writes the same "model.joblib" file as the dump at the end of the training cell.
joblib.dump(model, "model.joblib")

['model.joblib']

In [15]:
import joblib

In [90]:
# Reload the pipeline from the same relative path it was dumped to, so the
# notebook does not depend on one machine's absolute directory layout.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
saved_model = joblib.load("model.joblib")

In [91]:
# Display the reloaded pipeline.
saved_model

In [92]:
from sklearn.pipeline import Pipeline

# Sanity check: the reloaded object is a scikit-learn Pipeline.
print(isinstance(saved_model, Pipeline))


True
