In [49]:
import pandas as pd

In [50]:
# Load the cleaned flight-fare dataset from the local data directory and
# preview its first five rows.
df = pd.read_excel(io="./data/clean_data.xlsx")
df.head(n=5)

Unnamed: 0,Airline,Price,Day,Month,Duration_In_Minutes,Total_Stops_Count,Dep_time_range,Source_Destination
0,IndiGo,3897,24,3,170,0,From 22 to 24,Banglore-New Delhi
1,Air India,7662,1,5,445,2,From 4 to 6,Kolkata-Banglore
2,Jet Airways,13882,9,6,1140,2,From 8 to 10,Delhi-Cochin
3,IndiGo,6218,12,5,325,1,From 18 to 20,Kolkata-Banglore
4,IndiGo,13302,1,3,285,1,From 16 to 18,Banglore-New Delhi


In [51]:
## Separate the independent features (X) from the dependent target (Y).
# X keeps every column except 'Price'; Y is kept as a single-column DataFrame.
X = df.drop(columns=['Price'])
Y = df.loc[:, ['Price']]

In [52]:

# Preview the feature frame to confirm that 'Price' is no longer among the columns.
X.head()

Unnamed: 0,Airline,Day,Month,Duration_In_Minutes,Total_Stops_Count,Dep_time_range,Source_Destination
0,IndiGo,24,3,170,0,From 22 to 24,Banglore-New Delhi
1,Air India,1,5,445,2,From 4 to 6,Kolkata-Banglore
2,Jet Airways,9,6,1140,2,From 8 to 10,Delhi-Cochin
3,IndiGo,12,5,325,1,From 18 to 20,Kolkata-Banglore
4,IndiGo,1,3,285,1,From 16 to 18,Banglore-New Delhi


In [53]:

# Split the feature columns by dtype: object (string) columns are one-hot
# encoded later on, every other column goes through the numerical pipeline.
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

In [54]:
# Show the object-dtype columns selected for categorical encoding.
categorical_cols

Index(['Airline', 'Dep_time_range', 'Source_Destination'], dtype='object')

In [55]:
# Show the non-object columns selected for numerical scaling.
numerical_cols

Index(['Day', 'Month', 'Duration_In_Minutes', 'Total_Stops_Count'], dtype='object')

In [56]:
# Explicit category vocabularies handed to OneHotEncoder.
# Order matters: the encoder is created with drop="first", so the first entry
# of each list is the category whose indicator column is dropped.
airlines = [
    'IndiGo',
    'Air India',
    'Jet Airways',
    'SpiceJet',
    'Multiple carriers',
    'GoAir',
    'Vistara',
    'Air Asia',
    'Vistara Premium economy',
    'Jet Airways Business',
    'Multiple carriers Premium economy',
    'Trujet',
]

dep_time_ranges = [
    'From 22 to 24',
    'From 4 to 6',
    'From 8 to 10',
    'From 18 to 20',
    'From 16 to 18',
    'From 10 to 12',
    'From 20 to 22',
    'From 14 to 16',
    'From 6 to 8',
    'From 12 to 14',
    'From 2 to 4',
    'From 0 to 2',
]

source_destination = [
    'Banglore-New Delhi',
    'Kolkata-Banglore',
    'Delhi-Cochin',
    'Chennai-Kolkata',
    'Banglore-Delhi',
    'Mumbai-Hyderabad',
]

In [57]:
from sklearn.impute import SimpleImputer ## Handling missing values
from sklearn.preprocessing import StandardScaler # Handling feature scaling
from sklearn.preprocessing import OneHotEncoder # One-hot encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [58]:
## Numerical pipeline: fill missing values with the column median, then
## standardise each column.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: fill missing values with the most frequent category,
# one-hot encode against the fixed vocabularies (dropping the first category
# of each feature), then scale without centering (with_mean=False).
# The three category lists are given in the same order as categorical_cols:
# Airline, Dep_time_range, Source_Destination.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehotencoder', OneHotEncoder(
        categories=[airlines, dep_time_ranges, source_destination],
        drop="first",
    )),
    ('scaler', StandardScaler(with_mean=False)),
])

# Route each column group to its pipeline. The transformer names below become
# the prefixes of the generated feature names.
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols),
])



In [59]:
## Train test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=30,
)

In [60]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformer to the test split so no test statistics leak into the fit.
X_train_arr = preprocessor.fit_transform(X_train)
X_test_arr = preprocessor.transform(X_test)

# ColumnTransformer returns a SciPy sparse matrix only while the stacked output
# is sparse enough (see its `sparse_threshold`); otherwise it returns a dense
# ndarray, on which `.todense()` would raise AttributeError. Convert only when
# the result is actually sparse, and use `toarray()` so we get a plain ndarray
# rather than the legacy numpy.matrix type.
if hasattr(X_train_arr, "toarray"):
    X_train_arr = X_train_arr.toarray()
if hasattr(X_test_arr, "toarray"):
    X_test_arr = X_test_arr.toarray()

# Look the generated feature names up once and reuse them for both frames.
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(X_train_arr, columns=feature_names)
X_test = pd.DataFrame(X_test_arr, columns=feature_names)

In [62]:
# Preview the transformed training features (scaled numerics plus one-hot columns).
X_train.head()

Unnamed: 0,num_pipeline__Day,num_pipeline__Month,num_pipeline__Duration_In_Minutes,num_pipeline__Total_Stops_Count,cat_pipeline__Airline_Air India,cat_pipeline__Airline_Jet Airways,cat_pipeline__Airline_SpiceJet,cat_pipeline__Airline_Multiple carriers,cat_pipeline__Airline_GoAir,cat_pipeline__Airline_Vistara,...,cat_pipeline__Dep_time_range_From 14 to 16,cat_pipeline__Dep_time_range_From 6 to 8,cat_pipeline__Dep_time_range_From 12 to 14,cat_pipeline__Dep_time_range_From 2 to 4,cat_pipeline__Dep_time_range_From 0 to 2,cat_pipeline__Source_Destination_Kolkata-Banglore,cat_pipeline__Source_Destination_Delhi-Cochin,cat_pipeline__Source_Destination_Chennai-Kolkata,cat_pipeline__Source_Destination_Banglore-Delhi,cat_pipeline__Source_Destination_Mumbai-Hyderabad
0,1.614366,0.257571,0.3613,0.301044,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.799983,0.0,0.0,0.0,0.0,2.029546,0.0,0.0,0.0
1,1.614366,0.257571,-0.039324,0.301044,0.0,2.092296,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.029546,0.0,0.0,0.0
2,0.549822,0.257571,0.3613,1.816193,0.0,2.092296,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.029546,0.0,0.0,0.0
3,0.194973,1.115711,0.241113,0.301044,0.0,0.0,0.0,3.147872,0.0,0.0,...,0.0,0.0,4.320138,0.0,0.0,0.0,2.029546,0.0,0.0,0.0
4,-0.869571,0.257571,1.613251,0.301044,2.708852,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.239855,0.0,0.0,0.0,0.0


In [63]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [64]:
# Baseline: ordinary least-squares regression fitted on the preprocessed
# training features.
regression = LinearRegression()
regression.fit(X=X_train, y=y_train)

In [65]:
# Inspect the fitted coefficients (one per preprocessed feature column).
regression.coef_

array([[-632.92737209, -524.64112254,   13.27340624, 1845.0618316 ,
         499.59983064, 1997.28682985, -167.43110136, 1082.49843945,
         -39.63844704,  441.20980139,  -24.33956538,   37.65544606,
        1017.04406715,  132.66122788,  -36.26650213, -177.94683803,
         -99.39570258, -262.38836698,  -37.4634087 , -184.36035143,
        -124.72406202,  -67.03772936, -313.40025001, -128.17962997,
         -79.89488956,  -29.92018483, -790.17429908, -757.34030149,
        -307.96000535, -811.87489464, -783.88281074]])

In [66]:
# Inspect the fitted intercept term.
regression.intercept_

array([9060.30903669])

In [67]:
import numpy as np
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted target values, compared against ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error, and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it instead of calling
    # mean_squared_error a second time.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [68]:
## Train multiple models

# Candidate linear models, all with default hyperparameters.
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
# NOTE(review): trained_model_list is initialised but never filled in this
# cell; confirm whether a later cell relies on it.
trained_model_list=[]
model_list=[]
r2_list=[]

# Iterate over name/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) and indexing on every pass.
for model_name, model in models.items():
    model.fit(X_train,y_train)

    # Make predictions on the held-out test split
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE(review): the banner says "Training", but the metrics below are
    # computed on X_test / y_test. The text is left unchanged so it matches
    # the recorded output.
    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 2842.728251486809
MAE: 1932.2971015489497
R2 score 63.38744550294226


Lasso
Model Training Performance
RMSE: 2843.196759041111
MAE: 1931.2756351912296
R2 score 63.37537634236603


Ridge
Model Training Performance
RMSE: 2842.753904063977
MAE: 1932.2098081119605
R2 score 63.38678472179533


Elasticnet
Model Training Performance
RMSE: 3089.893255038874
MAE: 2083.0473298096663
R2 score 56.74400637408


