In [1]:
import pandas as pd 

In [2]:
df=pd.read_csv('data/flight_price.csv')

In [47]:
df

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,days_left,price
0,SpiceJet,Delhi,Evening,zero,Night,Mumbai,Economy,1,5953
1,SpiceJet,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,1,5953
2,AirAsia,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,1,5956
3,Vistara,Delhi,Morning,zero,Afternoon,Mumbai,Economy,1,5955
4,Vistara,Delhi,Morning,zero,Morning,Mumbai,Economy,1,5955
...,...,...,...,...,...,...,...,...,...
300148,Vistara,Chennai,Morning,one,Evening,Hyderabad,Business,49,69265
300149,Vistara,Chennai,Afternoon,one,Night,Hyderabad,Business,49,77105
300150,Vistara,Chennai,Early_Morning,one,Night,Hyderabad,Business,49,79099
300151,Vistara,Chennai,Early_Morning,one,Evening,Hyderabad,Business,49,81585


In [3]:
df.drop(columns=['Unnamed: 0','flight','duration'],inplace=True)

In [6]:
X = df.drop(columns=['price'], axis = 1)
Y = df["price"]

In [46]:
numerical_columns = X.select_dtypes(exclude= 'object').columns
categorical_columns = X.select_dtypes(include= 'object').columns

In [8]:
numerical_columns

Index(['days_left'], dtype='object')

In [9]:
categorical_columns

Index(['airline', 'source_city', 'departure_time', 'stops', 'arrival_time',
       'destination_city', 'class'],
      dtype='object')

In [10]:
df["airline"].value_counts()

airline
Vistara      127859
Air_India     80892
Indigo        43120
GO_FIRST      23173
AirAsia       16098
SpiceJet       9011
Name: count, dtype: int64

In [11]:
df["source_city"].value_counts()

source_city
Delhi        61343
Mumbai       60896
Bangalore    52061
Kolkata      46347
Hyderabad    40806
Chennai      38700
Name: count, dtype: int64

In [13]:
df["destination_city"].value_counts()

destination_city
Mumbai       59097
Delhi        57360
Bangalore    51068
Kolkata      49534
Hyderabad    42726
Chennai      40368
Name: count, dtype: int64

In [14]:
df["departure_time"].value_counts()

departure_time
Morning          71146
Early_Morning    66790
Evening          65102
Night            48015
Afternoon        47794
Late_Night        1306
Name: count, dtype: int64

In [15]:
df["arrival_time"].value_counts()

arrival_time
Night            91538
Evening          78323
Morning          62735
Afternoon        38139
Early_Morning    15417
Late_Night       14001
Name: count, dtype: int64

In [16]:
df["class"].value_counts()

class
Economy     206666
Business     93487
Name: count, dtype: int64

In [17]:
df["stops"].value_counts()

stops
one            250863
zero            36004
two_or_more     13286
Name: count, dtype: int64

In [28]:
airline_categories = ["SpiceJet", "AirAsia", "GO_FIRST", "Indigo", "Air_India","Vistara"]
source_city_categories = ["Chennai", "Hyderabad", "Kolkata", "Bangalore", "Mumbai", "Delhi"]
destination_city_categories = ["Chennai", "Hyderabad", "Kolkata", "Bangalore","Delhi","Mumbai"]
departure_time_categories = ["Late_Night","Afternoon","Night","Evening","Early_Morning","Morning"]
arrival_time_categories = ["Late_Night", "Early_Morning", "Afternoon", "Morning", "Evening", "Night"]
class_categories =["Business","Economy"]
stops_categories = ["two_or_more","zero","one"]

In [19]:
from sklearn.impute import SimpleImputer # Missing values
from sklearn.preprocessing import StandardScaler # Feature scaling (Numerical datatypes)
from sklearn.preprocessing import OrdinalEncoder # To rank the categorical variables
#Pipeline 
from sklearn.pipeline import Pipeline # To club everything together 
from sklearn.compose import ColumnTransformer # Begin the work 

In [32]:
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy= "median")), 
        ("scaler", StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy= "most_frequent")), 
        ("OrdinalEncoder", OrdinalEncoder(categories= 
                                          [airline_categories,
                                           source_city_categories,
                                           departure_time_categories,
                                           stops_categories,
                                           arrival_time_categories,
                                           destination_city_categories,
                                           class_categories])), 
        ('scaler', StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, numerical_columns), 
        ('cat_pipeline', cat_pipeline, categorical_columns)
    ]
)

In [21]:
from sklearn.model_selection import train_test_split


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size= 0.25, random_state= 45)

In [38]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out()) 
X_test = pd.DataFrame(preprocessor.transform(X_test), columns = preprocessor.get_feature_names_out())

In [39]:
X_train.head()

Unnamed: 0,num_pipeline__days_left,cat_pipeline__airline,cat_pipeline__source_city,cat_pipeline__departure_time,cat_pipeline__stops,cat_pipeline__arrival_time,cat_pipeline__destination_city,cat_pipeline__class
0,-0.147459,0.854078,1.307019,-1.570437,0.41333,-1.066096,1.342467,-1.487931
1,-1.622915,0.111906,-1.066034,-1.570437,0.41333,-2.485572,0.157864,0.672074
2,0.295178,0.111906,-0.47277,-0.85759,0.41333,-0.356357,-1.026738,-1.487931
3,-1.549142,0.854078,0.713756,1.280952,-1.574374,-1.066096,0.750166,0.672074
4,0.295178,-1.372439,-1.066034,0.568105,0.41333,0.353381,-0.434437,0.672074


In [41]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
regression = LinearRegression()
regression.fit(X_train, y_train)

In [42]:
regression.coef_

array([ -1807.46679004,   1198.10739739,   -410.42409902,    259.20685782,
         1121.5122594 ,    515.82529105,   -307.54811906, -20590.74468672])

In [43]:
regression.intercept_

20880.243903089096

In [44]:
import numpy as np 
def model_evaluation(true, predicted): 
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    r2_square = r2_score(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    return mae, mse, r2_square, rmse 

In [45]:
models = {
    "LinearRegression": LinearRegression(), 
    "Lasso": Lasso(), 
    "Ridge": Ridge()
}

model_list = []
r2_list = []

for i in range(len(list(models))): 
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mae, mse, r2_square, rmse = model_evaluation(y_test, y_pred)
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model training performance")
    print(f"The MAE score is {mae}")
    print(f"The MSE score is {mse}")
    print(f"The R2 score is {r2_square}")
    print(f"The RMSE score is {rmse}")

    r2_list.append(r2_square)
    print("*"*35)
    print("\n")

LinearRegression
Model training performance
The MAE score is 4583.282309831316
The MSE score is 55879921.567849845
The R2 score is 0.8916699472799637
The RMSE score is 7475.287390318171
***********************************


Lasso
Model training performance
The MAE score is 4583.011684400775
The MSE score is 55880157.59543991
The R2 score is 0.8916694897116537
The RMSE score is 7475.303177493199
***********************************


Ridge
Model training performance
The MAE score is 4583.294896654157
The MSE score is 55879916.41834265
The R2 score is 0.8916699572629123
The RMSE score is 7475.287045882763
***********************************


