In [19]:
import pandas as pd

In [22]:
df=pd.read_csv('data/cleaned_data2.csv')

In [23]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Date,Month
0,0,IndiGo,Banglore,New Delhi,non-stop,3897,24,3


In [25]:
df.columns

Index(['Unnamed: 0', 'Airline', 'Source', 'Destination', 'Total_Stops',
       'Price', 'Date', 'Month'],
      dtype='object')

In [26]:
df=df.drop(labels=['Unnamed: 0'],axis=1)

In [28]:
df.head(5)

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Date,Month
0,IndiGo,Banglore,New Delhi,non-stop,3897,24,3
1,Air India,Kolkata,Banglore,2 stops,7662,1,5
2,Jet Airways,Delhi,Cochin,2 stops,13882,9,6
3,IndiGo,Kolkata,Banglore,1 stop,6218,12,5
4,IndiGo,Banglore,New Delhi,1 stop,13302,1,3


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10462 entries, 0 to 10461
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Airline      10462 non-null  object
 1   Source       10462 non-null  object
 2   Destination  10462 non-null  object
 3   Total_Stops  10462 non-null  object
 4   Price        10462 non-null  int64 
 5   Date         10462 non-null  int64 
 6   Month        10462 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 572.3+ KB


In [30]:
## Independent and dependent features
X = df.drop(labels=['Price'],axis=1)


In [31]:
X

Unnamed: 0,Airline,Source,Destination,Total_Stops,Date,Month
0,IndiGo,Banglore,New Delhi,non-stop,24,3
1,Air India,Kolkata,Banglore,2 stops,1,5
2,Jet Airways,Delhi,Cochin,2 stops,9,6
3,IndiGo,Kolkata,Banglore,1 stop,12,5
4,IndiGo,Banglore,New Delhi,1 stop,1,3
...,...,...,...,...,...,...
10457,Air Asia,Kolkata,Banglore,non-stop,9,4
10458,Air India,Kolkata,Banglore,non-stop,27,4
10459,Jet Airways,Banglore,Delhi,non-stop,27,4
10460,Vistara,Banglore,New Delhi,non-stop,1,3


In [32]:
Y = df[['Price']]
Y

Unnamed: 0,Price
0,3897
1,7662
2,13882
3,6218
4,13302
...,...
10457,4107
10458,4145
10459,7229
10460,12648


In [34]:
# Segregating numerical and categorical variables
categorical_cols = X.select_dtypes(include='object').columns
categorical_cols 


Index(['Airline', 'Source', 'Destination', 'Total_Stops'], dtype='object')

In [35]:
numerical_cols = X.select_dtypes(exclude='object').columns
numerical_cols

Index(['Date', 'Month'], dtype='object')

In [36]:
df.head(1)

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Date,Month
0,IndiGo,Banglore,New Delhi,non-stop,3897,24,3


In [37]:
# Define the custom ranking for each ordinal variable
Airline_categories = ['Air Asia','GoAir','SpiceJet','IndiGo','Trujet','Air India','Vistara','Multiple carriers','Vistara Premium economy','Jet Airways','Multiple carriers Premium economy','Jet Airways Business']
Source_categories = ['Delhi', 'Mumbai', 'Banglore', 'Kolkata', 'Chennai']
Destination_categories = ['Delhi','New Delhi','Banglore','Kolkata','Hyderabad','Cochin']
TotalStops_categories = ['non-stop', '1 stop', '2 stops', '3 stops', '4 stops']

In [38]:
%pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [39]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [40]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[Airline_categories,Source_categories,Destination_categories,TotalStops_categories])),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [41]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [42]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((7323, 6), (3139, 6), (7323, 1), (3139, 1))

In [43]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [44]:
X_train.head()

Unnamed: 0,num_pipeline__Date,num_pipeline__Month,cat_pipeline__Airline,cat_pipeline__Source,cat_pipeline__Destination,cat_pipeline__Total_Stops
0,1.614366,0.257571,-1.008152,-1.067411,1.033141,0.301044
1,1.614366,0.257571,1.113618,-1.067411,1.033141,0.301044
2,0.549822,0.257571,1.113618,-1.067411,1.033141,1.816193
3,0.194973,1.115711,0.406361,-1.067411,1.033141,0.301044
4,-0.869571,0.257571,-0.300896,1.141099,-0.591095,0.301044


In [45]:
## Model Training

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [49]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [50]:
## Train multiple models
## Model Evaluation
models={
    'LogisticRegression':LogisticRegression(),
    'KNeighborsRegressor':KNeighborsRegressor(),
    'DecisionTreeRegressor':DecisionTreeRegressor(),
    'GradientBoostingRegressor':GradientBoostingRegressor(),
    'RandomForestRegressor':RandomForestRegressor()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

  y = column_or_1d(y, warn=True)


LogisticRegression
Model Training Performance
RMSE: 3239.2818652517076
MAE: 1917.5345651481364
R2 score 52.460258133233474


KNeighborsRegressor
Model Training Performance
RMSE: 2366.2208476284527
MAE: 1462.753042370182
R2 score 74.63295421874534


DecisionTreeRegressor
Model Training Performance
RMSE: 2201.120529727784
MAE: 1389.7641947744155
R2 score 78.04937002701536




  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


GradientBoostingRegressor
Model Training Performance
RMSE: 2286.2813056926834
MAE: 1539.2107730609896
R2 score 76.31798398863016




  return fit_method(estimator, *args, **kwargs)


RandomForestRegressor
Model Training Performance
RMSE: 2171.488242941333
MAE: 1379.180406012843
R2 score 78.63640656049893




In [51]:
from sklearn.model_selection import RandomizedSearchCV

In [52]:
randomgrid= {
    'n_estimators':[100, 120, 150, 170, 200, 220],
    'max_features':['auto','sqrt'],
    'max_depth':[5,10,15,20]
}

In [53]:
rf=RandomForestRegressor()
rf_random=RandomizedSearchCV(estimator=rf,param_distributions=randomgrid,cv=3,verbose=2,n_jobs=-1,)

rf_random.fit(X_train,y_train)

# best parameter
rf_random.best_params_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


12 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameter

{'n_estimators': 150, 'max_features': 'sqrt', 'max_depth': 10}

In [56]:

#predicting the values
prediction = rf_random.predict(X_test)


In [57]:
r2_score(y_test,prediction)

0.7859532207341722