In [None]:
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.preprocessing import OneHotEncoder,RobustScaler,OrdinalEncoder,StandardScaler,TargetEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from xgboost import XGBClassifier,XGBRegressor

In [None]:
# Read the training CSV into a DataFrame and preview its first four rows.
df = pd.read_csv(filepath_or_buffer='/content/Data_Train.csv')
df.head(n=4)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218


In [None]:
# Print each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [None]:
# Capture the column labels as they are at this point (before any feature
# engineering) and display them.
columns=df.columns
columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price'],
      dtype='object')

In [None]:
# Report how many distinct (non-null) values each captured column holds.
for name, distinct in df[columns].nunique().items():
  print(name,":",distinct)

Airline : 12
Date_of_Journey : 44
Source : 5
Destination : 6
Route : 128
Dep_Time : 222
Arrival_Time : 1343
Duration : 368
Total_Stops : 5
Additional_Info : 10
Price : 1870


In [None]:
# Parse the "HH:MM" departure time, expose hour and minute as numeric
# features, and remove the raw text column.
dep_parsed = pd.to_datetime(df["Dep_Time"], format="%H:%M")
df = df.assign(
    Dep_Hour=dep_parsed.dt.hour,
    Dep_Min=dep_parsed.dt.minute,
).drop(columns="Dep_Time")

In [None]:
# Keep only the text before the first space of the arrival stamp (the "HH:MM"
# clock part; any trailing date such as "22 Mar" is discarded), expose hour and
# minute as numeric features, and remove the raw text column.
arrival_parsed = pd.to_datetime(
    df["Arrival_Time"].map(lambda stamp: stamp.partition(" ")[0]),
    format="%H:%M",
)
df = df.assign(
    Arrival_Hour=arrival_parsed.dt.hour,
    Arrival_Min=arrival_parsed.dt.minute,
).drop(columns="Arrival_Time")


In [None]:
# Parse the day/month/year journey date, keep day-of-month and month as
# numeric features, and remove the raw text column.
journey_parsed = pd.to_datetime(df["Date_of_Journey"], format="%d/%m/%Y")
df = df.assign(
    Journey_Day=journey_parsed.dt.day,
    Journey_Month=journey_parsed.dt.month,
).drop(columns="Date_of_Journey")

In [None]:
def convert_duration(duration):
    """Convert a duration string such as ``"2h 50m"`` into total minutes.

    Accepted shapes are ``"<H>h"``, ``"<M>m"`` and ``"<H>h <M>m"``. The unit
    letters are matched case-insensitively, surrounding whitespace is ignored,
    and the space between the hour and minute parts is optional
    (``"2h50m"`` is parsed the same as ``"2h 50m"``).

    Parameters
    ----------
    duration : str
        Duration text to parse.

    Returns
    -------
    int
        ``hours * 60 + minutes``; a part that is absent counts as 0.

    Raises
    ------
    TypeError
        If ``duration`` is not a string (for example a missing value).
    ValueError
        If the hour or minute part is not an integer.
    """
    if not isinstance(duration, str):
        raise TypeError(
            f"duration must be a string like '2h 50m', got {type(duration).__name__}"
        )

    hours, minutes = 0, 0
    remainder = duration.strip().lower()
    if "h" in remainder:
        # Everything before the first "h" is the hour count; what follows
        # (if anything) can only be the minute part.
        hour_text, remainder = remainder.split("h", 1)
        hours = int(hour_text)
    if "m" in remainder:
        # int() tolerates the leading space left over from "2h 50m".
        minutes = int(remainder.split("m", 1)[0])
    return hours * 60 + minutes

# Replace the textual duration with its length in minutes.
df["Duration"] = df["Duration"].map(convert_duration)

In [None]:
# Discard every row that still contains a missing value.
df = df.dropna(axis=0, how="any")

In [None]:
# Separate the regression target (Price) from the predictor columns.
y = df['Price']
x = df.drop(columns=['Price'])

In [None]:
# 80/20 train/test split; the fixed seed keeps the partition repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Partition the training columns by dtype: numeric ones get scaled,
# everything else is treated as categorical.
num_col = xtrain.select_dtypes(include=['number']).columns
cat_col = xtrain.select_dtypes(exclude=['number']).columns

In [None]:
# Column-wise preprocessing: RobustScaler on the numeric columns and
# integer codes for the categorical ones, with categories not seen during
# fit encoded as -1.
preprocessing = ColumnTransformer(
    [
        ('scaler', RobustScaler(), num_col),
        (
            'ordinalencoder',
            OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value'),
            cat_col,
        ),
    ]
)

In [None]:
# Baseline model: preprocessing followed by an unconstrained regression tree.
#
# clone() gives this pipeline its own unfitted copy of the transformer.
# Pipeline.fit fits its steps in place, so passing the single `preprocessing`
# object to several pipelines would let a later fit of any one of them
# overwrite the fitted preprocessing state of the others.
decisiontreepipeline = Pipeline(
    steps=[
        ('preprocessing', clone(preprocessing)),
        # Fixed seed (same value as the train/test split) so the fitted tree
        # and its scores are reproducible across runs.
        ('model', DecisionTreeRegressor(random_state=42)),
    ]
)
decisiontreepipeline.fit(xtrain, ytrain)

In [None]:
# Regularised random forest trained on absolute error.
#
# clone() gives this pipeline its own unfitted copy of the transformer.
# Pipeline.fit fits its steps in place, so passing the single `preprocessing`
# object to several pipelines would let a later fit of any one of them
# overwrite the fitted preprocessing state of the others.
random_forestpipeline = Pipeline(
    steps=[
        ('preprocessing', clone(preprocessing)),
        (
            'model',
            RandomForestRegressor(
                max_features='sqrt',
                max_depth=10,
                min_samples_split=10,
                min_samples_leaf=10,
                criterion='absolute_error',
                # Bootstrapping and feature sub-sampling are random; fix the
                # seed so the reported scores can be reproduced.
                random_state=42,
                # Build trees on all cores; this does not change the result
                # for a fixed random_state.
                n_jobs=-1,
            ),
        ),
    ]
)
random_forestpipeline.fit(xtrain, ytrain)

In [None]:
# R^2 of the random forest on the training split.
random_forestpipeline.score(X=xtrain, y=ytrain)

0.7568597557449511

In [None]:
# R^2 of the random forest on the held-out test split.
random_forestpipeline.score(X=xtest, y=ytest)

0.7601769120684279

In [None]:
# Exhaustive 3-fold search over the decision-tree pipeline's depth, split,
# leaf and criterion settings, ranked by negated mean absolute error
# (higher, i.e. closer to zero, is better).
grid_search_cv = GridSearchCV(
    decisiontreepipeline,
    {
        'model__max_depth': [None, 5, 10, 15, 20],
        'model__min_samples_split': [2, 4, 6, 8, 10],
        'model__min_samples_leaf': [1, 3, 5, 7, 10],
        'model__criterion': ['absolute_error', 'squared_error'],
    },
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search_cv.fit(xtrain, ytrain)

Fitting 3 folds for each of 250 candidates, totalling 750 fits


In [None]:
# Negated MAE (the search's scoring metric) of the best estimator on the training split.
grid_search_cv.score(X=xtrain, y=ytrain)

-293.86459211562766

In [None]:
# Negated MAE (the search's scoring metric) of the best estimator on the test split.
grid_search_cv.score(X=xtest, y=ytest)

-721.8106943138462

In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
grid_search_cv.best_params_

{'model__criterion': 'squared_error',
 'model__max_depth': 20,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 6}

In [None]:
# Pipeline configured with the winning hyper-parameters. No `refit` argument
# was passed to GridSearchCV, so its default (refit=True) applies and this
# estimator has already been refit on the data given to grid_search_cv.fit.
model=grid_search_cv.best_estimator_

In [None]:
# GridSearchCV (refit=True by default) has already refit best_estimator_ on
# xtrain/ytrain, so fitting it again is redundant and, if the tree has no
# fixed random_state, may produce a model that differs from the one the
# search selected. Display the already-fitted pipeline instead.
model

In [None]:
# R^2 of the tuned decision-tree pipeline on the training split
# (Pipeline.score uses the regressor's default metric, not the search's MAE).
model.score(X=xtrain, y=ytrain)

0.974475761140556

In [None]:
# R^2 of the tuned decision-tree pipeline on the held-out test split.
model.score(X=xtest, y=ytest)

0.7989967388815331

In [None]:
# Mean cross-validated score (negated MAE) of the best parameter combination.
grid_search_cv.best_score_

np.float64(-816.0810429656079)

In [None]:
# Gradient-boosted trees with default hyper-parameters.
#
# clone() gives this pipeline its own unfitted copy of the transformer.
# Pipeline.fit fits its steps in place, so passing the single `preprocessing`
# object to several pipelines would let a later fit of any one of them
# overwrite the fitted preprocessing state of the others.
xgboostpipeline = Pipeline(
    steps=[
        ('preprocessing', clone(preprocessing)),
        # Seed stated explicitly (same value as the train/test split) so the
        # run does not depend on the library's default seed.
        ('model', XGBRegressor(random_state=42)),
    ]
)
xgboostpipeline.fit(xtrain, ytrain)

In [None]:
# R^2 of the XGBoost pipeline on the training split.
xgboostpipeline.score(X=xtrain, y=ytrain)

0.974148690700531

In [None]:
# R^2 of the XGBoost pipeline on the held-out test split.
xgboostpipeline.score(X=xtest, y=ytest)

0.8649906516075134