# Flight fare prediction

* importing all needed packages

1. Import Data 
2. EDA
3. Feature Engineering
4. Model Building
5. Model Evaluation
6. HyperParameter Tuning
7. Conclusion

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,roc_auc_score

## 1. import data

In [5]:
# Load the training workbook; the path is relative to the notebook's working directory.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError, and a later
# cell reads 'Data_Train.xlsx' without the 'data/' prefix — confirm which location is right.
df = pd.read_excel('data/Data_Train.xlsx')

FileNotFoundError: [Errno 2] No such file or directory: 'data/Data_Train.xlsx'

In [None]:
df.head()

## 2. EDA

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
sns.histplot(df.Price,kde=True)

In [None]:
# Pass the series by keyword: the meaning of the first positional argument of
# sns.boxplot has changed between seaborn releases, which alters how the plot is drawn.
sns.boxplot(x=df.Price)

In [None]:
df.head()

In [None]:
# Number of flights per airline, drawn on a wide (12x6) figure.
fig = plt.figure(figsize=(12,6))
plt.xticks(rotation=90)  # request vertical x tick labels
sns.countplot(x=df.Airline,)
plt.show()

In [None]:
# Average the numeric columns per airline. numeric_only=True keeps the text columns
# (dates, routes, ...) out of the mean, which pandas >= 2.0 would otherwise reject.
df.groupby(['Airline']).mean(numeric_only=True).plot(kind='bar',color='orange')

Jet Airways Business accounts for the price outliers.

In [None]:
# Same per-airline average, leaving out 'Jet Airways Business'.
# numeric_only=True keeps text columns out of the mean (required on pandas >= 2.0).
df[df['Airline'] != 'Jet Airways Business'].groupby(['Airline']).mean(numeric_only=True).plot(kind='bar',color='green')

In [None]:
df.head()

In [None]:
sns.catplot(y="Price",x="Airline",data=df.sort_values("Price",ascending=False),kind="boxen",height=6,aspect=3)


In [None]:
sns.catplot(y="Price",x="Source",data = df,kind="bar",palette="blend:#7AB,#EDA")

In [None]:
sns.catplot(y="Price",x="Destination",data = df,kind="bar",palette="blend:#7AB,#EDA")

In [None]:
sns.catplot(y="Price",x="Total_Stops",data = df,kind="bar",palette="blend:#7AB,#EDA")

In [None]:
sns.countplot(x = df.Total_Stops,palette="flare")

In [None]:
df.head()

In [None]:
# catplot is a figure-level function: it creates its own figure, so a preceding
# plt.figure() would only leave an empty figure behind. Size the plot through
# height/aspect instead (6 x 2 -> roughly 12x6 inches).
sns.catplot(y="Price",x="Additional_Info",data = df,kind="bar",palette="blend:#7AB,#EDA",height=6,aspect=2)
plt.show()


## 3. feature engineering

In [None]:
df.head()

handle date and time

In [None]:
# Parse the journey date (day/month/year text) and keep its day and month as features.
journey_date = pd.to_datetime(df["Date_of_Journey"], format="%d/%m/%Y")
df["Journey_day"] = journey_date.dt.day
df["Journey_month"] = journey_date.dt.month



In [None]:
df

In [None]:
# Split the departure time into hour and minute features.
departure = pd.to_datetime(df["Dep_Time"])
df["Dep_hour"] = departure.dt.hour
df["Dep_min"] = departure.dt.minute


In [None]:
df

In [None]:
# Split the arrival time into hour and minute features.
arrival = pd.to_datetime(df["Arrival_Time"])
df["Arr_hour"] = arrival.dt.hour
df["Arr_min"] = arrival.dt.minute

In [None]:
df

In [None]:
# Break every Duration string into whole hours and minutes. Three shapes are
# handled: hours and minutes (contains 'h' and 'm'), hours only, minutes only.
dur_hour = []
dur_min = []
for duration in df.Duration:
    has_minutes = 'm' in duration
    if 'h' in duration:
        pieces = duration.split('h')
        dur_hour.append(int(pieces[0]))
        # Text after the 'h' is the minutes part; its last character is the unit.
        dur_min.append(int(pieces[1][:-1]) if has_minutes else 0)
    else:
        dur_hour.append(0)
        dur_min.append(int(duration[:-1]))
        
        

In [None]:
df['Duration_hour'] = dur_hour
df['Duration_min'] = dur_min

In [None]:
 df

In [None]:
# Remove the raw date/time/duration columns now that features were derived from
# them, together with Route and Additional_Info.
df.drop(
    columns=[
        'Date_of_Journey',
        'Route',
        'Dep_Time',
        'Arrival_Time',
        'Duration',
        'Additional_Info',
    ],
    inplace=True,
)

In [None]:
df

In [None]:
# Prefix every value with its column name.
# NOTE(review): presumably so that the dummy columns later built from Source and
# Destination get distinct names — confirm against the category values.
df['Source'] = df.Source.apply(lambda x: "Source_" + x)
df['Destination'] = df.Destination.apply(lambda x: "Destination_" + x)

In [None]:
sns.scatterplot(x=df.Dep_hour,y=df.Price,hue=df.Airline)

In [None]:
# Correlate numeric columns only: at this point the frame still holds text columns
# (e.g. Airline, Source, Destination), which recent pandas versions refuse to correlate.
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True)

In [None]:
df

In [None]:
df['Total_Stops'] = df.Total_Stops.replace({"non-stop":0,"1 stop" : 1,"2 stops":2,"3 stops":3,"4 stops":4})

In [None]:
df

In [None]:
df.Total_Stops.unique()

In [None]:
df.isnull().sum()

In [None]:
df['Total_Stops'] = df.Total_Stops.fillna(df.Total_Stops.median())

In [None]:
df['Total_Stops'] = df.Total_Stops.apply(lambda x: int(x))

In [None]:
df

In [None]:
df = pd.concat([df,pd.get_dummies(df.Airline,drop_first=True)],axis=1)

In [None]:
df

In [None]:
df = pd.concat([df,pd.get_dummies(df.Source,drop_first=True)],axis=1)

In [None]:
df = pd.concat([df,pd.get_dummies(df.Destination,drop_first=True)],axis=1)

In [None]:
df

In [None]:
df.drop(columns=['Airline','Source','Destination'],axis=1,inplace=True)

In [None]:
df

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df.corr(),annot=True)

In [None]:
X,y = df.drop(['Price'],axis=1), df['Price']

In [None]:
X

In [None]:
y

In [None]:
sc = StandardScaler()

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [None]:
X_train = sc.fit_transform(X_train)

In [None]:
X_test = sc.transform(X_test)

## 4. model building

In [None]:
# First candidate set of regressors, all with default hyper-parameters.
# NOTE: `models` is rebound to a different dictionary in a later cell before this
# one is ever read, so these instances are never trained.
models = {
    "random_forests" : RandomForestRegressor(),
    "decision_tree" : DecisionTreeRegressor(),
    "gradient_boosting" : GradientBoostingRegressor(),
    "svr" : SVR(),
    "KNN" : KNeighborsRegressor()
}

In [None]:
def evaluate_model(true,predicted):
    """Compute regression error metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Predicted values, aligned with ``true``.

    Returns:
        A tuple ``(mae, rmse, r2_square)``: mean absolute error, root mean
        squared error and the R^2 score.
    """
    mae = mean_absolute_error(true, predicted)
    # The MSE is only needed as the input to the square root, so compute it once.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:
# Candidate regressors, all with default hyper-parameters.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor(),
}
model_list = []  # model names, in evaluation order
r2_list = []     # test-set R2 score for each entry of model_list

# Iterate name/estimator pairs directly instead of indexing freshly built
# key/value lists on every pass.
for name, model in models.items():
    model.fit(X_train, y_train)  # train model

    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # evaluate train and test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(name)
    model_list.append(name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

        

We select XGBoost and tune its hyperparameters.

In [None]:
XGBRegressor()

In [None]:
# Search space for the XGBRegressor. The keys must be the estimator's own parameter
# names; a 'model__' prefix only applies when the estimator is wrapped in a Pipeline
# whose step is called 'model', which is not the case for the search built from this.
params = {
    'n_estimators': [100, 50, 80],
    'max_depth': [3, 2, 1],
    'learning_rate': [0.05, 0.01, 0.20],
    }

In [None]:
# Randomised hyper-parameter search over `params` for a default XGBRegressor,
# running on all available cores with progress output enabled.
gs = RandomizedSearchCV(
    estimator=XGBRegressor(),
    param_distributions=params,
    n_jobs=-1,
    verbose=True,
)

In [None]:
gs.fit(X_train,y_train)

In [None]:
pre = gs.predict(X_test)

In [None]:
r2_score(y_test,pre)

In [None]:
# Baseline for comparison with the tuned search: an XGBRegressor with default
# settings, fitted on the same training split. `xg` is created here because no
# earlier cell defines it.
xg = XGBRegressor()
xg.fit(X_train, y_train)
p1 = xg.predict(X_test)

In [None]:
r2_score(y_test,p1)

In [None]:
import pickle

In [None]:
with open('model.pickle','wb') as fl:
    pickle.dump(gs,fl)

In [None]:
# Read the pickled search object back from disk.
# NOTE(review): presumably a round-trip check — `g` is not used in the visible cells.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('model.pickle','rb') as rf:
    g = pickle.load(rf)

In [None]:
X

In [None]:
df

In [None]:
test_file = pd.read_excel('Test_set.xlsx')

In [None]:
test_file

In [None]:
data = test_file.iloc[1]

In [None]:
data = pd.DataFrame(data).T

In [None]:
def _split_duration(text):
    """Return ``(hours, minutes)`` parsed from a duration such as '2h 50m', '19h' or '45m'."""
    hours_text, has_hours, rest = text.partition('h')
    if not has_hours:
        # Minutes-only value: everything but the trailing unit character is the number.
        return 0, int(text[:-1])
    minutes = int(rest[:-1]) if 'm' in rest else 0
    return int(hours_text), minutes


def process(df):
    """Turn raw flight rows into the numeric feature frame used for prediction.

    The input must provide the raw columns ``Airline``, ``Date_of_Journey``,
    ``Source``, ``Destination``, ``Route``, ``Dep_Time``, ``Arrival_Time``,
    ``Duration``, ``Total_Stops`` and ``Additional_Info``. The caller's frame
    is left untouched; a new frame is returned.

    The categorical columns are one-hot encoded and the result is aligned with
    the header of ``test_samp.csv``: template columns that the input does not
    produce are added as 0, and dummy columns the template does not know (the
    dropped baseline category or an unseen category) are discarded.
    NOTE(review): this assumes the header of ``test_samp.csv`` lists the
    feature columns the model was trained on — confirm.

    Args:
        df: DataFrame of raw flight records (one or more rows).

    Returns:
        A new DataFrame holding the engineered features. Columns named in
        ``test_samp.csv`` appear in that file's order; any other remaining
        input columns come first, in their original order.

    Raises:
        KeyError: If one of the required raw columns is missing.
        ValueError: If a duration or stop count cannot be converted to a number.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()

    # Each datetime column is parsed once and reused for both derived features.
    journey_date = pd.to_datetime(df["Date_of_Journey"], format="%d/%m/%Y")
    df["Journey_day"] = journey_date.dt.day
    df["Journey_month"] = journey_date.dt.month

    departure = pd.to_datetime(df["Dep_Time"])
    df["Dep_hour"] = departure.dt.hour
    df["Dep_min"] = departure.dt.minute

    arrival = pd.to_datetime(df["Arrival_Time"])
    df["Arr_hour"] = arrival.dt.hour
    df["Arr_min"] = arrival.dt.minute

    durations = [_split_duration(text) for text in df["Duration"]]
    df["Duration_hour"] = [hours for hours, _ in durations]
    df["Duration_min"] = [minutes for _, minutes in durations]

    # Translate the stop labels to counts; values outside the mapping are passed
    # through unchanged so that unexpected labels fail loudly in to_numeric.
    stop_counts = {"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}
    stops = pd.to_numeric(df["Total_Stops"].map(lambda value: stop_counts.get(value, value)))
    df["Total_Stops"] = stops.fillna(stops.median()).astype(int)

    # One-hot encode WITHOUT drop_first: with a single input row drop_first would
    # discard the only category present and lose the information entirely. The
    # baseline categories are removed below by matching against the template.
    dummies = pd.concat(
        [
            pd.get_dummies(df["Airline"]),
            pd.get_dummies(df["Source"], prefix="Source"),
            pd.get_dummies(df["Destination"], prefix="Destination"),
        ],
        axis=1,
    ).astype(int)

    # Only the header of the template file is needed.
    template_columns = list(pd.read_csv('test_samp.csv', nrows=0).columns)
    known = set(template_columns)
    dummies = dummies.loc[:, [name for name in dummies.columns if name in known]]

    features = pd.concat(
        [
            df.drop(columns=['Date_of_Journey', 'Route', 'Dep_Time', 'Arrival_Time',
                             'Duration', 'Additional_Info',
                             'Airline', 'Source', 'Destination']),
            dummies,
        ],
        axis=1,
    )

    # Add the template columns that are still missing as 0 — never overwrite
    # values that were just computed — and put template columns in template order.
    leading = [name for name in features.columns if name not in known]
    return features.reindex(columns=leading + template_columns, fill_value=0)

In [None]:
ndf = pd.read_excel('Data_Train.xlsx')

In [None]:
data = ndf.iloc[1]

In [None]:
price = data.Price

In [None]:
data.drop(['Price'],inplace=True)

In [None]:
fd = process(pd.DataFrame(data).T)

In [None]:
p = sc.transform(fd)

In [None]:
gs.predict(p)

In [None]:
price

In [None]:
with open('scaler.pickle','wb') as file:
    pickle.dump(sc,file)