In [99]:
import pandas as pd
import numpy as np

df = pd.read_csv('Clean_Dataset.csv')
df.drop(df.columns[0],axis=1,inplace=True)

df.head()


Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [100]:
df["class"].unique()

array(['Economy', 'Business'], dtype=object)

<h1>Data Pre-Processing</h1>

In [101]:
# check missing values
df.isna().sum()

# check categorical data

df['source_city'].unique()
df['arrival_time'].unique()
df['destination_city'].unique()
df['class'].unique()
df['stops'].unique()
df['airline'].unique()
df['departure_time'].unique()


# update dataframe 

df.drop('flight',axis=1,inplace=True)
df.drop('days_left',axis=1,inplace=True)
df.head()

# one hot encoding
df = pd.get_dummies(df,columns=["source_city","arrival_time","destination_city","class","stops","airline",'departure_time'])
df.head()

# convert price from indian Rupees to Euro
INREUR = 0.013
df['price'] = df['price'].apply(lambda x : x*0.013)

df.head()


Unnamed: 0,duration,price,source_city_Bangalore,source_city_Chennai,source_city_Delhi,source_city_Hyderabad,source_city_Kolkata,source_city_Mumbai,arrival_time_Afternoon,arrival_time_Early_Morning,...,airline_GO_FIRST,airline_Indigo,airline_SpiceJet,airline_Vistara,departure_time_Afternoon,departure_time_Early_Morning,departure_time_Evening,departure_time_Late_Night,departure_time_Morning,departure_time_Night
0,2.17,77.389,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,2.33,77.389,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
2,2.17,77.428,0,0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,2.25,77.415,0,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
4,2.33,77.415,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


<h1>Machine Learning</h1>

In [104]:
# features and target 

x = df.drop('price',axis=1)
y = df['price']

x.dtypes

duration                        float64
source_city_Bangalore             uint8
source_city_Chennai               uint8
source_city_Delhi                 uint8
source_city_Hyderabad             uint8
source_city_Kolkata               uint8
source_city_Mumbai                uint8
arrival_time_Afternoon            uint8
arrival_time_Early_Morning        uint8
arrival_time_Evening              uint8
arrival_time_Late_Night           uint8
arrival_time_Morning              uint8
arrival_time_Night                uint8
destination_city_Bangalore        uint8
destination_city_Chennai          uint8
destination_city_Delhi            uint8
destination_city_Hyderabad        uint8
destination_city_Kolkata          uint8
destination_city_Mumbai           uint8
class_Business                    uint8
class_Economy                     uint8
stops_one                         uint8
stops_two_or_more                 uint8
stops_zero                        uint8
airline_AirAsia                   uint8


In [10]:
# train test split 
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

x_train.shape


(240122, 36)

<h2> Modelisation </h2>

In [19]:
# import ML libraries

from sklearn.svm import LinearSVR
from sklearn import linear_model

# model training

svr = LinearSVR()
ridge_reg = linear_model.Ridge()

svr.fit(x_train,y_train)
ridge_reg.fit(x_train,y_train)




In [20]:
# model evaluation
from sklearn.metrics import mean_squared_error


print(svr.score(x_test,y_test))
print(ridge_reg.score(x_test,y_test))

svr_pred = svr.predict(x_test)
rid_reg_pred = ridge_reg.predict(x_test)

print(mean_squared_error(y_test,svr_pred,squared=False))
print(mean_squared_error(y_test,rid_reg_pred,squared=False))

# conclusion : ridge regression plus intéréssant


0.8940418081324043
0.9053921442081219
96.09020280618066
90.79782397557423


In [30]:
# model Dump
import pickle

pickle.dump(svr,open("svr_model.sav",'wb'))
pickle.dump(svr,open("ridge_reg_model.sav",'wb'))

NameError: name 'svr' is not defined

In [106]:
import pickle

data = {
    
    "airline": "Indigo",
    "arrival_time": "Afternoon",
    "class": "Business",
    "departure_time": "Morning",
    "destination_city": "Bangalore",
    "duration": "1.5",
    "source_city": "Kolkata",
    "stops": "one"
}

feature_df = pd.json_normalize(data)

feature_df = pd.get_dummies(feature_df,columns=["source_city","arrival_time","destination_city","class","stops","airline",'departure_time'])
model = pickle.load(open('ridge_reg_model.sav','rb'))

empty_df = pd.DataFrame(columns=model.feature_names_in_)
feature_df = pd.concat([empty_df,feature_df])
feature_df.fillna(0,inplace=True)
feature_df.head()

feature_df.dtypes

duration                        object
source_city_Bangalore            int64
source_city_Chennai              int64
source_city_Delhi                int64
source_city_Hyderabad            int64
source_city_Kolkata              int64
source_city_Mumbai               int64
arrival_time_Afternoon           int64
arrival_time_Early_Morning       int64
arrival_time_Evening             int64
arrival_time_Late_Night          int64
arrival_time_Morning             int64
arrival_time_Night               int64
destination_city_Bangalore       int64
destination_city_Chennai         int64
destination_city_Delhi           int64
destination_city_Hyderabad       int64
destination_city_Kolkata         int64
destination_city_Mumbai          int64
class_Business                   int64
class_Economy                    int64
stops_one                        int64
stops_two_or_more                int64
stops_zero                       int64
airline_AirAsia                  int64
airline_Air_India        