In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("Flight_Price.csv")

In [4]:
df.drop(columns="Unnamed: 0", inplace = True)
df

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955
...,...,...,...,...,...,...,...,...,...,...,...
300148,Vistara,UK-822,Chennai,Morning,one,Evening,Hyderabad,Business,10.08,49,69265
300149,Vistara,UK-826,Chennai,Afternoon,one,Night,Hyderabad,Business,10.42,49,77105
300150,Vistara,UK-832,Chennai,Early_Morning,one,Night,Hyderabad,Business,13.83,49,79099
300151,Vistara,UK-828,Chennai,Early_Morning,one,Evening,Hyderabad,Business,10.00,49,81585


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300153 non-null  object 
 1   flight            300153 non-null  object 
 2   source_city       300153 non-null  object 
 3   departure_time    300153 non-null  object 
 4   stops             300153 non-null  object 
 5   arrival_time      300153 non-null  object 
 6   destination_city  300153 non-null  object 
 7   class             300153 non-null  object 
 8   duration          300153 non-null  float64
 9   days_left         300153 non-null  int64  
 10  price             300153 non-null  int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 25.2+ MB


In [6]:
df.describe()

Unnamed: 0,duration,days_left,price
count,300153.0,300153.0,300153.0
mean,12.221021,26.004751,20889.660523
std,7.191997,13.561004,22697.767366
min,0.83,1.0,1105.0
25%,6.83,15.0,4783.0
50%,11.25,26.0,7425.0
75%,16.17,38.0,42521.0
max,49.83,49.0,123071.0


In [7]:
df.describe(include = "O")

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class
count,300153,300153,300153,300153,300153,300153,300153,300153
unique,6,1561,6,6,3,6,6,2
top,Vistara,UK-706,Delhi,Morning,one,Night,Mumbai,Economy
freq,127859,3235,61343,71146,250863,91538,59097,206666


In [8]:
df.columns

Index(['airline', 'flight', 'source_city', 'departure_time', 'stops',
       'arrival_time', 'destination_city', 'class', 'duration', 'days_left',
       'price'],
      dtype='object')

# Plotting

In [9]:
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
df_air = df.groupby("airline").mean().reset_index()

In [11]:
df_air

Unnamed: 0,airline,duration,days_left,price
0,AirAsia,8.941714,27.735184,4091.072742
1,Air_India,15.504235,25.497466,23507.019112
2,GO_FIRST,8.75538,27.430415,5652.007595
3,Indigo,5.795197,26.264309,5324.216303
4,SpiceJet,12.579767,24.12285,6179.278881
5,Vistara,13.326634,25.894532,30396.536302


# Supervised Machine Learning

In [12]:
target = "price"
X = df.drop(columns = ["price","flight"])
y = df[target]

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

In [31]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV

In [32]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score


In [45]:
classifiers = [LinearRegression(),
               DecisionTreeRegressor(),
               RandomForestRegressor()]
#              SVR(),
#              KNeighborsRegressor() 


num_features = ["duration", "days_left"]
num_transformer = make_pipeline(StandardScaler())

cat_features = ["airline","source_city","departure_time","stops","arrival_time","destination_city","class"]
cat_transformer = make_pipeline(OneHotEncoder(drop = "first"))

preprocessing = make_column_transformer(
                                        (num_transformer, num_features),
                                        (cat_transformer, cat_features)
                                        )

In [48]:
for clf in classifiers:
        pipe = make_pipeline(preprocessing, clf)
        grid = GridSearchCV(
                        estimator = pipe,
                        cv = 5,
                        scoring = "r2",
                        param_grid = {})
        grid.fit(X_train, y_train)
        score = grid.best_score_
        print(f"{clf} got r2 of: {score}\n")

LinearRegression() got r2 of: 0.9114058472606399

DecisionTreeRegressor() got r2 of: 0.9755195724364472

RandomForestRegressor() got r2 of: 0.9848584066727399



In [53]:
param_grid_rf = {
                "randomforestregressor__n_estimators": [75],
                "randomforestregressor__max_depth": [None,2,5,10,20,50],
                "randomforestregressor__min_samples_leaf": [1,2,3,4]
                    }

num_transformer = make_pipeline(StandardScaler())
cat_transformer = make_pipeline(OneHotEncoder(drop = "first"))
    
    
preprocessor = make_column_transformer(
                                (num_transformer, num_features),
                                (cat_transformer, cat_features))



classifier = RandomForestRegressor()
pipe = make_pipeline(preprocessor, classifier)
grid_rf = GridSearchCV(
                    estimator = pipe,
                    param_grid= param_grid_rf,
                    cv = 5,
                    scoring= "r2"
                      )
grid_rf.fit(X_train, y_train)

print("Best score: " , grid_rf.best_score_)
print("Best param: " , grid_rf.best_params_)

KeyboardInterrupt: 

In [59]:
classifier = RandomForestRegressor()
pipe = make_pipeline(preprocessor, classifier)
grid_rf = GridSearchCV(
                    estimator = pipe,
                    param_grid= {},
                    cv = 5,
                    scoring= "r2"
                      )
grid_rf.fit(X_test, y_test)

print("Best score: " , grid_rf.best_score_)
print("Best param: " , grid_rf.best_params_)

Best score:  0.9814977554302985
Best param:  {}


In [61]:
model = make_pipeline(preprocessing, RandomForestRegressor())
model.fit(X,y)

In [None]:
from joblib import dump

In [None]:
dump(model, "india_flight_price_model.joblib")