In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv('Clean_Dataset.csv', index_col=0)

In [3]:
data.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300153 entries, 0 to 300152
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300153 non-null  object 
 1   flight            300153 non-null  object 
 2   source_city       300153 non-null  object 
 3   departure_time    300153 non-null  object 
 4   stops             300153 non-null  object 
 5   arrival_time      300153 non-null  object 
 6   destination_city  300153 non-null  object 
 7   class             300153 non-null  object 
 8   duration          300153 non-null  float64
 9   days_left         300153 non-null  int64  
 10  price             300153 non-null  int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 27.5+ MB


In [5]:
data.describe()

Unnamed: 0,duration,days_left,price
count,300153.0,300153.0,300153.0
mean,12.221021,26.004751,20889.660523
std,7.191997,13.561004,22697.767366
min,0.83,1.0,1105.0
25%,6.83,15.0,4783.0
50%,11.25,26.0,7425.0
75%,16.17,38.0,42521.0
max,49.83,49.0,123071.0


In [6]:
# remove rows with business class
economy = data[data['class'] != 'Business'].copy(deep=True)
economy['class'].unique()

array(['Economy'], dtype=object)

In [7]:
economy.shape

(206666, 11)

In [8]:
def map_stops(x):
    if x == 'zero':
        return 0
    elif x == 'one':
        return 1
    else:
        return 2

In [9]:
economy['stops'] = economy['stops'].apply(map_stops)

In [10]:
# remove rows with 2 or more stops
zero_or_one_stop = economy[economy['stops'] <= 1].copy(deep=True)

In [11]:
zero_or_one_stop['stops'].unique()

array([0, 1])

In [12]:
zero_or_one_stop.shape

(194463, 11)

In [13]:
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] > lower_bound) & (df[column] < upper_bound)].copy(deep=True)

In [14]:
no_outliers = remove_outliers(zero_or_one_stop, 'price')
no_outliers = remove_outliers(no_outliers, 'duration')

In [15]:
no_outliers.shape

(179252, 11)

In [16]:
no_outliers.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,0,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,0,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,0,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,0,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,0,Morning,Mumbai,Economy,2.33,1,5955


In [17]:
trimmed = no_outliers.drop(columns=["airline", "flight", "class"]).copy(deep=True)

In [18]:
trimmed.head()

Unnamed: 0,source_city,departure_time,stops,arrival_time,destination_city,duration,days_left,price
0,Delhi,Evening,0,Night,Mumbai,2.17,1,5953
1,Delhi,Early_Morning,0,Morning,Mumbai,2.33,1,5953
2,Delhi,Early_Morning,0,Early_Morning,Mumbai,2.17,1,5956
3,Delhi,Morning,0,Afternoon,Mumbai,2.25,1,5955
4,Delhi,Morning,0,Morning,Mumbai,2.33,1,5955


In [19]:
trimmed['stops'] = trimmed['stops'].apply(lambda x: True if x == 0 else False)
trimmed.rename(columns={'stops': 'is_direct'}, inplace=True)

In [20]:
trimmed['duration'] = trimmed['duration'].apply(lambda x: int(x * 60))

In [21]:
trimmed.head()

Unnamed: 0,source_city,departure_time,is_direct,arrival_time,destination_city,duration,days_left,price
0,Delhi,Evening,True,Night,Mumbai,130,1,5953
1,Delhi,Early_Morning,True,Morning,Mumbai,139,1,5953
2,Delhi,Early_Morning,True,Early_Morning,Mumbai,130,1,5956
3,Delhi,Morning,True,Afternoon,Mumbai,135,1,5955
4,Delhi,Morning,True,Morning,Mumbai,139,1,5955


In [22]:
trimmed['source_city'].unique(), trimmed['departure_time'].unique(), trimmed['days_left'].unique()

(array(['Delhi', 'Mumbai', 'Bangalore', 'Kolkata', 'Hyderabad', 'Chennai'],
       dtype=object),
 array(['Evening', 'Early_Morning', 'Morning', 'Afternoon', 'Night',
        'Late_Night'], dtype=object),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
        35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]))

In [23]:
dummies = pd.get_dummies(trimmed)
dummies.head()

Unnamed: 0,is_direct,duration,days_left,price,source_city_Bangalore,source_city_Chennai,source_city_Delhi,source_city_Hyderabad,source_city_Kolkata,source_city_Mumbai,...,arrival_time_Evening,arrival_time_Late_Night,arrival_time_Morning,arrival_time_Night,destination_city_Bangalore,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai
0,True,130,1,5953,False,False,True,False,False,False,...,False,False,False,True,False,False,False,False,False,True
1,True,139,1,5953,False,False,True,False,False,False,...,False,False,True,False,False,False,False,False,False,True
2,True,130,1,5956,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,True,135,1,5955,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,True,139,1,5955,False,False,True,False,False,False,...,False,False,True,False,False,False,False,False,False,True


In [24]:
dummies.columns

Index(['is_direct', 'duration', 'days_left', 'price', 'source_city_Bangalore',
       'source_city_Chennai', 'source_city_Delhi', 'source_city_Hyderabad',
       'source_city_Kolkata', 'source_city_Mumbai', 'departure_time_Afternoon',
       'departure_time_Early_Morning', 'departure_time_Evening',
       'departure_time_Late_Night', 'departure_time_Morning',
       'departure_time_Night', 'arrival_time_Afternoon',
       'arrival_time_Early_Morning', 'arrival_time_Evening',
       'arrival_time_Late_Night', 'arrival_time_Morning', 'arrival_time_Night',
       'destination_city_Bangalore', 'destination_city_Chennai',
       'destination_city_Delhi', 'destination_city_Hyderabad',
       'destination_city_Kolkata', 'destination_city_Mumbai'],
      dtype='object')

In [25]:
X = dummies.drop(columns="price")
y = dummies["price"]

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [27]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
y_train = scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test = scaler.transform(y_test.values.reshape(-1, 1))

In [28]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [29]:
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

In [30]:
y_pred = dt.predict(X_test)

In [31]:
mean_squared_error(y_test, y_pred)

np.float64(0.27390051740354954)

In [32]:
rf = RandomForestRegressor(n_estimators=200, n_jobs=-1)

In [33]:
y_train = y_train.ravel()
y_test = y_test.ravel()
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
mean_squared_error(y_test, y_pred)

np.float64(0.16972111525618286)

In [None]:
import joblib


joblib.dump(rf, "random_forest.pkl")
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']