In [None]:
# Importing libraries

: 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy as sp
import matplotlib.pyplot as plt
import scipy.stats as stats
%matplotlib inline
import warnings
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import mutual_info_regression
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

: 

In [None]:
# Reading the dataset

: 

In [None]:
# Load the flight-price dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Clean_Dataset.csv')

# Preview the first rows to check that the file was parsed as expected.
df.head()

: 

In [None]:
# The 'Unnamed: 0' column is of no use for the analysis, so it is dropped.
# errors='ignore' keeps this cell safe to re-run: df is reassigned here, so on a second
# execution the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head()


: 

In [None]:
df.shape

: 

In the data, there are 300153 observations with 11 features.



In [None]:
df.info()

: 

In [None]:
df.isna().sum()

: 

In [None]:
# There is no missing value in the data.

: 

In [None]:
# Collect every column whose dtype is not object ('O'); these are treated as numeric.
numerical_features = []
for column_name, column_dtype in df.dtypes.items():
    if column_dtype != 'O':
        numerical_features.append(column_name)
df[numerical_features].head()

: 

In [None]:
# There are 3 numeric features in the data: duration, days_left and price.

: 

In [None]:
# Collect every column stored with object dtype ('O'); these are treated as categorical.
categorical_features = []
for column_name, column_dtype in df.dtypes.items():
    if column_dtype == 'O':
        categorical_features.append(column_name)
df[categorical_features].head()

: 

In [None]:
# There are 8 categorical features in the data: airline, flight, source_city, departure_time, stops, arrival_time, destination_city, class

: 

## Statistical interpretation of data

In [None]:
df.describe()

: 

In [None]:
print (df['price'].median())

: 

In [None]:
# The median ticket price is 7425.0.

: 

In [None]:
# Overall price distribution: box plot (left) and histogram with KDE curve (right).
plt.style.use('fivethirtyeight')
plt.figure(figsize = (15,5))

# Left panel: box plot of all ticket prices.
plt.subplot(1,2,1)
sns.boxplot(x = 'price', data = df, color="purple")
plt.title('Price Ranges',fontsize=16)
plt.xlabel('Price',fontsize=12)

# Right panel: histogram of all ticket prices with a kernel density estimate overlaid.
plt.subplot(1,2,2)
sns.histplot(x = 'price', data = df, kde = True, color="purple")
plt.title('Histogram',fontsize=16)
plt.xlabel('Price',fontsize=12)
plt.ylabel('The Number of Flights',fontsize=12)
plt.show()

: 

In [None]:
# The mean price is 20890, while the median is 7425.
# The histogram shows a right-skewed distribution. The majority of prices in the data are between 1000 and 20000, but the tail of the distribution extends far beyond this range.
# This can be related to the price difference between economy and business class.

: 

## Variation of ticket price between Economy and Business class

In [None]:
# To understand the price difference between economy and business class, the price histograms for economy and business need to be plotted separately.

: 

In [None]:
# Split the flights by travel class so each class can be summarised on its own.
is_economy = df["class"] == 'Economy'
is_business = df["class"] == 'Business'
df1 = df.loc[is_economy]    # economy-class flights
df2 = df.loc[is_business]   # business-class flights

: 

In [None]:
print (df1['price'].median())
print (df2['price'].median())

: 

In [None]:
print (df1['price'].mean())
print (df2['price'].mean())

: 

In [None]:
# Median and mean price of economy class is 5772.0 and 6572.34 respectively.
# Median and mean price of business class is 53164.0 and 52540.08 respectively.  

: 

In [None]:
# 2x2 grid: box plots in the left column, histograms in the right column;
# Economy in the top row, Business in the bottom row.
plt.style.use('fivethirtyeight')
plt.figure(figsize = (15,10))
plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.4, hspace=0.4)

# Filter once per class and reuse the result for both plots of that class.
economy_flights = df.loc[df["class"]=='Economy']
business_flights = df.loc[df["class"]=='Business']

# Top-left: Economy box plot.
plt.subplot(2,2,1)
sns.boxplot(x='price', y='class', data=economy_flights, color="steelblue")
plt.title('Price Range of Economy Class',fontsize=16)
plt.xlabel('Price',fontsize=12)

# Bottom-left: Business box plot.
plt.subplot(2,2,3)
sns.boxplot(x='price', y='class', data=business_flights, color="firebrick")
plt.title('Price Range of Business Class',fontsize=16)
plt.xlabel('Price',fontsize=12)

# Top-right: Economy histogram with KDE.
plt.subplot(2,2,2)
sns.histplot(data=economy_flights, x='price', hue='class', kde=True)
plt.title('Histogram of Economy Class',fontsize=16)
plt.xlabel('Price',fontsize=12)
plt.ylabel('The Number of Flights',fontsize=12)

# Bottom-right: Business histogram with KDE.
plt.subplot(2,2,4)
sns.histplot(data=business_flights, x='price', hue='class', kde=True, palette='rocket')
plt.title('Histogram of Business Class',fontsize=16)
plt.xlabel('Price',fontsize=12)
plt.ylabel('The Number of Flights',fontsize=12)
plt.show()

: 

In [None]:
# Most of the prices in economy class are between 1000 and 8000. The histogram does not show a skewed distribution.
# Most of the prices in business class are between 12000 and 60000. The histogram does not show a skewed distribution.

: 

In [None]:
# It can be concluded that ticket prices vary between classes, as the mean price of Business Class tickets is almost 8 times the mean price of Economy Class tickets.

: 

## Variation of ticket prices between Airlines

In [None]:
# Price distribution per airline, all classes together.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(15,5))
sns.boxplot(data=df, x='airline', y='price', palette='husl')
plt.title('Price Ranges by Airlines',fontsize=16)
plt.xlabel('Airline',fontsize=12)
plt.ylabel('Price',fontsize=12)
plt.show()

: 

In [None]:
# Vistara and Air India have a wider price range, and their tickets are more expensive than those of the other airlines.
# Since Economy and Business class ticket prices differ considerably, this can be related to the mix of economy and business tickets in each airline's flight options.

: 

In [None]:
# Price distribution per airline, split by class: Economy (left) and Business (right).
plt.style.use('fivethirtyeight')
plt.figure(figsize = (15,5))

# Left panel: only Economy rows.
plt.subplot(1,2,1)
sns.boxplot(x = 'airline', y='price', data = df.loc[df["class"]=='Economy'], palette='husl')
plt.title('Price Ranges in Economy Class by Airlines',fontsize=16)
plt.xlabel('Airline',fontsize=12)
plt.ylabel('Price',fontsize=12)

# Right panel: only Business rows.
plt.subplot(1,2,2)
sns.boxplot(x = 'airline', y='price', data = df.loc[df["class"]=='Business'], palette='Set2')
plt.title('Price Ranges in Business Class by Airlines',fontsize=16)
plt.xlabel('Airline',fontsize=12)
plt.ylabel('Price',fontsize=12)
plt.show()

: 

In [None]:
# While Economy Class tickets are offered by all 6 airlines, Business Class tickets are available from only two airlines: Air India and Vistara.
# In Economy Class, the graph shows that AirAsia offers the cheapest tickets, while Vistara and Air India are more expensive than the other airlines.
# In Business Class, the graph shows that Vistara's prices are mostly higher than Air India's.

: 

## Flight duration and its effect on ticket prices

In [None]:
# Mean ticket price for each distinct flight duration.
dfduration = df.groupby(['duration'])['price'].mean().reset_index()
plt.style.use('fivethirtyeight')
plt.figure(figsize=(15,5))
sns.scatterplot(data=dfduration, x="duration", y="price")
# order=2 fits a second-order polynomial instead of a straight line.
# NOTE(review): regplot draws its own scatter points by default, so the points above
# are presumably drawn twice on the same axes — confirm whether that is intended.
sns.regplot(data=dfduration, x="duration", y="price", order=2)
plt.title('Ticket Price vs. Flight Duration',fontsize=16)
plt.xlabel('Duration',fontsize=12)
plt.ylabel('Price',fontsize=12)
plt.show()

: 

In [None]:
# The regression curve shows the relationship between duration and ticket price. According to the graph, ticket price increases as the flight duration increases till 20 hours. After 20 hours, the ticket price decreases as the flight duration increases.

: 

## Effect of stops on ticket prices

In [None]:
# Price by airline and number of stops: Economy (top panel) and Business (bottom panel).
plt.style.use('fivethirtyeight')
plt.figure(figsize = (15,10))

# Top panel: Economy rows, one box per (airline, stops) combination.
plt.subplot(2,1,1)
sns.boxplot(x="airline", y="price",
            hue="stops", palette='husl',
            data=df.loc[df["class"]=='Economy'])
sns.despine(offset=10, trim=True)
plt.title('Price Ranges by the Number of Stops by Airlines in Economy Class',fontsize=16)
plt.xlabel('Airline',fontsize=12)
plt.ylabel('Price',fontsize=12)

# Bottom panel: the same plot for Business rows.
plt.subplot(2,1,2)
sns.boxplot(x="airline", y="price",
            hue="stops", palette='husl',
            data=df.loc[df["class"]=='Business'])
sns.despine(offset=10, trim=True)
plt.title('Price Ranges by the Number of Stops by Airlines in Business Class',fontsize=16)
plt.xlabel('Airline',fontsize=12)
plt.ylabel('Price',fontsize=12)

# Add padding so the titles and axis labels of the two panels do not overlap.
plt.tight_layout(pad=2.0)

: 

In [None]:
# In Economy Class, the graph shows that the more stops there are, the higher the ticket price is, for all airlines except Air Asia. Air Asia's price ranges look similar for different numbers of stops. For this reason, Air Asia may be a low-cost airline.
# In Business Class, the graph shows that the more stops there are, the higher the ticket price is, for both airlines.

# For both classes, we can say that the number of stops affects the ticket price.

: 

## Effect of remaining days in departure on ticket prices

In [None]:
# Mean ticket price for each value of days_left, plotted as a scatter.
dfdaysleft = df.groupby('days_left', as_index=False)['price'].mean()
plt.style.use('fivethirtyeight')
plt.figure(figsize=(15,5))
sns.scatterplot(x="days_left", y="price", data=dfdaysleft)
plt.title('Ticket Price vs. Days Left for Departure',fontsize=16)
plt.xlabel('Days Left for Departure',fontsize=12)
plt.ylabel('Price',fontsize=12)
plt.show()

: 

In [None]:
# Same data as the scatter above, split into three booking windows that share one axes.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(15,5))
ax = plt.axes()

days = dfdaysleft["days_left"]
last_day = dfdaysleft.loc[days==1]                    # exactly 1 day left
short_notice = dfdaysleft.loc[(days>1)&(days<20)]     # 2 to 19 days left
far_ahead = dfdaysleft.loc[days>=20]                  # 20 or more days left

# The single 1-day point is drawn without a fit; the other two windows each get a regression line.
sns.regplot(x=last_day.days_left, y=last_day.price, fit_reg=False, ax=ax)
sns.regplot(x=short_notice.days_left, y=short_notice.price, fit_reg=True, ax=ax)
sns.regplot(x=far_ahead.days_left, y=far_ahead.price, fit_reg=True, ax=ax)
plt.title('Ticket Price vs. Days Left for Departure',fontsize=16)
plt.xlabel('Days Left for Departure',fontsize=12)
plt.ylabel('Price',fontsize=12)
plt.show()

: 

In [None]:
# Mean ticket price for each (days_left, airline) pair.
dfdaysleft2 = df.groupby(['days_left','airline'])['price'].mean().reset_index()
plt.figure(figsize=(15,5))
# Points are coloured per airline through hue/palette.
# NOTE(review): color='magenta' presumably has no visible effect once hue is set — confirm.
sns.scatterplot(data=dfdaysleft2,x='days_left',y='price',color='magenta',hue='airline',palette='husl')
plt.title('Days Left For Departure Vs. Ticket Price of each Airline',fontsize=16)
plt.legend(fontsize=8, loc='upper right')
plt.xlabel('Days Left for Departure',fontsize=12)
plt.ylabel('Price',fontsize=12)
plt.show()

: 

1. The scatter plots show the relationship between the days left until departure and the ticket price. There are two patterns in the plots. The first is a stable ticket price between 20 and 50 days before departure. The second is a negative relationship between days left and ticket price between 2 and 20 days before departure. For this reason, we can say that the number of days left affects the ticket price when there are fewer than 20 days until departure.


2. Additionally, the ticket prices of Vistara and Air India drop one day before the flight, whereas the ticket prices of the other airlines keep increasing as the number of days left decreases to one. This reverse pattern can be related to the fact that Vistara and Air India offer business class tickets, while the other airlines offer only economy class tickets. The way demand and fill rate change from two days left to one day left may differ between business and economy class tickets.

**Do the departure time and arrival time affect ticket prices?**

In [None]:
# Price distribution by departure time slot (top panel) and arrival time slot (bottom panel).
plt.style.use('fivethirtyeight')
plt.figure(figsize=(15,10))
# Top panel: departure time.
plt.subplot(2,1,1)
sns.boxplot(x='departure_time',y='price',data=df,palette='husl')
plt.title('Price Ranges by Departure Time',fontsize=16)
plt.xlabel('Departure Time',fontsize=12)
plt.ylabel('Price',fontsize=12)
# Bottom panel: arrival time.
plt.subplot(2,1,2)
sns.boxplot(x='arrival_time',y='price',data=df,palette='husl')
plt.title('Price Ranges by Arrival Time',fontsize=16)
plt.xlabel('Arrival Time',fontsize=12)
plt.ylabel('Price',fontsize=12)
# Add padding so the two panels' titles and labels do not overlap.
plt.tight_layout(pad=2.0)
plt.show()

: 

Flights leaving or arriving at late night have the cheaper prices compared to other times. In addition to late night, flights arriving in the early morning or leaving in the afternoon are cheaper than other options. On the other hand, flights leaving at night have the expensive prices compared to other times.

**Do the Source City and Destination City affect ticket prices?**

In [None]:
# Price distribution by source city (top panel) and destination city (bottom panel).
plt.style.use('fivethirtyeight')
plt.figure(figsize=(15,10))
# Top panel: source city.
plt.subplot(2,1,1)
sns.boxplot(x='source_city',y='price',data=df,palette='husl')
plt.title('Price Ranges by the Source City',fontsize=16)
plt.xlabel('Source City',fontsize=12)
plt.ylabel('Price',fontsize=12)
# Bottom panel: destination city.
plt.subplot(2,1,2)
sns.boxplot(x='destination_city',y='price',data=df,palette='husl')
plt.title('Price Ranges by the Destination City',fontsize=16)
plt.xlabel('Destination City',fontsize=12)
plt.ylabel('Price',fontsize=12)
# Add padding so the two panels' titles and labels do not overlap.
plt.tight_layout(pad=2.0)
plt.show()

: 

In [None]:
# One panel per source city, showing the price for each destination city.
plt.style.use('fivethirtyeight')
# relplot is a figure-level function: it creates its own figure and returns a FacetGrid.
# No plt.figure() call is made here, because it would only leave an extra empty figure behind.
grid = sns.relplot(col="source_city", y="price", kind="line", x='destination_city', data=df, col_wrap=3, style='source_city')
# Leave room above the panels for the overall title.
grid.fig.subplots_adjust(top=0.9)
grid.fig.suptitle('Ticket Prices by Source City and Destination City',fontsize=16)
plt.show()

: 

Flights leaving from or arriving at Delhi have the cheaper prices compared to other source and destination cities. On the other hand, other source and destination cities have similar price ranges.

**Which features affect ticket prices the most?**

Firstly, I will conduct correlation analysis to understand linear relationship between features and price. Since there are 8 categoric variables in the data, I will encode them via dummy variables.

In [None]:
df_bk=df.copy()

: 

In [None]:
def preprocessing(df):
    """Return an all-numeric copy of the flight table for correlation / mutual-information analysis.

    The input frame is left untouched; all encoding is done on a copy.

    Steps:
      * ``stops`` is mapped to 0 / 1 / 2 ('zero', 'one', 'two_or_more').
      * ``class`` is mapped to 0 (Economy) / 1 (Business).
      * airline, source/destination city and departure/arrival time are one-hot
        encoded with the first level of each dropped.
      * ``flight`` and the original one-hot source columns are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed above with their raw string labels.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded columns; every other column is carried over as is.

    Raises
    ------
    ValueError
        If ``stops`` or ``class`` holds a label outside the mappings above
        (the unmapped value becomes NaN and cannot be cast to int).
    """
    # Work on a copy: assigning into the caller's frame would silently re-encode the
    # notebook-level df that later cells (plots, backups, model input) still rely on.
    df = df.copy()
    df["stops"] = df["stops"].map({'zero':0,'one':1,'two_or_more':2}).astype(int)
    df["class"] = df["class"].map({'Economy':0,'Business':1}).astype(int)
    dummies_variables = ["airline","source_city","destination_city","departure_time","arrival_time"]
    # drop_first=True removes one level per feature so the dummies are not perfectly collinear.
    dummies = pd.get_dummies(df[dummies_variables], drop_first= True)
    df = pd.concat([df,dummies],axis=1)
    df = df.drop(["flight","airline","source_city","destination_city","departure_time","arrival_time"],axis=1)
    return df

: 

In [None]:
df_preprocessed = preprocessing(df)

: 

In [None]:
df_preprocessed.head()

: 

**Correlation Analysis**

In [None]:
df_preprocessed.corr().T

: 

In [None]:
# Correlation matrix of all encoded features, shown as a lower-triangle heatmap.
corrMatrix = df_preprocessed.corr()
# The matrix is symmetric, so the upper triangle (diagonal included) is masked out.
mask = np.triu(np.ones_like(corrMatrix, dtype=bool))
plt.figure(figsize = (20,20))
sns.heatmap(corrMatrix, annot=True, mask=mask)
plt.title('Heatmap of Correlation Matrix',fontsize=16)
plt.show()

: 

According to the correlation matrix, there is a strong correlation between Class and Price. The same conclusion follows from the per-class price range analysis above. In order to understand the predictive power of the other features on ticket price, I will also compute mutual information scores.

In [None]:
def make_mi_scores(X, y):
    """Compute mutual-information scores between each feature in ``X`` and the target ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Object / category columns are integer-coded with ``factorize``
        on an internal copy; the caller's frame is not modified.
    y : array-like
        Target values, passed straight to ``mutual_info_regression``.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by feature name, sorted in descending order.
    """
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # Integer AND boolean columns are flagged as discrete. Boolean has to be checked
    # explicitly: one-hot dummies may be stored as bool, which fails the integer check
    # and would otherwise be handed to the estimator as continuous features.
    discrete_features = [
        pd.api.types.is_integer_dtype(t) or pd.api.types.is_bool_dtype(t)
        for t in X.dtypes
    ]
    # random_state is fixed so repeated runs give the same scores.
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores on the current figure.

    Parameters
    ----------
    scores : pandas.Series
        Scores indexed by feature name. They are plotted in ascending order,
        one bar per feature, with the feature names as y-axis tick labels.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = ordered.index.tolist()
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")

: 

In [None]:
# Target is the ticket price; features are all remaining encoded columns.
y = df_preprocessed['price']
X = df_preprocessed.drop(columns=['price'])

mi_scores = make_mi_scores(X, y)

print(mi_scores)

: 

In [None]:
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores.head(10))

: 

# <p style="background-color:LightSteelBlue;font-size:100%;text-align:center;border-radius:10px 10px;height:40px;padding-top: 5px;"> 2. Predictive  Analytics </p>

In [None]:
df_bk=df.copy()


: 

**Encoding Categorical Data**

I will encode the categoric variables for the regression.

In [None]:
from sklearn.preprocessing import LabelEncoder
# One encoder instance is re-fitted for every column, so the per-column label mappings
# are not kept after this cell. The encoded values overwrite the columns of df in place.
le = LabelEncoder()
object_columns = [col for col in df.columns if df[col].dtype == 'object']
for col in object_columns:
    df[col] = le.fit_transform(df[col])

: 

In [None]:
# Target: ticket price. Features: every other column of the encoded frame.
y = df['price']
x = df.drop(columns=['price'])

: 

Splitting the Data as Train and Test Sets

I will split the data into 2 parts as train and test sets.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state=42 makes the split reproducible.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30,random_state=42)
# Display the shapes of the four resulting pieces as a sanity check.
x_train.shape,x_test.shape,y_train.shape,y_test.shape

: 

 Normalizing Data

* Before defining regression models, I will normalize the data to make every datapoint have the same scale so each feature will be equally important. I will use the Min-Max normalization.

In [None]:
from sklearn.preprocessing import MinMaxScaler
mmscaler=MinMaxScaler(feature_range=(0,1))
# Learn each feature's min/max from the training split only ...
x_train=mmscaler.fit_transform(x_train)
# ... and apply that same fitted scaling to the test split. Re-fitting on the test data
# would scale the two splits with different parameters and leak test-set statistics
# into preprocessing. Test values may therefore fall slightly outside [0, 1].
x_test=mmscaler.transform(x_test)
# The scaler returns plain arrays; wrap them in DataFrames again (fresh 0..n-1 labels).
x_train=pd.DataFrame(x_train)
x_test=pd.DataFrame(x_test)

: 

Building the Regression Models


In [None]:
# Empty results table: one column for the model plus one per evaluation metric.
metric_columns = [
    'Model Name',
    'Mean_Absolute_Error_MAE',
    'Adj_R_Squared',
    'Root_Mean_Squared_Error_RMSE',
    'Mean_Absolute_Percentage_Error_MAPE',
    'Mean_Squared_Error_MSE',
    'Root_Mean_Squared_Log_Error_RMSLE',
    'R2_score',
]
a = {column: [] for column in metric_columns}
Results = pd.DataFrame(a)
Results.head()

: 

In [None]:
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor


def MAPE (y_test, y_pred):
    """Mean absolute percentage error between actual and predicted values, in percent."""
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100


def RMSLE (y_test, y_pred):
    """Root mean squared logarithmic error: sqrt(mean((log1p(pred) - log1p(actual))**2)).

    Negative predictions are clipped to 0 for this metric only, because log1p is
    undefined below -1 and a negative price is meaningless.
    """
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    y_pred = np.clip(y_pred, 0, None)
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_test)) ** 2))


# Stochastic estimators get a fixed seed so the comparison table is reproducible.
modellr = LinearRegression()
modeldtr = DecisionTreeRegressor(random_state=42)
modelrfr = RandomForestRegressor(random_state=42)
modelxgr = xgb.XGBRegressor(random_state=42)
modeletr = ExtraTreesRegressor(random_state=42)
modelbgr = BaggingRegressor(random_state=42)

MM = [modellr, modeldtr, modelrfr, modelxgr, modeletr, modelbgr]

# Adjusted R^2 is evaluated on the test split, so it uses the test-set size and feature count.
n_test, n_features = x_test.shape

result_rows = []
for models in MM:

    models.fit(x_train, y_train)

    y_pred = models.predict(x_test)

    # Compute every metric once and reuse it for both the printout and the table row.
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = metrics.r2_score(y_test, y_pred)
    rmsle = RMSLE(y_test, y_pred)
    result = MAPE(y_test, y_pred)
    r_squared = round(r2,6)
    adjusted_r_squared = round(1 - (1-r_squared)*(n_test-1)/(n_test-n_features-1),6)

    print('Model Name: ', models)
    print('Mean Absolute Error (MAE):', round(mae,3))
    print('Mean Squared Error (MSE):', round(mse,3))
    print('Root Mean Squared Error (RMSE):', round(rmse,3))
    print('R2_score:', round(r2,6))
    print('Root Mean Squared Log Error (RMSLE):', round(rmsle,3))
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
    print('Adj R Squared: ', adjusted_r_squared)
    print('------------------------------------------------------------------------------------------------------------')

    result_rows.append({'Model Name' : models,
                        'Mean_Absolute_Error_MAE' : mae,
                        'Adj_R_Squared' : adjusted_r_squared,
                        'Root_Mean_Squared_Error_RMSE' : rmse,
                        'Mean_Absolute_Percentage_Error_MAPE' : result,
                        'Mean_Squared_Error_MSE' : mse,
                        'Root_Mean_Squared_Log_Error_RMSLE': rmsle,
                        'R2_score' : r2})

# Build the table in one step: DataFrame.append no longer exists in pandas 2.x, and
# rebuilding from the collected rows means re-running this cell does not duplicate rows.
Results = pd.DataFrame(result_rows)

: 

In [None]:
# Reuse the Extra Trees model already fitted (and scored) in the comparison loop, so the
# predictions plotted below come from the same model as the reported metrics. An
# unconditional refit costs training time and, for an unseeded estimator, produces a
# different forest. The fit only runs if the model has not been fitted yet.
if not hasattr(modeletr, "estimators_"):
    modeletr.fit(x_train, y_train)

y_pred = modeletr.predict(x_test)

: 

In [None]:
# Pair each held-out price with its prediction, then attach the matching rows of the
# backup frame with an inner merge on the row labels of both frames.
out = pd.DataFrame({'Price_actual': y_test, 'Price_pred': y_pred})
result = pd.merge(df_bk, out, left_index=True, right_index=True)

: 

In [None]:
# Actual vs. predicted price for the held-out rows, with a fitted regression line.
plt.figure(figsize=(15,5))
sns.regplot(x='Price_actual',y='Price_pred',data=result,color='magenta')
plt.title('Actual Price  Vs.  Predicted Price ',fontsize=16)
plt.xlabel('Actual Price',fontsize=12)
plt.ylabel('Predicted Price',fontsize=12)
# `metrics` is the sklearn module imported in the model-comparison cell, which must have run first.
r_squared = round(metrics.r2_score(y_test, y_pred),6)
Mean_Absolute_Error_MAE =  round(metrics.mean_absolute_error(y_test, y_pred),3)
# Annotate the plot with R^2 and MAE; the text positions are hard-coded in data coordinates.
plt.text(90000,25000,'$ R^{2} $=' + str(r_squared),fontsize=14)
plt.text(90000,15000,'MAE =' + str(Mean_Absolute_Error_MAE),fontsize=14)
plt.show()

: 

conclusion

1. Ticket prices vary between classes: the mean price of Business Class tickets is almost 8 times the mean price of Economy Class tickets.

2. Vistara and Air India have a wider price range than the other airlines and their tickets are more expensive, because business class tickets are only available on Vistara and Air India. On the other hand, Air Asia offers the cheapest tickets of all the airlines. Comparing Vistara and Air India, Vistara's prices are mostly higher than Air India's.

3. As the flight duration increases till 20 hours, ticket price also increases. After 20 hours, the ticket price decreases as the flight duration increases.

4. The more stops there are, the higher ticket price there are for all airlines, except for Air Asia. The ticket price ranges of Air Asia for different number of stops are similar. For this reason, The Air Asia can be the low cost airline company. 

5. As the number of days left until departure decreases from 20 to 2, the ticket price increases. On the other hand, ticket prices remain almost stable between 50 and 20 days before departure. Additionally, in business class, tickets can be cheaper one day before departure than two days before.

6. Flights leaving or arriving at late night have the cheaper prices compared to other times. In addition to late night, flights arriving in the early morning or leaving in the afternoon are cheaper than other options. On the other hand, flights leaving at night have the expensive prices compared to other times.

7. Flights leaving from or arriving at Delhi have the cheaper prices compared to other source and destination cities. On the other hand, other source and destination cities have similar price ranges.

8. According to the mutual information scores, knowing the duration, class, whether the airline is Vistara or Air India, the number of days left for departure, whether the source city or destination city is Delhi or Mumbai and the number of stops can result in better prediction of ticket prices.

9. Comparing the regression models, Extra Trees Regressor, Random Forest Regressor and Bagging Regressor have the highest R squared and adjusted R squared values and lower error values than the other models. Extra Trees Regressor gives the best result, with an R^2 score of 0.984700 and an MAE of 1151.718790.

: 

: 

: 