In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# NOTE(review): with no category given, this hides every warning for the rest
# of the notebook (including library deprecation notices) — consider narrowing
# it to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

In [None]:
# Location of the flight-price CSV. Set the FLIGHT_DATA_CSV environment
# variable to point at your own copy of the file; when it is unset the
# original author's local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "FLIGHT_DATA_CSV",
    "/Users/tanya/Downloads/projects/Clean_Dataset.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# Print a per-column summary (dtype and non-null count) of the loaded frame.
df.info()

In [None]:
# (number of rows, number of columns) of the frame as loaded.
df.shape

In [None]:
# Drop the first column ("Unnamed: 0") since it isn't required.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell
# re-runnable: executing it a second time no longer raises KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
# Missing-value count per column.
df.isna().sum()

#### No null values

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated(keep="first").sum()

In [None]:
# Frequency of each value in the "stops" column.
df.loc[:, "stops"].value_counts()

In [None]:
# Summary statistics of the price column.
price_stats = ["min", "max", "median", "mean", "std"]
df["price"].agg(price_stats)

#### Since the median is smaller than the mean, the price distribution is likely right-skewed.

In [None]:
# Interactive histogram of the price column.
# (plotly.express is imported as `px` in the notebook's top import cell.)
fig = px.histogram(df, x="price", title="Distribution of ticket prices")
fig.show()

In [None]:
# Outlier analysis: box plot of the price column.
# Keeping the Axes handle lets us title the figure; plt.show() renders it
# without echoing the Axes repr as cell output.
ax = sns.boxplot(data=df, x="price", color="orange")
ax.set_title("Ticket price distribution")
plt.show()

#### There are outliers.

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# List the distinct values of every object-typed column.
object_cols = [name for name, dtype in df.dtypes.items() if dtype == 'object']
for name in object_cols:
    print(df[name].unique())

### Ratio between the airlines.

In [None]:
# Number of rows per airline.
df.loc[:, "airline"].value_counts()

In [None]:
# Rows per airline, shown as a bar chart (left) and as percentage shares (right).
airline_counts = df['airline'].value_counts()

fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(15, 8))

# Bars are ordered from the most to the least frequent airline.
sns.countplot(x='airline', data=df, palette="Greens_d",
              order=airline_counts.index, ax=ax_bar)
ax_bar.set_title("Airline distribution")

airline_counts.plot(kind='pie', legend=None, ylabel='', autopct='%1.1f%%', ax=ax_pie)
ax_pie.set_title('Airline distribution', fontsize=16)

plt.show()

### How does the price vary from one airline to another?

In [None]:
# Price spread per airline, airlines ordered by number of rows.
fig, ax = plt.subplots(figsize=(15, 5))
sns.boxplot(data=df, x='airline', y='price', palette='hls',
            order=df['airline'].value_counts().index, ax=ax)
ax.set_title('Airlines Vs Price', fontsize=15)
ax.set_xlabel('Airline', fontsize=15)
ax.set_ylabel('Price', fontsize=15)
plt.show()

### Ratio between the classes.

In [None]:
# Rows per travel class, shown as a bar chart (left) and as percentage shares (right).
class_counts = df['class'].value_counts()

# Two panels only need a 1x2 grid; the previous 2x2 grid left the bottom half
# of the figure empty.
fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(15, 8))

sns.countplot(x='class', data=df, palette="Greens_d",
              order=class_counts.index, ax=ax_bar)
ax_bar.set_title("Class distribution")

class_counts.plot(kind='pie', legend=None, ylabel='', autopct='%1.1f%%', ax=ax_pie)
ax_pie.set_title('Class distribution', fontsize=16)
# Build the legend from the wedge labels themselves rather than a hard-coded
# list, so the legend can never disagree with the wedge order.
ax_pie.legend()

plt.show()

### How does the ticket price vary between Economy and Business class?

In [None]:
# Price spread per class, with one box per airline inside each class.
class_price_fig = px.box(
    df,
    x='class',
    y='price',
    color='airline',
    title='Airline prices based on the class and company',
)
class_price_fig

There are slight differences between the companies in this graph: AirAsia seems to have the cheapest flights, while Air India and Vistara are more expensive. However, it looks like Vistara's business tickets are a little more expensive than Air India's.

### Number of stops

In [None]:
# Number of rows for each value of "stops".
fig, ax = plt.subplots(figsize=(3, 3))
sns.countplot(data=df, x='stops', ax=ax)
plt.show()

### How Does the Ticket Price vary with the number of stops of a Flight?

In [None]:
# Mean price per airline and number of stops: Economy on the left (wider
# panel), Business on the right.
fig, axs = plt.subplots(1, 2, gridspec_kw={'width_ratios': [5, 3]}, figsize=(25, 5))
ax_economy, ax_business = axs

# Each subset is sorted by descending price before plotting.
economy_sorted = df.loc[df["class"] == 'Economy'].sort_values("price", ascending=False)
business_sorted = df.loc[df["class"] == 'Business'].sort_values("price", ascending=False)

sns.barplot(data=economy_sorted, x="airline", y="price", hue="stops", ax=ax_economy)
ax_economy.set_title("Airline prices based on the number of stops  for economy", fontsize=20)

sns.barplot(data=business_sorted, x="airline", y="price", hue="stops", ax=ax_business)
ax_business.set_title("Airline prices based on the number of stops  for business", fontsize=20)

### How does the ticket price change based on the departure time and arrival time?

In [None]:
# Side-by-side box plots: price by departure slot (left) and by arrival slot (right).
fig, (ax_dep, ax_arr) = plt.subplots(1, 2, figsize=(24, 10))

sns.boxplot(data=df, x='departure_time', y='price', ax=ax_dep)
ax_dep.set_title('Departure Time Vs Ticket Price', fontsize=20)
ax_dep.set_xlabel('Departure Time', fontsize=15)
ax_dep.set_ylabel('Price', fontsize=15)

sns.boxplot(data=df, x='arrival_time', y='price', palette='hls', ax=ax_arr)
ax_arr.set_title('Arrival Time Vs Ticket Price', fontsize=20)
ax_arr.set_xlabel('Arrival Time', fontsize=15)
ax_arr.set_ylabel('Price', fontsize=15)

plt.show()

### How does the price change with the source city and destination city?

In [None]:
# Side-by-side box plots: price by source city (left) and by destination city (right).
fig, (ax_src, ax_dst) = plt.subplots(1, 2, figsize=(24, 10))

sns.boxplot(data=df, x='source_city', y='price', ax=ax_src)
ax_src.set_title('Source City Vs Ticket Price', fontsize=20)
ax_src.set_xlabel('Source City', fontsize=15)
ax_src.set_ylabel('Price', fontsize=15)

sns.boxplot(data=df, x='destination_city', y='price', palette='hls', ax=ax_dst)
ax_dst.set_title('Destination City Vs Ticket Price', fontsize=20)
ax_dst.set_xlabel('Destination City', fontsize=15)
ax_dst.set_ylabel('Price', fontsize=15)

plt.show()

### How is the price affected by the days left for departure?

In [None]:
# Price as a function of the number of days left before departure.
fig, ax = plt.subplots(figsize=(20, 8))
sns.lineplot(x='days_left', y='price', data=df, color='blue', ax=ax)
ax.set_title('Days Left For Departure Versus Ticket Price', fontsize=20)
ax.set_xlabel('Days Left for Departure', fontsize=15)
ax.set_ylabel('Price', fontsize=15)
plt.show()

In [None]:
# Price versus days left before departure, one line per airline.
# The line colours come from the hue/palette mapping, so the former
# color='blue' argument conflicted with it and has been removed.
plt.figure(figsize=(20,8))
sns.lineplot(data=df, x='days_left', y='price', hue='airline', palette='hls')
plt.title('Days Left For Departure Versus Ticket Price of each Airline',fontsize=15)
plt.legend(fontsize=12)
plt.xlabel('Days Left for Departure',fontsize=15)
plt.ylabel('Price',fontsize=15)
plt.show()

In [None]:
# Price versus days left before departure, one line per departure-time slot.
sns.lineplot(data=df, x='days_left', y='price', hue = 'departure_time')
plt.title('Days Left For Departure Versus Ticket Price wrt departure time')
plt.xlabel('Days Left for Departure', fontsize=15)
plt.ylabel('Price', fontsize=15)
# The legend is pinned to a point outside the axes, so name the legend corner
# that sits on that point explicitly. loc='best' only makes sense inside the
# axes and runs an overlap search against the plotted data.
plt.legend(bbox_to_anchor=(1.4, 1), loc='upper right', borderaxespad=0)
plt.show()

Prices for flights are highest when there are only 1-3 days left for departure, but decrease as the days left for departure increase.
Air India and Vistara are the most expensive airlines, and prices decrease as the days left for departure increase.
Late night departure times have lower prices compared to other departure times, but prices for late night arrival times are higher than evening arrivals.

### Does the price change with the duration of the flight?

In [None]:
# Mean price for every distinct flight duration.
df_temp = df.groupby(['duration'])['price'].mean().reset_index()

# `ax` now holds the Axes (it used to be bound to the Text object returned by
# set_title), and the title typo "prizes" is corrected to "prices".
fig, ax = plt.subplots(figsize=(15, 6))
sns.scatterplot(x="duration", y="price", data=df_temp, ax=ax)
ax.set_title("Average prices depending on the duration", fontsize=15)
plt.show()

In [None]:
# Keep an independent copy of the frame before the categorical columns are
# overwritten with integer codes; it is merged with the predictions later.
df_bk = df.copy(deep=True)

In [None]:
# Replace every object-typed column with integer codes.
# A separate encoder is fitted for each column and kept in `label_encoders`
# (column name -> fitted encoder) so the codes can be mapped back to the
# original labels with inverse_transform. Previously one shared encoder was
# refitted per column, which discarded every mapping except the last.
# (LabelEncoder is imported in the notebook's top import cell.)
# NOTE(review): integer codes impose an arbitrary order on the categories,
# which a linear model will treat as numeric — confirm this is intended.
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

In [None]:
# Target is the price column; features are every remaining column.
y = df['price']
x = df.drop(columns=['price'])

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
# (train_test_split is imported in the notebook's top import cell.)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=42
)

# Sanity check: the two partitions together account for every row.
assert len(x_train) + len(x_test) == len(x)
assert len(y_train) + len(y_test) == len(y)

x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# Scale every feature to the [0, 1] range.
# The scaler is fitted on the training data only and then applied unchanged to
# the test data. Refitting on the test set (as before) scaled the two
# partitions with different min/max values and leaked test-set statistics.
# Column names and the original row index are preserved so the scaled frames
# stay aligned with y_train / y_test.
# (MinMaxScaler is imported in the notebook's top import cell.)
mmscaler = MinMaxScaler(feature_range=(0, 1))
x_train = pd.DataFrame(
    mmscaler.fit_transform(x_train),
    columns=x_train.columns,
    index=x_train.index,
)
x_test = pd.DataFrame(
    mmscaler.transform(x_test),
    columns=x_test.columns,
    index=x_test.index,
)

In [None]:
# Fit a linear-regression baseline on the training partition and predict
# prices for the held-out partition.
# (LinearRegression is imported in the notebook's top import cell.)
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)

In [None]:
# Evaluate the predictions on the held-out partition.
# Each metric is computed once and reused (MSE was previously computed twice).
# (sklearn.metrics is imported in the notebook's top import cell.)
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(y_test, y_pred)

print('Mean Absolute Error (MAE):', round(mae, 3))
print('Mean Squared Error (MSE):', round(mse, 3))
print('Root Mean Squared Error (RMSE):', round(rmse, 3))
print('R2_score:', round(r2, 6))

In [None]:
# `lr` was already fitted on (x_train, y_train) in an earlier cell; fitting it
# again on the same data is redundant, so only the predictions are refreshed.
y_pred = lr.predict(x_test)

In [None]:
# Pair each held-out price with its prediction (indexed like y_test), then
# attach those two columns to the matching rows of the backup frame.
out = y_test.to_frame(name='Price_actual').assign(Price_pred=y_pred)
result = df_bk.join(out, how='inner')

In [None]:
# Show ten rows of actual vs. predicted prices; the fixed random_state makes
# the displayed sample reproducible across runs.
result.sample(10, random_state=42)