In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the training and test workbooks (paths are relative to the working directory).
df_train = pd.read_excel('Data_Train.xlsx')
df_test = pd.read_excel('Test_set.xlsx')

# Stack both sheets row-wise. ignore_index=True gives the combined frame a
# fresh 0..n-1 index; without it each input keeps its own labels, so row
# labels would repeat.
# NOTE(review): if Test_set.xlsx has no 'Price' column its rows get NaN there
# and are removed by the later dropna() -- confirm that is intended.
df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [None]:
# Display the combined train + test frame.
df

In [None]:
# Count the missing values in each column.

df.isnull().sum()

In [None]:
# Keep only the rows that have no missing value in any column, then
# re-count the nulls per column to confirm none remain.
df = df.dropna()
df.isna().sum()

In [None]:
# Shape of the frame as (rows, columns).
df.shape

In [None]:
# Frequency of each distinct value in the 'Source' column.
df['Source'].value_counts()


In [None]:
# Boxen plot of Price per Source, fed with rows sorted by ascending Price.
sns.catplot(
    data=df.sort_values(by='Price'),
    x='Source',
    y='Price',
    kind="boxen",
)
plt.show()

In [None]:
# Average price per source, drawn as an annotated bar chart.

mean_prices = df.groupby('Source')['Price'].agg('mean').reset_index()

print(mean_prices)
my_colors = ["#5B84B1", "#A8C4E2", "#FFC857", "#ED7953", "#BC243C"]

ax = sns.barplot(data=mean_prices, x='Source', y='Price', palette=my_colors)

# reset_index() leaves a 0..n-1 index, which is also each bar's x position,
# so the tuple's Index is used as the x coordinate of its label.
for bar in mean_prices.itertuples():
    ax.text(bar.Index, bar.Price, round(bar.Price, 2), color='black', ha='center')

plt.xticks(rotation=90)
ax.set(xlabel='Source', ylabel='Mean Price', title='Mean Price by Source')
plt.show()

In [None]:
# Price variation by airline: mean Price for each Airline value, shown as a table.

df.groupby('Airline')['Price'].mean().reset_index()

In [None]:
# Mean price per airline as an annotated bar chart.
mean_prices = df.groupby('Airline')['Price'].mean().reset_index()

plt.figure(figsize=(12, 6))
ax = sns.barplot(x='Airline', y='Price', data=mean_prices)

# After reset_index() the row label (0..n-1) doubles as the bar's x position.
for index, row in mean_prices.iterrows():
    ax.text(row.name, row.Price, round(row.Price, 2), color='black', ha='center')

plt.xticks(rotation=90)
ax.set_xlabel('Airline')
ax.set_ylabel('Mean Price')
ax.set_title('Mean Price by Airlines')
# Render explicitly so the cell does not echo the Text object returned by
# set_title(), as the 'Mean Price by Source' cell already does.
plt.show()

**Feature Engineering**

In [None]:
# Display the frame before deriving new columns from it.
df

In [None]:
# Parse each source column once and derive every field from the parsed Series.
# NOTE(review): no format/dayfirst is given, so pandas infers it. If
# Date_of_Journey is written day-first (dd/mm/yyyy), ambiguous dates may be
# read month-first -- confirm against the data.
journey_date = pd.to_datetime(df['Date_of_Journey'])
departure_time = pd.to_datetime(df['Dep_Time'])

df['Day'] = journey_date.dt.day
df['Month'] = journey_date.dt.month
df['Year'] = journey_date.dt.year
df['Departure_hour'] = departure_time.dt.hour
df['Departure_Minute'] = departure_time.dt.minute

In [None]:
# Remove the raw Date_of_Journey, Route and Dep_Time columns in one call,
# then display the result.
df = df.drop(columns=['Date_of_Journey', 'Route', 'Dep_Time'])
df

In [None]:
# Break down the arrival time
# take after 6 as night
# take between 5-10 as morning
# 10-6 as afternoon
# Saturday and Sunday as weekend

In [None]:
# Arrival_Time is removed without deriving any feature from it.
df = df.drop(columns='Arrival_Time')
df

In [None]:
# Effect of the number of stops on price: mean Price per Total_Stops value.

stops_mean = df.groupby('Total_Stops')['Price'].agg('mean').reset_index()
stops_mean

In [None]:
# Mean price per Total_Stops value as an annotated bar chart.
Stops_mean = df.groupby('Total_Stops')['Price'].mean().reset_index()

colors = ["#1f77b4", "#2ca02c", "#d62728", "#9467bd", "#ff7f0e"]
ax = sns.barplot(x='Total_Stops', y='Price', data=Stops_mean, palette=colors)

# After reset_index() the row label (0..n-1) doubles as the bar's x position.
for index, row in Stops_mean.iterrows():
    ax.text(row.name, row.Price, round(row.Price, 2), color='black', ha='center')

plt.xticks(rotation=90)
ax.set_xlabel('Total_Stops')
ax.set_ylabel('Mean Price')
ax.set_title('Mean Price by Stops')
# Render explicitly so the cell does not echo the Text object returned by
# set_title().
plt.show()

In [None]:
# Plan: divide the rows into Morning / Afternoon / Evening / Night by hour

# Morning   06:00-11:59
# Afternoon 12:00-17:59
# Evening   18:00-23:59
# Night     00:00-05:59

df

In [None]:
def get_time_of_day(hour):
    """Map an hour value to a part-of-day label.

    Returns 'Night' for hour < 6, 'Morning' for hour < 12, 'Afternoon' for
    hour < 18 and 'Evening' for hour < 24. A value that fails every
    comparison (24 or more, or NaN) yields 'Unknown'.
    """
    # Exclusive upper bounds, checked in ascending order.
    upper_bounds = ((6, 'Night'), (12, 'Morning'), (18, 'Afternoon'), (24, 'Evening'))
    for limit, label in upper_bounds:
        if hour < limit:
            return label
    return 'Unknown'


# Label every row from its departure hour. Series.map calls the function once
# per value of the single column it needs, instead of building a row object
# for every record as DataFrame.apply(axis=1) does.
df['time_of_day'] = df['Departure_hour'].map(get_time_of_day)
df

In [None]:
# Mean price for each time_of_day label, drawn as an annotated bar chart.

time_of_day = df.groupby('time_of_day')['Price'].agg('mean').reset_index()

colors = ["#1f77b4", "#2ca02c", "#d62728", "#9467bd"]
ax = sns.barplot(data=time_of_day, x='time_of_day', y='Price', palette=colors)

# reset_index() leaves a 0..n-1 index, which is also each bar's x position.
for bar in time_of_day.itertuples():
    ax.text(bar.Index, bar.Price, round(bar.Price, 2), color='black', ha='center')

plt.xticks(rotation=90)
ax.set_xlabel('Time_of_day')
ax.set_ylabel('Mean Price')
# Trailing semicolon keeps the cell from echoing the returned Text object.
ax.set_title('Mean Price by Time of Day');


In [None]:
# Determining the day of the week.
# A datetime is assembled from the Day/Month/Year columns and only its weekday
# name is stored, so no helper column has to be added and dropped again.
df['day_of_week'] = pd.to_datetime(df[['Day', 'Month', 'Year']]).dt.day_name()
df


In [None]:
# Mean price per weekday, shown Monday -> Sunday rather than in the
# alphabetical order that groupby() produces.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day = (
    df.groupby('day_of_week')['Price'].mean()
    .reindex(weekday_order)        # calendar order
    .dropna()                      # skip weekdays that never occur in the data
    .rename_axis('day_of_week')    # keep the column name for reset_index()
    .reset_index()
)

print(day)

colors = ["#FFC300", "#FF5733", "#C70039", "#900C3F", "#581845", "#0A8F08", "#08718F"]

ax = sns.barplot(x='day_of_week', y='Price', data=day, palette=colors)

# After reset_index() the row label (0..n-1) doubles as the bar's x position.
for index, row in day.iterrows():
    ax.text(row.name, row.Price, round(row.Price, 2), color='black', ha='center')

plt.xticks(rotation=90)
ax.set_xlabel('Day of week ')
ax.set_ylabel('Mean Price')
ax.set_title('Mean Price by Days of Week')
# Render explicitly so the cell does not echo the Text object returned by
# set_title().
plt.show()


In [None]:
# Hand df to pivottablejs' pivot_ui; presumably this renders an interactive
# pivot table for manual exploration -- TODO confirm. Its return value is not
# assigned, so no later cell reads it.
# NOTE(review): third-party package imported mid-notebook; verify it is
# installed before running the notebook end to end.
from pivottablejs import pivot_ui

pivot_ui(df)

In [None]:
# Remove the Duration and Additional_Info columns from the working frame.
df = df.drop(columns=['Duration', 'Additional_Info'])

In [None]:
# Display the frame after the column drops.
df

In [None]:
# Correlation heatmap of the numeric columns only. The frame still holds text
# columns here (they are one-hot encoded in a later cell), and pandas >= 2.0
# raises on them in corr() instead of silently skipping them.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, fmt='.2g')

In [None]:
# One-hot encode the remaining non-numeric columns; drop_first=True omits the
# first level of every encoded column.
df=pd.get_dummies(df,drop_first=True)


In [None]:
# Display the encoded frame.
df

In [None]:
# Building the model
# A Random Forest regressor is used

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
# Split features and target by column name rather than by position, so the
# split stays correct whatever column order get_dummies() produced.
X = df.drop(columns='Price')
Y = df['Price']

# Hold out 30% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=12)

In [None]:
# Fit a random forest with default hyper-parameters. random_state pins the
# forest's internal sampling so reruns produce the same model; 12 is the same
# seed the train/test split uses.
regressor = RandomForestRegressor(random_state=12)
regressor.fit(X_train, y_train)

In [None]:
from sklearn.metrics import r2_score

# Predict on the held-out rows and score them with R^2 (coefficient of
# determination); the value is printed multiplied by 100.
y_pred = regressor.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f'R_2 score is {r2*100:1.3f}')

In [None]:
# Distribution of the residuals (actual - predicted) on the test split.
# histplot(kde=True) is used instead of distplot(), which seaborn has
# deprecated and scheduled for removal.
residuals = y_test - y_pred
ax = sns.histplot(residuals, kde=True, stat='density')
ax.set(xlabel='Residual (actual - predicted)', title='Residual distribution')
plt.show()

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Separate extra-trees model fitted on all rows (X, Y), not just the training
# split. random_state makes its feature_importances_ repeatable between runs.
importance = ExtraTreesRegressor(random_state=12)
importance.fit(X, Y)

In [None]:
# Horizontal bar chart of the 20 largest importances of the fitted
# extra-trees model, labelled with the feature column names.
feature_importances = pd.Series(data=importance.feature_importances_, index=X.columns)
feature_importances.nlargest(20).plot.barh()
plt.title('Feature Importance')
plt.show()

# Converting into Pickle file

In [None]:
import pickle


In [None]:
# Serialise the fitted model. The with-block closes the file as soon as the
# dump finishes, so the data is flushed to disk before anything reads it back.
with open('farepredictor.pkl', 'wb') as model_file:
    pickle.dump(regressor, model_file)

In [None]:
# Reload the saved model; the with-block closes the file handle afterwards.
# NOTE: pickle.load() can execute arbitrary code -- only load files you trust.
with open('farepredictor.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [None]:
# Inspect the row at position 1 of X_train with all of its feature columns.
X_train.iloc[1,:]

In [None]:
import xesmf as xe

In [1]:
conda install -c conda-forge xesmf


Note: you may need to restart the kernel to use updated packages.


In [2]:
import xesmf as xe

C:\Users\sou09\anaconda3\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll
C:\Users\sou09\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


ModuleNotFoundError: No module named 'ESMF'