In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

In [None]:
# Both CSV files sit in the same project folder; only the file name differs.
_project_dir = "/content/drive/MyDrive/Data Science/Project-43 Flight Fare Prediction Using Auto SK Learn (Auto ML)"
train_path = _project_dir + "/flight data.csv"
test_path = _project_dir + "/flight test.csv"

In [None]:
# Load the training and test CSV files into two separate frames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# (rows, columns) of the test frame.
test_df.shape

In [None]:
# Stack the training and test rows into one frame so the same preprocessing
# is applied to both. ignore_index=True gives the combined frame a fresh
# 0..n-1 index; without it both inputs keep their own row labels, which
# collide and make label-based selection and alignment ambiguous.
df = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
# Column names, non-null counts and dtypes of the combined frame.
df.info()

In [None]:
# Number of rows per airline in the combined frame.
df['Airline'].value_counts()

In [None]:
# Boxen plot of Price per Airline; rows are ordered by descending Price first.
price_desc = df.sort_values('Price', ascending=False)
sns.catplot(data=price_desc, x='Airline', y='Price', kind='boxen', height=6, aspect=3)

In [None]:
# Number of rows per source city. Uses the combined frame `df`, like the
# Airline/Destination counts and the Source plot that follows, so the counts
# describe the same rows that are plotted.
df['Source'].value_counts()

In [None]:
# Boxen plot of Price per Source; rows are ordered by descending Price first.
price_desc = df.sort_values('Price', ascending=False)
sns.catplot(data=price_desc, x='Source', y='Price', kind='boxen', height=6, aspect=3)

In [None]:
# Number of rows per destination city in the combined frame.
df['Destination'].value_counts()

In [None]:
# Boxen plot of Price per Destination; rows are ordered by descending Price first.
price_desc = df.sort_values('Price', ascending=False)
sns.catplot(data=price_desc, x='Destination', y='Price', kind='boxen', height=6, aspect=3)

In [None]:
# Count the missing values in each column of the combined frame.
df.isna().sum()

In [None]:
# Replace missing Price values with the column mean, rounded to 2 decimals.
# The result is assigned back instead of calling fillna(inplace=True) on the
# df['Price'] selection: that chained in-place form is deprecated in pandas 2.x
# and does not update df once Copy-on-Write is active.
# NOTE(review): the missing prices presumably come from the test_df rows, which
# then carry an imputed target — confirm before using those rows for training.
price_fill = round(df['Price'].mean(), 2)
df['Price'] = df['Price'].fillna(price_fill)

In [None]:
# Drop rows with missing feature values. `df` was built from train_df before
# this cell, so cleaning train_df alone leaves those rows in `df`, which is the
# frame every later step works on. Price is left out of the check because its
# gaps are handled separately by the mean fill.
train_df.dropna(inplace=True)
feature_cols = [col for col in df.columns if col != 'Price']
df = df.dropna(subset=feature_cols)

In [None]:
# Number of rows per Total_Stops value in the combined frame.
df['Total_Stops'].value_counts()

In [None]:
# Boxen plot of Price per Total_Stops; rows are ordered by descending Price first.
price_desc = df.sort_values('Price', ascending=False)
sns.catplot(data=price_desc, x='Total_Stops', y='Price', kind='boxen', height=6, aspect=3)

## Feature Engineering

In [None]:
# Preview the first three rows before deriving new features.
df.head(3)

In [None]:
# Convert the journey date column to datetime values.
# NOTE(review): no format= or dayfirst= is passed, so day/month order is left
# to pandas' inference — confirm against the date layout used in the CSV.
df['Date_of_Journey'] = pd.to_datetime(df['Date_of_Journey'])

In [None]:
# Derive day-of-month and month features from the journey date with the
# vectorised .dt accessor instead of a per-row Python lambda.
# astype(int) keeps an integer column and still fails loudly on a missing date.
journey_date = df['Date_of_Journey']
df['Day'] = journey_date.dt.day.astype(int)
df['Month'] = journey_date.dt.month.astype(int)
# df['Year'] = journey_date.dt.year.astype(int)

In [None]:
# The raw date column is no longer needed once Day and Month exist.
df = df.drop(columns=['Date_of_Journey'])

In [None]:
# Parse the arrival clock time once: take the text before the first space of
# Arrival_Time and split it on ':' into [hour, minute]. Vectorised .str
# operations replace the two row-wise lambdas that each repeated this parsing.
arrival_clock = df['Arrival_Time'].str.split(' ').str[0].str.split(':')
df['Hours'] = arrival_clock.str[0].astype(int)
df['Minutes'] = arrival_clock.str[1].astype(int)

In [None]:
# The raw arrival string is redundant now that Hours and Minutes are extracted.
df = df.drop(columns=['Arrival_Time'])

In [None]:
# Split each Route string on the arrow once and spread the first five pieces
# over Route_1..Route_5. Routes with fewer pieces get NaN in the unused columns.
route_legs = df['Route'].str.split("→")
for leg_no in range(1, 6):
    df[f'Route_{leg_no}'] = route_legs.str[leg_no - 1]

In [None]:
# The combined Route string is replaced by the Route_1..Route_5 columns.
df = df.drop(columns=['Route'])

In [None]:
# Re-check columns, non-null counts and dtypes after the feature extraction.
df.info()

In [None]:
# Preview the frame with the newly derived columns.
df.head()

In [None]:
# LabelEncoder instance (maps category labels to integer codes).
encoder = LabelEncoder()

In [None]:
# Integer-encode every categorical column. Each column gets its own fitted
# LabelEncoder, stored in `encoders`, so the code -> label mapping of any
# column can still be recovered later. Refitting one shared encoder would only
# remember the classes of the last column it was fitted on.
categorical_cols = [
    'Airline', 'Source', 'Destination', 'Additional_Info', 'Total_Stops',
    'Route_1', 'Route_2', 'Route_3', 'Route_4', 'Route_5',
]
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

In [None]:
# Remove the Dep_Time and Duration columns; no features are derived from them.
df = df.drop(columns=['Dep_Time', 'Duration'])

In [None]:
# Preview the encoded frame.
df.head()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df.
corr_fig, corr_ax = plt.subplots(figsize=(15, 10))
sns.heatmap(df.corr(), annot=True, linewidths=0.8, cmap='viridis', ax=corr_ax)
plt.show()

## Feature Selection

In [None]:
# Separate the feature matrix (every column except Price) from the target array.
# NOTE(review): df still holds the rows whose missing Price was mean-filled
# earlier (presumably the test_df rows), so their targets are imputed values —
# confirm that training on them is intended.
target_col = 'Price'
x = df.drop(columns=[target_col])
y = df[target_col].to_numpy()

In [None]:
# Model the natural log of the price. y is overwritten in place, so running
# this cell a second time applies the log again.
y = np.log(y)

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=44)

In [None]:
# Feature selector that keeps features based on a fitted Lasso (alpha=0.001).
# NOTE(review): the selection is only inspected in the following cells;
# x_train/x_test are never reduced to the selected columns.
model = SelectFromModel(estimator=Lasso(alpha=0.001, random_state=0))

In [None]:
# Fit the current `model` object on the training split only.
model.fit(x_train, y_train)

In [None]:
# Per-feature mask from the fitted selector.
model.get_support()

In [None]:
# Names of the feature columns picked out by the selector mask.
x.columns[model.get_support()]

## Building the models

In [None]:
# Estimator classes (not instances) to compare; each is instantiated with its defaults when scored.
all_models = [RandomForestRegressor, DecisionTreeRegressor, LinearRegression, Lasso, ExtraTreesRegressor]

In [None]:
def getModel_score(x):
  """Fit a default-constructed ``x`` and print its train and test scores.

  ``x`` is an estimator class: it is called with no arguments, fitted on the
  notebook-level ``x_train``/``y_train``, and its ``score`` is printed for the
  training split and for ``x_test``/``y_test``. Nothing is returned.
  """
  estimator = x().fit(x_train, y_train)
  print(type(estimator).__name__)
  splits = (
      ('Training Score of', x_train, y_train),
      ('Testing Score of', x_test, y_test),
  )
  for caption, features, target in splits:
    print(caption, estimator.score(features, target))

In [None]:
# Score every candidate estimator class, separating the reports with a rule.
for model_cls in all_models:
  getModel_score(model_cls)
  print("-----" * 10)

In [None]:
# Train a 300-tree random forest on the training split and keep its test
# predictions for the residual plots. random_state pins the forest's
# randomness so the printed scores and the plots are reproducible across runs
# (44 matches the seed already used for the train/test split).
model = RandomForestRegressor(n_estimators=300, random_state=44)
model.fit(x_train, y_train)
print(model.score(x_train, y_train))  # score on the training split
print(model.score(x_test, y_test))    # score on the held-out split
y_pred = model.predict(x_test)

In [None]:
# Histogram of the random-forest residuals (actual minus predicted, in log-price units).
rf_residuals = y_test - y_pred
_, resid_ax = plt.subplots(figsize=(8, 6))
sns.histplot(rf_residuals, ax=resid_ax)
plt.show()

In [None]:
# Actual (x axis) versus predicted (y axis) targets for the held-out split.
_, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred, ax=scatter_ax)
plt.show()

## Hyperparameter Tuning

In [None]:
# Candidate hyperparameter values sampled by the randomized search over
# RandomForestRegressor.
param_grid = {
 'ccp_alpha': [1, 0.5,  0.25, 0.1, 0.0],
 'criterion': ["squared_error", "absolute_error", "poisson"],
 'max_depth': [int(depth) for depth in np.linspace(5, 30, 6)],
 # "auto" is no longer accepted by scikit-learn; for a forest regressor it
 # meant "use all features", which the fraction 1.0 expresses instead.
 'max_features': [1.0, "sqrt", "log2"],
 'min_samples_leaf': [1, 2, 3, 5, 8, 10],
 # An integer min_samples_split must be at least 2; a value of 1 is rejected
 # when the estimator is fitted and would fail that search candidate.
 'min_samples_split': [2, 4, 10, 25, 50, 100],
 'n_estimators': [int(count) for count in np.linspace(100, 1500, 15)]
 }

In [None]:
# Randomized search: 10 sampled parameter settings, 5-fold cross-validation,
# ranked by negative mean squared error, using all available cores.
rf_random = RandomizedSearchCV(
    estimator=RandomForestRegressor(),
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=44,
)

In [None]:
# rf_random.fit(x_train, y_train)  # tune on the training split; keep x_test/y_test held out for evaluation

## Auto-sklearn (AutoML)

In [None]:
# ! apt-get install swig -y
# ! pip install auto-sklearn
# ! pip install Cython numpy

In [None]:
import autosklearn
from autosklearn import regression as reg

In [None]:
# Preview the training features that are passed to auto-sklearn.
x_train.head()

In [None]:
# Auto-sklearn regressor with an overall budget of 120 and a per-run limit of 30
# (presumably seconds — confirm against the auto-sklearn documentation).
automl = reg.AutoSklearnRegressor(time_left_for_this_task=120, per_run_time_limit=30)

In [None]:
# Run the automated model search on the training split.
automl.fit(x_train, y_train)

In [None]:
# Show the leaderboard of models produced by the search.
automl.leaderboard()

In [None]:
# Show the details of the models found by the search.
automl.show_models()

In [None]:
# Predict the (log-transformed) target for the held-out split.
y_predict_ = automl.predict(x_test)

In [None]:
# Histogram with a KDE overlay of the auto-sklearn residuals (actual minus predicted).
automl_residuals = y_test - y_predict_
_, automl_ax = plt.subplots(figsize=(8, 6))
sns.histplot(automl_residuals, kde=True, ax=automl_ax)
plt.show()