In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming each archive to disk.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<percent-encoded download URL>" entries consumed by
# the download loop in this cell. The URL is a signed link whose query string carries
# X-Goog-Date=20240401T160826Z and X-Goog-Expires=259200, so it has presumably
# expired and must be regenerated before this cell can succeed -- TODO confirm.
DATA_SOURCE_MAPPING = 'flight-delays:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F810%2F1496%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240401%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240401T160826Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D192008a93993f411bf35329642c9c5dbad9e89189506b3e3eb18354267f6a7de33e12ba2da9da3089740bb8a71298abb019e7000cf4aad00ce9a6255d7b8aba4960bb55f626960a6e01200f8875b53ab9bb4f115153f1f1704d27d25278a2167596252a73467d9f9cf34dc6f49d8ce2b79805b43b5a8e9733ea455048007b17a954d37f37ccae859534e7f9fb3e0350c042dcd10659bb7293193251a7f62e6d0f90a5362d3b0819392dea39fa18d3bfab506397bb301bb3b26179c8826e5a363ae2b61d13345cb49116ec1e24df4be582ae37c9561426668dcedd52f580d03315fd69b14ac7f88f3f33466a626eddce4406157753d0567cca5bbb0b010d155b6'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory> and
# unpack it there. Failures are reported and the loop moves on to the next source.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<percent-encoded URL>". Split on the first
    # colon only, so an unencoded ':' inside the URL cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header can be missing; without a usable size the progress bar
            # stays empty instead of crashing on int(None).
            expected_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only use this with sources you trust.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the handle to its own name; reusing `tarfile` here would
                # rebind the imported module for every later iteration.
                with tarfile.open(tfile.name) as archive:
                    archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
flights=pd.read_csv("../input/flight-delays/flights.csv")
flights.head(10)

In [None]:
flights.shape

In [None]:
flights.isnull().sum() # Checking how many null values in each column in our data set

In [None]:
import seaborn as sns

In [None]:
sns.countplot(x="CANCELLATION_REASON",data=flights)

In [None]:
#Reason for Cancellation of flight: A - Airline/Carrier; B - Weather; C - National Air System; D - Security
#We can easily observe from the graph that weather is the most common reason for flight cancellations.

In [None]:
sns.countplot(x="MONTH",hue="CANCELLATION_REASON",data=flights)

In [None]:
flights.isnull().sum()*100/flights.shape[0]

In [None]:
# NOTE: this is an alias, not a copy or a subsample -- df_sample and flights
# refer to the same DataFrame object, so in-place changes to one show up in the other.
df_sample=flights

In [None]:
# Bar chart of the 20 origin airports with the most flights, busiest first.
top_origins = df_sample['ORIGIN_AIRPORT'].value_counts().iloc[:20].index
plt.figure(figsize=(10, 10))
axis = sns.countplot(x=df_sample['ORIGIN_AIRPORT'], data=df_sample, order=top_origins)
# Airport codes overlap when horizontal, so turn the tick labels vertical.
axis.set_xticklabels(axis.get_xticklabels(), rotation=90, ha="right")
plt.tight_layout()
plt.show()

In [None]:
# Strip plot of arrival delay per airline.
# plt.subplots returns a (figure, axes) pair; unpack it and draw on that axes
# explicitly instead of relying on the implicit "current axes".
fig, axis = plt.subplots(figsize=(10,14))
sns.despine(ax=axis, bottom=True, left=True)
# Observations with Scatter Plot
sns.stripplot(x="ARRIVAL_DELAY", y="AIRLINE",
              data = df_sample, dodge=True, jitter=True, ax=axis
            )
plt.show()

In [None]:
# Pie chart of each airline's share of flights.
fig, axis = plt.subplots(figsize=(10,14))
size = df_sample["AIRLINE"].value_counts()
# Take the labels from the same Series as the slice sizes. unique() returns
# airlines in order of first appearance while value_counts() sorts by count,
# so pairing the two attached the wrong airline to each slice.
Name = size.index
axis.pie(size,labels=Name,autopct='%5.0f%%')
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
fig, axis = plt.subplots(figsize=(20,14))
# Restrict to numeric columns first: recent pandas releases no longer drop
# string columns silently in corr() and raise instead.
numeric_corr = df_sample.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr,annot = True, ax=axis)
plt.show()

In [None]:
# Very high correlation between arrival delay and departure delay.
# This suggests that most arrival delays are caused by departure delays.

In [None]:
# Pairwise correlation of the numeric columns only; string columns make
# DataFrame.corr() raise on recent pandas releases.
corr=df_sample.select_dtypes(include='number').corr()
corr

In [None]:
# Columns removed from `flights` by the drop cell that follows.
# NOTE(review): the identifier is misspelled ("varaibles"); it is left unchanged
# because the drop cell refers to it by this exact name.
varaibles_to_remove=['YEAR','FLIGHT_NUMBER',
       'TAIL_NUMBER', 'DEPARTURE_TIME', 'TAXI_OUT',
       'WHEELS_OFF', 'ELAPSED_TIME', 'AIR_TIME',
       'WHEELS_ON', 'TAXI_IN', 'ARRIVAL_TIME', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']

In [None]:
# Drop every column named in varaibles_to_remove, modifying `flights` in place.
flights.drop(columns=varaibles_to_remove, inplace=True)

In [None]:
flights.columns

In [None]:
# Remove the two remaining schedule columns in a single in-place call.
flights.drop(columns=['SCHEDULED_TIME', 'SCHEDULED_ARRIVAL'], inplace=True)

In [None]:
flights.columns

In [None]:
airport=pd.read_csv("../input/flight-delays/airports.csv")

In [None]:
airport.head()

In [None]:
# Any origin/destination code that is not listed in airports.csv is folded
# into a single 'OTHER' bucket.
known_iata_codes = airport.IATA_CODE.values
for airport_column in ('ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'):
    is_unknown = ~flights[airport_column].isin(known_iata_codes)
    flights.loc[is_unknown, airport_column] = 'OTHER'

In [None]:
flights.head()

In [None]:
flights.ORIGIN_AIRPORT.nunique()

In [None]:
flights.DESTINATION_AIRPORT.nunique()

In [None]:
flights.AIRLINE.nunique()

In [None]:
flights.columns

In [None]:
flights.shape

In [None]:
# Keep only rows with no missing value in any of the remaining columns;
# `flights` is rebound to the new, filtered DataFrame.
flights=flights.dropna()

In [None]:
flights.head()

In [None]:
# Replace the numeric DAY_OF_WEEK codes with day names in one vectorised step
# (previously seven copy-pasted pairs of cells).
# The codes follow the US DOT on-time-performance convention, 1 = Monday ...
# 7 = Sunday; the earlier labels were shifted by one day and spelled
# "THRUSDAY". NOTE(review): confirm against a known date by recomputing the
# weekday from YEAR/MONTH/DAY in the raw CSV.
DAY_OF_WEEK_NAMES = {
    1: "MONDAY",
    2: "TUESDAY",
    3: "WEDNESDAY",
    4: "THURSDAY",
    5: "FRIDAY",
    6: "SATURDAY",
    7: "SUNDAY",
}
# replace() leaves values without a mapping untouched, so re-running this cell
# after the column already holds names is a no-op.
flights['DAY_OF_WEEK'] = flights['DAY_OF_WEEK'].replace(DAY_OF_WEEK_NAMES)

In [None]:
flights.head(40)

In [None]:
dd=pd.DataFrame(flights)

In [None]:
dums = ['AIRLINE','ORIGIN_AIRPORT','DESTINATION_AIRPORT','DAY_OF_WEEK']

In [None]:
df_cat=pd.get_dummies(dd[dums],drop_first=True)

In [None]:
df_cat.columns

In [None]:
dd.columns

In [None]:
# The categorical columns listed in `dums` are now represented by df_cat,
# so remove all four of them from dd in one in-place call.
dd.drop(columns=dums, inplace=True)

In [None]:
import pandas as pd

In [None]:
df=pd.concat([dd,df_cat],axis=1)

In [None]:
df.shape

In [None]:
df.head(10)

In [None]:
final_data=df

In [None]:
# Work on a random subset (presumably to keep training time manageable).
# The seed makes the subset, and every score computed from it, reproducible
# after a kernel restart; the min() guard avoids an error when fewer rows remain.
SAMPLE_SIZE = 100000
final_data = final_data.sample(n=min(SAMPLE_SIZE, len(final_data)), random_state=42)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
final_data.head(10)

In [None]:
# Features are every column except the target; the target is DEPARTURE_DELAY.
# NOTE(review): ARRIVAL_DELAY is still among the features. It is presumably
# only known after the flight has landed, which would leak the outcome into a
# departure-delay model -- confirm before relying on the scores below.
X=final_data.drop("DEPARTURE_DELAY",axis=1)
Y=final_data.DEPARTURE_DELAY

In [None]:
Y

In [None]:
X.head(10)

In [None]:
Y.head(10)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Baseline random forest with default hyper-parameters.
# random_state pins the bootstrap/feature sampling so the scores below are
# reproducible; n_jobs=-1 builds the trees on all available cores.
reg_rf = RandomForestRegressor(random_state=0, n_jobs=-1)
reg_rf.fit(X_train,y_train)

In [None]:
y_pred = reg_rf.predict(X_test)

In [None]:
reg_rf.score(X_train,y_train)

In [None]:
reg_rf.score(X_test,y_test)

In [None]:
metrics.r2_score(y_test,y_pred)

In [None]:
# Error metrics of the baseline random forest on the held-out test set.
rf_mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', rf_mse)
print('RMSE:', np.sqrt(rf_mse))

In [None]:
pp=pd.DataFrame({'Actual':y_test,'Predicted':y_pred})
pp.head(10)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 12)]
# Number of features to consider at every split.
# 'auto' is no longer accepted by current scikit-learn releases; for a
# regressor it meant "use all features", which 1.0 (a fraction) states explicitly.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Create the random grid

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [None]:
# Random search of parameters, using 5-fold cross-validation,
# sampling 10 parameter combinations (n_iter = 10) from random_grid and
# ranking them by negative mean squared error.
rf_random = RandomizedSearchCV(estimator = reg_rf, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = 1)

In [None]:
rf_random.fit(X_train,y_train)

In [None]:
rf_random.best_params_

In [None]:
p=rf_random.predict(X_test)

In [None]:
metrics.r2_score(y_test,p)

In [None]:
# Error metrics of the randomized-search model on the held-out test set.
tuned_mse = metrics.mean_squared_error(y_test, p)
print('MAE:', metrics.mean_absolute_error(y_test, p))
print('MSE:', tuned_mse)
print('RMSE:', np.sqrt(tuned_mse))

In [None]:
zz=pd.DataFrame({'Actual':y_test,'Predicted':p})
zz.head(30)

Applying a boosting technique

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
gbr=GradientBoostingRegressor(random_state=0)

In [None]:
GBR=gbr.fit(X_train,y_train)

In [None]:
pre =GBR.predict(X_test)

In [None]:
# Error metrics of the gradient-boosting model on the held-out test set.
gbr_mse = metrics.mean_squared_error(y_test, pre)
print('MAE:', metrics.mean_absolute_error(y_test, pre))
print('MSE:', gbr_mse)
print('RMSE:', np.sqrt(gbr_mse))

In [None]:
metrics.r2_score(y_test,pre)

In [None]:
gg=pd.DataFrame({'Actual':y_test,'Predicted':pre})
gg.head(20)

In [None]:
def predict(MONTH, DAY,SCHEDULED_DEPARTURE,
       DISTANCE, ARRIVAL_DELAY,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DAY_OF_WEEK):
    """Predict the departure delay of a single flight with the fitted `gbr` model.

    The feature row is laid out according to the columns of the global training
    frame `X`.

    Parameters
    ----------
    MONTH, DAY, SCHEDULED_DEPARTURE, DISTANCE, ARRIVAL_DELAY
        Numeric feature values, written into the columns of the same name.
    AIRLINE, ORIGIN_AIRPORT, DESTINATION_AIRPORT, DAY_OF_WEEK : str
        Full one-hot column names, e.g. 'AIRLINE_OO' or 'DAY_OF_WEEK_TUESDAY'.
        A name that is not a column of `X` is left at all zeros; that is how
        the baseline category removed by `drop_first=True` is encoded.

    Returns
    -------
    The first (and only) element of `gbr.predict` for the assembled row.

    Raises
    ------
    KeyError
        If `X` lacks one of the five numeric feature columns.
    """
    numeric_values = {
        'MONTH': MONTH,
        'DAY': DAY,
        'SCHEDULED_DEPARTURE': SCHEDULED_DEPARTURE,
        'DISTANCE': DISTANCE,
        'ARRIVAL_DELAY': ARRIVAL_DELAY,
    }
    missing = [name for name in numeric_values if name not in X.columns]
    if missing:
        raise KeyError(f'X is missing expected feature columns: {missing}')

    features = np.zeros(len(X.columns))
    # Address columns by name rather than by hard-coded position so the row
    # stays correct if the column order of X ever changes.
    for name, value in numeric_values.items():
        features[X.columns.get_loc(name)] = value
    # Looking the position up unconditionally raised IndexError for the
    # baseline categories, so test for membership first.
    for dummy_column in (AIRLINE, ORIGIN_AIRPORT, DESTINATION_AIRPORT, DAY_OF_WEEK):
        if dummy_column in X.columns:
            features[X.columns.get_loc(dummy_column)] = 1

    # Pass a one-row DataFrame carrying the training column names so the
    # model receives the same feature names it was fitted with.
    row = pd.DataFrame([features], columns=X.columns)
    return gbr.predict(row)[0]

In [None]:
predict(5,6,1515,328,-8.0,'AIRLINE_OO','ORIGIN_AIRPORT_PHX','DESTINATION_AIRPORT_ABQ','DAY_OF_WEEK_TUESDAY')