In [None]:
#Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
#Loading the dataset
# Lift pandas' column-display cap first so wide previews show every column.
pd.set_option("display.max_columns", None)
df = pd.read_csv("flightdata.csv")
df.head(n = 5)

In [None]:
#Dropping unnecessary columns
# NOTE(review): 'Unnamed: 25' is presumably an artifact of a trailing delimiter
# in the CSV — confirm against the raw file.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# execution is a no-op instead of a KeyError.
df = df.drop(columns = 'Unnamed: 25', errors = 'ignore')

In [None]:
#Dataset Info
# Summarise the frame: one line per column with its dtype and non-null count.
df.info()

In [None]:
#Handling missing values
# Count the missing entries in each column before any rows are removed.
df.isna().sum()

In [None]:
#Dropping the missing values
# Remove, in place, every row that lacks a DEP_TIME or an ARR_DELAY value.
df.dropna(
    axis = 0,
    how = 'any',
    subset = ['DEP_TIME', 'ARR_DELAY'],
    inplace = True,
)

In [None]:
# Re-check after the drop: DEP_TIME and ARR_DELAY should now report zero missing values.
df.isna().sum()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
#Data Visualization
# Horizontal bar chart of the number of flights per origin airport, with the
# bars ordered by flight count (value_counts() sorts most frequent first).
ax = sns.countplot(y = df['ORIGIN'], order = df['ORIGIN'].value_counts().index);
# Title spelling corrected ("Depature" -> "Departure").
ax.set_title("Airports w.r.t Departure Flights", fontsize = 16);
ax.set_xlabel("Number of Flights", fontsize = 14);
ax.set_ylabel("Airport Code", fontsize = 14);
# Write each bar's count in white, centred inside the bar.
ax.bar_label(ax.containers[0], label_type = 'center', color = 'white', size = 14);

In [None]:
# Horizontal bar chart of the number of flights per destination airport,
# ordered by flight count (most frequent destination first).
ax = sns.countplot(
    y = df['DEST'],
    order = df['DEST'].value_counts().index,
);
ax.set_title(label = "Airports w.r.t Arrival Flights", fontsize = 16);
ax.set_xlabel(xlabel = "Number of Flights", fontsize = 14);
ax.set_ylabel(ylabel = "Airport Code", fontsize = 14);
# Annotate every bar with its count, drawn in white at the bar's centre.
ax.bar_label(
    ax.containers[0],
    label_type = 'center',
    color = 'white',
    size = 14,
);

In [None]:
# Side-by-side pies: share of delayed departures (left) and delayed arrivals (right).
fig, ax = plt.subplots(1, 2, figsize = (10,10))
pie_labels = ['On Time', 'Delayed']
# value_counts() orders by frequency, so on its own the slice order depends on
# which class is the majority. sort_index() pins the order to the flag value
# (lower flag first), keeping the slices aligned with pie_labels even if
# delayed flights outnumber on-time ones.
# NOTE(review): assumes the *_DEL15 flags are 0 = on time, 1 = delayed — confirm.
dep_counts = df['DEP_DEL15'].value_counts().sort_index()
arr_counts = df['ARR_DEL15'].value_counts().sort_index()
ax[0].pie(dep_counts, labels = pie_labels, autopct = '%1.2f%%', startangle = 90, explode = (0,0.1));
ax[0].set_title("Ratio of Delayed Departure Flights");
ax[1].pie(arr_counts, labels = pie_labels, autopct = '%1.2f%%', startangle = 90, explode = (0,0.1));
ax[1].set_title("Ratio of Delayed Arrival Flights");

In [None]:
# Correlation heatmap over the numeric columns only. The frame still contains
# string columns here (e.g. the ORIGIN/DEST airport codes), and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0, so they are filtered out first.
sns.heatmap(df.select_dtypes(include = 'number').corr());

In [None]:
# One-hot encode the ORIGIN and DEST columns into ORIGIN_* / DEST_* indicator
# columns; every other column is carried over unchanged.
new_df = pd.get_dummies(
    data = df,
    columns = ['ORIGIN', 'DEST'],
)

In [None]:
# Preview the first rows of the one-hot encoded frame.
new_df.head()

In [None]:
#Splitting into independent and dependent values
# Feature matrix: calendar fields, one-hot origin/destination indicators, and
# the departure/arrival time fields. The column order below is the order the
# model is trained on.
X = new_df[[
    'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK',
    'ORIGIN_ATL', 'ORIGIN_DTW', 'ORIGIN_JFK', 'ORIGIN_MSP', 'ORIGIN_SEA',
    'DEST_ATL', 'DEST_DTW', 'DEST_JFK', 'DEST_MSP', 'DEST_SEA',
    'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DEL15', 'CRS_ARR_TIME',
]]
# Target: the ARR_DEL15 column.
y = new_df['ARR_DEL15']

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Preview the first values of the target column.
y.head()

In [None]:
# (rows, feature columns) of the feature matrix.
X.shape

In [None]:
# Length of the target; its row count should match X's.
y.shape

In [None]:
#Splitting into training and testing data
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. The fixed random_state makes the split,
# and therefore the reported accuracy and the saved model, reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 25)

In [None]:
#Model Building
from sklearn.tree import DecisionTreeClassifier
# Decision tree capped at depth 4 that needs at least 4 samples to split a
# node; random_state fixes the estimator's internal randomness.
clf = DecisionTreeClassifier(
    max_depth = 4,
    min_samples_split = 4,
    random_state = 25,
)

In [None]:
#Model Training
# Fit the tree on the training split only; the test split is kept for evaluation.
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test features.
pred = clf.predict(X_test)

In [None]:
#Model Evaluation
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label equals the true ARR_DEL15 label.
print(accuracy_score(y_true = y_test, y_pred = pred))

In [None]:
# Predict for one hand-built flight: MONTH=1, DAY_OF_MONTH=4, DAY_OF_WEEK=1,
# ORIGIN_DTW=1, DEST_SEA=1, CRS_DEP_TIME=1215, DEP_TIME=1236, DEP_DEL15=1,
# CRS_ARR_TIME=1420 (all other airport indicators 0).
# The row is wrapped in a DataFrame carrying X's column names so the input
# matches what the model was fitted on; a bare list makes scikit-learn warn
# that "X does not have valid feature names" and hides the column order.
sample_flight = pd.DataFrame(
    [[1,4,1,0,1,0,0,0,0,0,0,0,1,1215,1236,1,1420]],
    columns = X.columns,
)
clf.predict(sample_flight)

In [None]:
#Saving the model
import pickle
# The context manager closes the file once the dump finishes, so the bytes are
# flushed to disk and no handle is left open in the kernel.
with open('flightclf.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)