### Importing Libraries : 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import calendar

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, f1_score,precision_score, roc_auc_score

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso


from sklearn.tree import DecisionTreeClassifier

### Data Reading and Cleaning :

In [2]:

flight_06 = pd.read_csv('/Users/rak/Desktop/2006.csv')
flight_07 = pd.read_csv('/Users/rak/Desktop/2007.csv')

airports = pd.read_csv('/Users/rak/Desktop/airports.csv')
carriers = pd.read_csv('/Users/rak/Desktop/carriers.csv')
plane_data = pd.read_csv('/Users/rak/Desktop/plane-data.csv')

In [3]:
# Dropping rows with cancelled or diverted flights:

flight_06=flight_06[flight_06['Cancelled']==0] 
flight_06=flight_06[flight_06['Diverted']==0] 

flight_07=flight_07[flight_07['Cancelled']==0]
flight_07=flight_07[flight_07['Diverted']==0] 

In [4]:
# Merging the Dataframes:

flight_data = pd.concat([flight_06, flight_07],ignore_index=True)

In [5]:
# Creating TotalDelay to caluculate total delay of the flight:
flight_data['TotalDelay'] = flight_data['DepDelay'] + flight_data["ArrDelay"]


### Question 5 :

To answer this question, delay was predicted by classifying delay times in order to create classification models to predict significant or moderate delays and simply total delay was predicted by using regression models.

In [6]:
flight_data["TotalDelay"].describe()

count    1.427909e+07
mean     2.017482e+01
std      7.143139e+01
min     -1.196000e+03
25%     -1.200000e+01
50%     -1.000000e+00
75%      2.200000e+01
max      5.199000e+03
Name: TotalDelay, dtype: float64

The DelayClass column was created to classify the flights based on their Total Delay times. 
Total Delay less than 15 minutes was considered to be a moderate or no delay
Total Delay above 15 minutes was considered to be a significant delay.

In [7]:
# Counting flights with total delay less than 15 minutes
num_flights_delay_neg = len(flight_data[flight_data['TotalDelay']<15])

# Counting flights with total delay greater than 15 minutes
num_flights_delay_pos = len(flight_data[flight_data['TotalDelay']>15])

print("Number of flights with no or moderate delay:", num_flights_delay_neg)
print("Number of flights with significant delay:", num_flights_delay_pos)

Number of flights with no or moderate delay: 10006353
Number of flights with significant delay: 4159827


In [8]:
# Creating new list to classify total delay
delayclass = []

for delay in flight_data['TotalDelay']:
    if delay > 15:
        delayclass.append(1)    # For delay more than 15 mins
    else:
        delayclass.append(0)    # For moderate delays or flight on time 
        
flight_data['DelayClass'] = delayclass
flight_data

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,TotalDelay,DelayClass
0,2006,1,11,3,743.0,745,1024.0,1018,US,343,...,0,,0,0,0,0,0,0,4.0,0
1,2006,1,11,3,1053.0,1053,1313.0,1318,US,613,...,0,,0,0,0,0,0,0,-5.0,0
2,2006,1,11,3,1915.0,1915,2110.0,2133,US,617,...,0,,0,0,0,0,0,0,-23.0,0
3,2006,1,11,3,1753.0,1755,1925.0,1933,US,300,...,0,,0,0,0,0,0,0,-10.0,0
4,2006,1,11,3,824.0,832,1015.0,1015,US,765,...,0,,0,0,0,0,0,0,-8.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14279085,2007,12,15,6,1558.0,1605,1749.0,1736,DL,58,...,0,,0,0,0,0,0,0,6.0,0
14279086,2007,12,15,6,1902.0,1851,2110.0,2105,DL,59,...,0,,0,0,0,0,0,0,16.0,1
14279087,2007,12,15,6,1024.0,1025,1750.0,1735,DL,61,...,0,,0,0,0,15,0,0,14.0,0
14279088,2007,12,15,6,1353.0,1315,1658.0,1622,DL,62,...,0,,0,0,0,0,0,36,74.0,1


Selecting relevant features:

only relevant columns were selected to be used in the models.

UniqueCarrier column was dropped as it contains over 1000 unique carrier names.

Cancelled and diverted flights were also dropped as flight_data only contains flights that have not been diverted and cancelled.


In [9]:
# Creating a new dataframe 'class_model' with only relevant columns for classification models:
class_model = flight_data[['Month','DayOfWeek','DayofMonth','Origin','Dest','CRSElapsedTime','Distance','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay','DelayClass']]

# Creating a new dataframe 'reg_model' with only relevant columns for regression models:
reg_model = flight_data[['Month','DayOfWeek','DayofMonth','Origin','Dest','CRSElapsedTime','Distance','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay','TotalDelay']]

### Classificaiton Models:

Three models were used to predict whether the delay was moderate or significant
1. Random Forest Classifier
2. Decision Tree Classifier
3. Gradient Boosting

The three model's performace was assessed by training the model on the train set and evaluating its performance by calculating the following on the predictions made of the test set:
1. Accuracy - measures the proportion of correctly classified instances
2. Precision - measures the proportion of true positives out of predicted positives
3. Recall - measures the proportion of true positives out of actual positives
4. F1 Score - a weighted harmonic mean of precision and recall that provides a balance between the two

Preparing the data:

1. Creating dummy variables for Origin and Destination
2. Performing a Train Test Split

In [12]:
# Creating a sample around 10% of original data:
class_model= class_model.sample(n=150000, random_state = 40)


# Create dummy variables for 'Origin' and 'Dest' columns
origin_dummies = pd.get_dummies(class_model['Origin'], prefix='Origin')
dest_dummies = pd.get_dummies(class_model['Dest'], prefix='Dest')

# Add dummy variables to the 'class_model' dataframe
class_model = pd.concat([class_model, origin_dummies, dest_dummies], axis=1)

# Drop the 'Origin' and 'Dest' columns from the dataframe
class_model = class_model.drop(['Origin', 'Dest'], axis=1)

class_model
class_model.shape

(150000, 608)

In [13]:
class_model

Unnamed: 0,Month,DayOfWeek,DayofMonth,CRSElapsedTime,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,...,Dest_TYR,Dest_TYS,Dest_VLD,Dest_VPS,Dest_WRG,Dest_WYS,Dest_XNA,Dest_YAK,Dest_YKM,Dest_YUM
1578131,3,1,13,269.0,1745,2,0,22,0,115,...,0,0,0,0,0,0,0,0,0,0
7487713,1,3,24,150.0,762,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6370827,11,1,13,174.0,1086,0,105,5,0,0,...,0,0,0,0,0,0,0,0,0,0
8233476,3,3,28,125.0,697,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6459245,12,6,16,160.0,1072,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10045673,6,1,18,100.0,580,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8036912,2,6,24,75.0,236,28,0,0,0,37,...,0,0,0,0,0,0,0,0,0,0
4332522,8,7,6,132.0,728,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14032885,12,2,25,173.0,1024,50,0,43,0,68,...,0,0,0,0,0,0,0,0,0,0


Train Test Split:

Splitting a dataset into two separate sets, one for training the model and one for testing the model.
he training set is used to build the model, while the testing set is used to evaluate how well the model is able to generalize to new, unseen data. 

In [14]:
# Features
X = class_model.drop('DelayClass', axis=1)

# Target Variable
Y = class_model['DelayClass']

# Splitting the data into 70% and 30% test data:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

#### Model 1 : Random Forest Classifier :

In [15]:
# Initializing Random Forest Claassifier:
model2 = RandomForestClassifier(random_state=42)
model2.fit(X_train, y_train)

# Predicting on the test data
y_pred_forest = model2.predict(X_test)
prob2 = model2.predict_proba(X_test)
# Evaluating the model performance
accuracy_forest = accuracy_score(y_test, y_pred_forest)
precision_forest = precision_score(y_test, y_pred_forest, average='weighted')
recall_forest = recall_score(y_test, y_pred_forest, average='weighted')
f1_forest = f1_score(y_test, y_pred_forest, average='weighted')

print("Accuracy:", accuracy_forest)
print("Precision:", precision_forest)
print("Recall:", recall_forest)
print("F1 Score:", f1_forest)


Accuracy: 0.9290444444444445
Precision: 0.9312877125205553
Recall: 0.9290444444444445
F1 Score: 0.9268418014260665


#### Model 2:  Decision Tree classifier:

In [16]:
model3= DecisionTreeClassifier()
model3.fit(X_train,y_train)

# predicting 
y_pred_dec = model3.predict(X_test)
prob3 = model3.predict_proba(X_test)


# Evaluating the model performance:
accuracy_dec = accuracy_score(y_test, y_pred_dec)
precision_dec = precision_score(y_test,y_pred_dec,average='weighted')
recall_dec = recall_score(y_test, y_pred_dec, average='weighted')
f1_dec = f1_score(y_test, y_pred_dec, average='weighted')

print('Accuracy:', accuracy_dec)
print("Precision:", precision_dec)
print("Recall:", recall_dec)
print("F1 Score:", f1_dec)

Accuracy: 0.8812666666666666
Precision: 0.8812119661269328
Recall: 0.8812666666666666
F1 Score: 0.8812390977682752


#### Model 3: Gradient Boosting:

In [17]:


# Fitting the Model:
model4 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
     max_depth=1, random_state=0).fit(X_train, y_train)

# predicting
#prob4 = model4.predict_proba(X_test)

y_pred_g = model4.predict(X_test)
# Evaluating the model performance:
accuracy_g = accuracy_score(y_test, y_pred_g)
precision_g = precision_score(y_test,y_pred_g,average='weighted')
recall_g = recall_score(y_test, y_pred_g, average='weighted')
f1_g = f1_score(y_test, y_pred_g, average='weighted')

print('Accuracy:', accuracy_g)
print("Precision:", precision_g)
print("Recall:", recall_g)
print("F1 Score:", f1_g)

Accuracy: 0.9318888888888889
Precision: 0.9357030359645608
Recall: 0.9318888888888889
F1 Score: 0.9294408545550515


In [18]:
# printing Dataframe of Scores:
scores = {
    "accuracy": [accuracy_forest, accuracy_g, accuracy_dec],
    "precision": [precision_forest, precision_g, precision_dec],
    "recall": [recall_forest, recall_g, recall_dec],
    "F1 score": [f1_forest, f1_g, f1_dec]
}

# Create dataframe
class_scores = pd.DataFrame(scores, index=["Random Forest", "Gradient Boosting", "Decision Tree"])

# Display dataframe
print(class_scores)

               accuracy  precision    recall  F1 score
Random Forest  0.929044   0.931288  0.929044  0.926842
XGBoost        0.931889   0.935703  0.931889  0.929441
Decision Tree  0.881267   0.881212  0.881267  0.881239


### Regression Models:

Three models were used to predict the value of TotalDelay

1. Random Forest Regressor
2. Lasso Regression
3. Decision Tree Classifier

The three model's performace was assessed by training the model on the train set and evaluating its performance by calculating the following on the predictions made of the test set:

1. r-square - measure that represents the proportion of the variance in the dependent variable that is explained by the independent variables
2. Root Mean Square Error(rmse) - measure of the differences between the predicted and actual values of the dependent variable

Similar to the classification model the data for regression models also needed to be prepared by:
1. Creating dummy variables for Origin and Destination
2. Performing a Train Test Split

In [19]:
# Creating a sample around 10% of original data:
reg_model= reg_model.sample(n=150000, random_state = 42)


# Create dummy variables for 'Origin' and 'Dest' columns
origin_dummies = pd.get_dummies(reg_model['Origin'], prefix='Origin')
dest_dummies = pd.get_dummies(reg_model['Dest'], prefix='Dest')

# Add dummy variables to the 'reg_model' dataframe
reg_model = pd.concat([reg_model, origin_dummies, dest_dummies], axis=1)

# Drop the 'Origin' and 'Dest' columns from the dataframe
reg_model = reg_model.drop(['Origin', 'Dest'], axis=1)

reg_model
reg_model.shape

(150000, 608)

In [20]:
# Features
X = reg_model.drop('TotalDelay', axis=1)

# Target Variable
Y = reg_model['TotalDelay']

# Splitting the data into 70% and 30% test data:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

#### Model 1: Random Forest Regressor:

In [21]:
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)

# Fitting the model:
rf_reg.fit(X_train, y_train)

# predicting:
y_pred_rf = rf_reg.predict(X_test)


rmse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)
r2_rf = r2_score(y_test, y_pred_rf)


print("RMSE: {:.2f}".format(rmse_rf))
print("R2: {:.2f}".format(r2_rf))


RMSE: 18.12
R2: 0.94


#### Model 2: Lasso Regression:

In [22]:
lasso = Lasso(alpha=0.1, random_state=42)

# Fitting the model:
lasso.fit(X_train, y_train)

# Predicting
y_pred_l = lasso.predict(X_test)


rmse_l = mean_squared_error(y_test, y_pred_l, squared=False)
print("RMSE:", rmse_l)


r2_l = r2_score(y_test, y_pred_l)
print("R2:", r2_l)


RMSE: 16.902318099731804
R2: 0.9454309727305391


#### Model 3: Gradient Boosting Regressor:

In [24]:
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Fitting the model:
gbr.fit(X_train, y_train)

# Predicting:
y_pred_gbr = gbr.predict(X_test)

rmse_gbr = mean_squared_error(y_test, y_pred_gbr, squared=False)
print("RMSE:", rmse_gbr)

# Calculate R2 score
r2_gbr = r2_score(y_test, y_pred_gbr)
print("R2:", r2_gbr)


RMSE: 17.026028678209755
R2: 0.9446292518281002


In [25]:
# Printing dataframe of Scores:
scores = {
    "RMSE": [rmse_l, rmse_rf, rmse_gbr],
    "R2": [r2_l, r2_rf, r2_gbr]
}


reg_scores = pd.DataFrame(scores, index=["Lasso Regression", "Random Forest Regression", "Gradient Boosting Regression"])

# Display dataframe
print(reg_scores)

                                   RMSE        R2
Lasso Regression              16.902318  0.945431
Random Forest Regression      18.122606  0.937267
Gradient Boosting Regression  17.026029  0.944629
