# Data Science Challenge

In [40]:
# Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.max_columns", 101)

## Data Description

Column | Description
:---|:---
`id` | Identification number for the flight.
`dep_stn` | Departure point.
`arr_stn` | Arrival point.
`ac_code` | Aircraft Code. 
`dep_date` | Scheduled departure date.	
`arr_date` | Scheduled arrival date.
`weather` | Observed weather conditions at departure.
`hobbs_meter` | The time in hours that an aircraft has been in use.
`year_man` | Year of manufacturer for the aircraft.
`dep_country` | Departure country.
`arr_country` | Destination country.
`delay` | Number of minutes a flight was delayed.

## Data Wrangling & Visualization

In [41]:
# Dataset is already loaded below
data = pd.read_csv("train.csv")

In [42]:
data.head()

Unnamed: 0,id,dep_stn,arr_stn,ac_code,dep_date,arr_date,weather,hobbs_meter,year_man,dep_country,arr_country,delay
0,TU 0930,TUN,MRS,TU 32AIMD,2016-01-01 07:55:00,2016-01-01 09.30.00,sunny,4390,2004-08,Tunisia,France,18.0
1,TU 0526,TUN,DUS,TU 736IOQ,2016-01-01 08:20:00,2016-01-01 11.05.00,rainy,5882,2010-01,Tunisia,Germany,39.0
2,TU 0718,TUN,ORY,TU 320IMU,2016-01-01 10:05:00,2016-01-01 12.25.00,foggy,6117,2011-03,Tunisia,France,14.0
3,TU 0997,NCE,TUN,TU 320IMT,2016-01-01 10:15:00,2016-01-01 11.50.00,sunny,8941,2010-01,France,Tunisia,25.0
4,TU 0700,TUN,GVA,TU 320IMV,2016-01-01 12:40:00,2016-01-01 14.35.00,stormy,4301,2004-08,Tunisia,Switzerland,21.0


# Data Understanding (for handling categorical variables)
 1. We can see most the parameters in train.csv are categorical in nature.
 2. the count of different categories is very high in almost every variable
 3. therefore, we can't use one-hot encoding or label encoding instead we have to use value_counts approach.
 4. weather and year_man - these 2 variables have less number of different categories so we can use one-hot encoding for the conversion

In [43]:
data.columns

Index(['id', 'dep_stn', 'arr_stn', 'ac_code', 'dep_date', 'arr_date',
       'weather', 'hobbs_meter', 'year_man', 'dep_country', 'arr_country',
       'delay'],
      dtype='object')

In [44]:
# Covert Categorical Variable with Value Counts
def categorical_to_value_counts(data,col_var):
    data_frequency_map = data[col_var].value_counts().to_dict()
    data[col_var] = data[col_var].map(data_frequency_map)
    return data

### Applying Categorical to Value_Count operation on cols having high categorical features

In [45]:
# 1. for 'dep_stn'
data = categorical_to_value_counts(data,'dep_stn')
# 2. for 'arr_stn'
data = categorical_to_value_counts(data,'arr_stn')
# 3. for 'ac_code'
data = categorical_to_value_counts(data,'ac_code')
# 4. for 'dep_date'
data = categorical_to_value_counts(data,'dep_date')
# 5. for 'arr_date'
data = categorical_to_value_counts(data,'arr_date')
# 6. for 'hobbs_meter'
data = categorical_to_value_counts(data,'hobbs_meter')
# 7. for dep_country
data = categorical_to_value_counts(data,'dep_country')
# 8. for arr_country
data = categorical_to_value_counts(data,'arr_country')

### Applying One-Hot Encoding on cols having low number of categorical features

In [46]:
# One Hot Enocding for weather and year_man (run at last)
processed_data = pd.get_dummies(data,columns=['weather','year_man'])

In [47]:
processed_data.head()

Unnamed: 0,id,dep_stn,arr_stn,ac_code,dep_date,arr_date,hobbs_meter,dep_country,arr_country,delay,weather_cloudy,weather_foggy,weather_rainy,weather_stormy,weather_sunny,weather_windy,year_man_1999-11,year_man_2000-05,year_man_2003-02,year_man_2004-08,year_man_2007-05,year_man_2008-12,year_man_2009-04,year_man_2010-01,year_man_2011-03
0,TU 0930,2072,167,176,1,1,176,2471,1010,18.0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
1,TU 0526,2072,46,112,1,1,112,2471,216,39.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
2,TU 0718,2072,421,248,1,1,248,2471,1010,14.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
3,TU 0997,120,1963,247,1,1,247,1051,2478,25.0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
4,TU 0700,2072,59,256,1,1,256,2471,83,21.0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0


# Checking Null Values in the processed data

In [48]:
processed_data.isnull().sum()

id                  0
dep_stn             0
arr_stn             0
ac_code             0
dep_date            0
arr_date            0
hobbs_meter         0
dep_country         0
arr_country         0
delay               0
weather_cloudy      0
weather_foggy       0
weather_rainy       0
weather_stormy      0
weather_sunny       0
weather_windy       0
year_man_1999-11    0
year_man_2000-05    0
year_man_2003-02    0
year_man_2004-08    0
year_man_2007-05    0
year_man_2008-12    0
year_man_2009-04    0
year_man_2010-01    0
year_man_2011-03    0
dtype: int64

In [49]:
processed_data.shape

(5000, 25)

# Splitting Data into Training and Validation set

In [50]:
df = processed_data.drop(['id'], axis=1)

In [51]:
X = df.drop(['delay'], axis=1)
Y = df['delay']

In [52]:
# # splitting X and y into training and validation sets
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.3,
                                                    random_state=1)

In [53]:
X_val.shape

(1500, 23)

# Modelling on the final processed data

In [54]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,mean_absolute_percentage_error
from sklearn.model_selection import KFold, cross_val_score,StratifiedKFold

# Linear Regression

In [55]:
lr=LinearRegression()

In [56]:
lr.fit(X_train,Y_train)

LinearRegression()

In [58]:
y_pred_lr =lr.predict(X_val)

In [59]:
y_pred_lr

array([19.75531882, 22.1836788 , 20.8946544 , ..., 20.84848086,
       21.30320399, 23.18123784])

# Evaluation Metrics for Linear Regression

In [60]:
# MAPE score
Accuracy= mean_absolute_percentage_error(Y_val,y_pred_lr)*100
print(" MAPE using Linear Regresiion is %.2f" %Accuracy)

 MAPE using Linear Regresiion is 87.10


## Random Forest

In [61]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [5,10,15],
    'max_features': [2, 3, 5, 6],
    'min_samples_leaf': [2, 3, 4],
    'min_samples_split': [2,3,4,5],
    'n_estimators': [100,200,300,400,500]
}

In [62]:
# Model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [63]:
# Fit the grid search to the data
grid_search.fit(X_train, Y_train)
grid_search.best_params_

Fitting 3 folds for each of 720 candidates, totalling 2160 fits
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   0.4s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   0.6s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=3, n_estimators=500; total time=   0.6s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=4, n_estimators=500; total time=   0.7s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   0.4s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=3, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END boo

[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   0.4s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=3, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=3, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=3, n_estimators=300; total time=   0.4s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, mi

[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   0.6s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=3, n_estimators=400; total time=   0.5s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=4, n_estimators=400; total time=   0.5s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   0.4s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=3, mi

[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   0.7s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=3, n_estimators=400; total time=   0.5s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=4, n_estimators=300; total time=   0.4s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=4, n_estimators=500; total time=   0.7s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   0.6s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=3, min_samples_split=2, n_estimators=400; total time=   0.5s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=3, mi

[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   0.4s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=3, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=3, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=3, n_estimators=300; total time=   0.4s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=4, n_estimators=300; total time=   0.4s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, mi

[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   0.6s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=3, n_estimators=200; total time=   0.2s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=3, n_estimators=400; total time=   0.5s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=4, n_estimators=400; total time=   0.5s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, mi

[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   0.7s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=3, n_estimators=500; total time=   0.7s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=4, n_estimators=300; total time=   0.4s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   0.2s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=   0.5s
[CV] END bootstrap=True, max_depth=5, max_features=2, min_samples_leaf=3, mi

{'bootstrap': True,
 'max_depth': 5,
 'max_features': 5,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 100}

In [None]:
print(grid_search.best_estimator_)

In [None]:
rfc=RandomForestRegressor(bootstrap=True,max_depth=5, max_features=6, max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=2, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [None]:
model_rf=rfc.fit(X_train,Y_train)

In [None]:
y_pred_rf = model_rf.predict(X_val)

In [None]:
y_pred_rf

# Evaluation Metrics for Random Forest

In [69]:
#MAPE
Accuracy_Using_MAPE = mean_absolute_percentage_error(Y_val,y_pred_rf)*100
print(" MAPE using Random Forest %.2f" %Accuracy_Using_MAPE)

 MAPE using Random Forest 86.89


# Support Vector Regressor

In [70]:
clr = SVR()

In [71]:
# fitting x samples and y classes 
model_svm= clr.fit(X_train,Y_train)

In [72]:
y_pred_svm=model_svm.predict(X_val)

# Evaluation Metrics for SVR

In [73]:
#MAPE
Accuracy_Using_MAPE = mean_absolute_percentage_error(Y_val,y_pred_svm)*100
print("MAPE using SVR is %.2f" %Accuracy_Using_MAPE)

MAPE using SVR is 76.63


# Optimized Model

Random Forest Regressor and Linear Regression are the optimized models as per the MAPE values out of all 3 models.
We will use 'Random Forest' for predicting the output as it is an ensemble model and prevents overfitting.

# Steps to process Test Data (test.csv) in the format required for prediction of output values

In [87]:
# Reading test.csv
test_data = pd.read_csv("test.csv")
test_data.shape

(3000, 11)

In [88]:
# Conversion of Categorical Variable to Value_Counts
# 1. for 'dep_stn'
test_data = categorical_to_value_counts(test_data,'dep_stn')
# 2. for 'arr_stn'
test_data = categorical_to_value_counts(test_data,'arr_stn')
# 3. for 'ac_code'
test_data = categorical_to_value_counts(test_data,'ac_code')
# 4. for 'dep_date'
test_data = categorical_to_value_counts(test_data,'dep_date')
# 5. for 'arr_date'
test_data = categorical_to_value_counts(test_data,'arr_date')
# 6. for 'hobbs_meter'
test_data = categorical_to_value_counts(test_data,'hobbs_meter')
# 7. for dep_country
test_data = categorical_to_value_counts(test_data,'dep_country')
# 8. for arr_country
test_data = categorical_to_value_counts(test_data,'arr_country')

In [89]:
# One hot encoding
processed_test_data = pd.get_dummies(test_data,columns=['weather','year_man'])

In [90]:
processed_test_data.isnull().sum()

id                  0
dep_stn             0
arr_stn             0
ac_code             0
dep_date            0
arr_date            0
hobbs_meter         0
dep_country         0
arr_country         0
weather_cloudy      0
weather_foggy       0
weather_rainy       0
weather_stormy      0
weather_sunny       0
weather_windy       0
year_man_1999-11    0
year_man_2000-05    0
year_man_2003-02    0
year_man_2004-08    0
year_man_2007-05    0
year_man_2008-12    0
year_man_2009-04    0
year_man_2010-01    0
year_man_2011-03    0
dtype: int64

In [91]:
df_test = processed_test_data.drop(['id'], axis=1)

In [92]:
X_test = df_test

In [93]:
X_test.shape

(3000, 23)

In [94]:
y_pred_rf_test = model_rf.predict(X_test)

In [95]:
y_pred_rf_test

array([21.08723515, 19.77436126, 19.82042427, ..., 20.07294225,
       20.08941905, 20.00818992])

In [96]:
processed_test_data['delay'] = y_pred_rf_test

In [97]:
df_final = processed_test_data

In [98]:
submissions =df_final[['id','delay']]
submissions

Unnamed: 0,id,delay
0,TU 0216,21.087235
1,TU 0543,19.774361
2,TU 0527,19.820424
3,UG 0011,21.073385
4,TU 0214,21.131056
...,...,...
2995,TU 0635,19.707816
2996,TU 0282,20.306089
2997,TU 6648,20.072942
2998,TU 0283,20.089419


#Submission
submission_df.to_csv('submissions.csv',index=False)

In [100]:
#Submission
submissions.to_csv('submissions.csv',index=False)