In [1]:
# IMPORTS
import io
from datetime import datetime

#DATA WRANGLERS
import pandas as pd 
import numpy as np

# ML REQUIRMENTS
from sklearn.multioutput import MultiOutputClassifier # to predict for mulitple features at once
from sklearn.model_selection import train_test_split # 80/20 split with shuffle 
from sklearn.pipeline import make_pipeline # link normalizing method with classifier 
from sklearn.preprocessing import StandardScaler # normalizing method 

#CLASSIFIERS
from sklearn.naive_bayes import GaussianNB # applied 
from sklearn.linear_model import LogisticRegression # applied 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # applied 
from sklearn.neighbors import KNeighborsClassifier # applied 
from sklearn.tree import DecisionTreeClassifier # applied 
from sklearn.svm import SVC # applied 
from sklearn.ensemble import RandomForestClassifier # applied
#ANALYTICS
import time
from sklearn.metrics import accuracy_score 
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score 
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [2]:
#IMPORT FILE :  
from google.colab import files
uploaded = files.upload() 

Saving ML Data.xlsx - Formatted Data.csv to ML Data.xlsx - Formatted Data.csv


In [3]:
# this must be a csv of the same name of the import file after downloading 
data = pd.read_csv(io.BytesIO(uploaded['ML Data.xlsx - Formatted Data.csv'])) 

# Training Data 
The training dataframe has three main segments, intervention descriptions, features and DID results. 

**Intervention description** includes date of implimentation in YYYY-MM-DD and a paragraph of the intervention. 

**The features** the serve as the parameters, which are used to guess the labels, the DID results.

**The DID results** were determined for all historical interventions obtained, they represent if the specific type of travel were impacted significantly. Done for Domestic, International, and Transborder. 



In [4]:
df = pd.DataFrame (data) # create new df for manipulation to ensure original data integrity
cols_df = df.columns # store original columns as these will be used for final (oppertunity to verify and append long term)
df.head() #(184, 7)

Unnamed: 0,Date,Description,Intervention type,Direct Flight Reduction,new_cases,new_deaths,total_deaths,Domestic Significant,International Significant,Transborder Significant
0,2020-01-29,Travel advisory warning against non-essential ...,Restrictions,1,0,0,0,0,1,0
1,2020-02-29,Travel advisory warning against all travel to ...,Restrictions,1,5,0,0,0,0,0
2,2020-03-02,Travel advisory warning against all non-essent...,Restrictions,1,7,0,0,0,0,0
3,2020-03-06,Travel advisory warning against all travel to ...,Restrictions,1,19,0,0,0,0,0
4,2020-03-09,Travel advisory warning against all cruise shi...,Restrictions,0,24,1,1,1,1,1


# Data Preperation
The data utilized in this model was developed through the groups analysis. The group ensures that:
- there are no categorical columns
- there are no NaN values 
The only required cleaning is ensureing that we use recognizable dates. 

In [5]:
#CLEAN 

df["Date"] = pd.to_datetime(df["Date"])

interv_type_vals = list(df["Intervention type"].unique())
interv_type_vals_no = range(0, len(interv_type_vals)) 

## flight type numerical to categorical legend
interv_type_legend = set(zip(interv_type_vals_no, interv_type_vals))

# convert categorical to numerical manually 
df['Intervention type'].replace(interv_type_vals, interv_type_vals_no, inplace=True)
df['Intervention type'] = df['Intervention type'].astype('category').cat.codes

# missing values
#list with all columns with nan
nan_colm = df.columns[df.isna().any()].tolist() 
# if there are NaN columns remove entire row
if len(nan_colm) > 0 :
  df = df.dropna()
else: pass

In [6]:
df

Unnamed: 0,Date,Description,Intervention type,Direct Flight Reduction,new_cases,new_deaths,total_deaths,Domestic Significant,International Significant,Transborder Significant
0,2020-01-29,Travel advisory warning against non-essential ...,0,1,0,0,0,0,1,0
1,2020-02-29,Travel advisory warning against all travel to ...,0,1,5,0,0,0,0,0
2,2020-03-02,Travel advisory warning against all non-essent...,0,1,7,0,0,0,0,0
3,2020-03-06,Travel advisory warning against all travel to ...,0,1,19,0,0,0,0,0
4,2020-03-09,Travel advisory warning against all cruise shi...,0,0,24,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
179,2022-04-01,Fully vaccinated travellers no longer required...,0,1,12470,69,37666,0,1,1
180,2022-04-05,New recommendations announced for a 4th dose (...,14,0,8904,43,37772,0,1,1
181,2022-04-14,Use of Evusheld COVID-19 vaccine authorized fo...,19,0,18945,130,38289,1,0,0
182,2022-04-21,$379 million will be invested into long-term c...,7,0,10515,121,38654,0,0,1


## Bootstrapping
Bootstrapping is a technique which increases sample size used, this is necessary because the sample size is relativley small. This will increase the generality of the models developed to **reduce overfitting**. However due to the scarcity of data regarding interventions, the model is not broad enough to advise on all possible solution types. This model is intented to be updated as new interventions are implimented and observed, to grow the knowledge base and increase accuracy of the model.  

In [7]:
# boot strapping 
df_boot = df.sample(frac = 100, replace = True) # 100 times more observations
df_boot.shape

(18400, 10)

In [8]:
# SEPERATE TRAINING LABELS AND FEATURES
# features
X = pd.DataFrame({
    "Intervention type" : df_boot["Intervention type"], 
    "Direct Flight Reduction " : df_boot["Direct Flight Reduction "],
    "new_cases" : df_boot["new_cases"],
    "new_deaths" : df_boot["new_deaths"],
    "total_deaths" : df_boot["total_deaths"]
})
# labels
y = df_boot.drop(['Date', 'Description', 'Intervention type', 'Direct Flight Reduction ', 'new_cases', 'new_deaths', 'total_deaths'], axis = 1)

# SPLIT TRAINING AND TESTING
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 5406, shuffle= True)

# Classifier Selection 
The model can use any classifier. Here f1 scores are micro-averaged and used as the primary decision factor. Micro-averages are used because this is a multi-label classifier and all labells are equally important. 

The following classifers have been evaluated for performance:
 - GaussianNB
 - Logistic Reg	
 - KNNeighbors Classifier	
 - Decision Tree	
 - SVC
 - Random Forest



## Evaluation of Classifiers
A table is generated storing KPIs for all classifier after exicution. 

In [9]:
#function to evaluate classifiers and generate readonly classification_report
def evaluate_classifier(test_labels, prediction):
  acc = accuracy_score(test_labels, prediction)
  recall = recall_score(test_labels, prediction, average = 'micro')
  prec = precision_score(test_labels, prediction, average = 'micro')
  f1 = f1_score(test_labels, prediction, average = "micro")
  print(classification_report(test_labels, prediction))
  print(f'Accuracy = {round(acc, 4)} \nRecall = {round(recall,4)}\nPrecision = {round(prec,4)}\nF1-Score = {round(f1,4)}')
  return acc, recall, prec, f1

In [10]:
#function to append to df
def add_to_df(df,class_name, ac, re, per, f1):
  df.loc[len(df.index)] = [class_name, ac, re, per,f1]

# data_frame for classifier performance 
df_eval_classifiers = pd.DataFrame(columns = ['Classifier', 'Accuracy', 'Recall', 'Precision', "F1-score"])
#df_eval_classifiers

In [11]:
# Naiive Bayers pipeline
model_gnb = make_pipeline(StandardScaler(), MultiOutputClassifier(GaussianNB()))
# fit model 
model_gnb.fit(X_train, y_train)
y_pred = model_gnb.predict(X_test)
#eval
gnb = evaluate_classifier(y_test, y_pred)
#store to df
add_to_df(df_eval_classifiers, "GaussianNB", gnb[0], gnb[1], gnb[2], gnb[3])

              precision    recall  f1-score   support

           0       0.65      0.49      0.56      2136
           1       0.61      0.53      0.57      1977
           2       0.68      0.33      0.45      1954

   micro avg       0.64      0.45      0.53      6067
   macro avg       0.65      0.45      0.52      6067
weighted avg       0.65      0.45      0.52      6067
 samples avg       0.35      0.37      0.34      6067

Accuracy = 0.1864 
Recall = 0.4518
Precision = 0.6401
F1-Score = 0.5297


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Logistic Regression pipeline
model_lr = make_pipeline(StandardScaler(), MultiOutputClassifier(LogisticRegression()))
# fit model 
model_lr.fit(X_train, y_train)
y_pred = model_lr.predict(X_test)
#eval
lr = evaluate_classifier(y_test, y_pred)
#store to df
add_to_df(df_eval_classifiers, "Logistic Reg", lr[0], lr[1], lr[2], lr[3])

              precision    recall  f1-score   support

           0       0.62      0.83      0.71      2136
           1       0.58      0.64      0.61      1977
           2       0.59      0.69      0.63      1954

   micro avg       0.60      0.72      0.65      6067
   macro avg       0.59      0.72      0.65      6067
weighted avg       0.59      0.72      0.65      6067
 samples avg       0.54      0.62      0.54      6067

Accuracy = 0.2141 
Recall = 0.7239
Precision = 0.5957
F1-Score = 0.6536


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# K Neighbors pipeline
model_KN = make_pipeline(StandardScaler(), MultiOutputClassifier(KNeighborsClassifier()))
# fit model 
model_KN.fit(X_train, y_train)
y_pred = model_KN.predict(X_test)
#eval
kn = evaluate_classifier(y_test, y_pred)
#store to df
add_to_df(df_eval_classifiers, "KNeighbors Classifier", kn[0], kn[1], kn[2], kn[3])

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2136
           1       1.00      1.00      1.00      1977
           2       1.00      1.00      1.00      1954

   micro avg       1.00      1.00      1.00      6067
   macro avg       1.00      1.00      1.00      6067
weighted avg       1.00      1.00      1.00      6067
 samples avg       0.85      0.85      0.85      6067

Accuracy = 1.0 
Recall = 1.0
Precision = 1.0
F1-Score = 1.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
t0 = time.time()
# Decision Tree pipeline
model_td = make_pipeline(StandardScaler(), MultiOutputClassifier(DecisionTreeClassifier()))
# fit model 
model_td.fit(X_train, y_train)
y_pred = model_td.predict(X_test)
#eval
td = evaluate_classifier(y_test, y_pred)
#store to df
add_to_df(df_eval_classifiers, "Decision Tree", td[0], td[1], td[2], td[3])
t1 = time.time()
dt_total = t1-t0

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2136
           1       1.00      1.00      1.00      1977
           2       1.00      1.00      1.00      1954

   micro avg       1.00      1.00      1.00      6067
   macro avg       1.00      1.00      1.00      6067
weighted avg       1.00      1.00      1.00      6067
 samples avg       0.85      0.85      0.85      6067

Accuracy = 1.0 
Recall = 1.0
Precision = 1.0
F1-Score = 1.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
print(f'The Decision Tree algorithm takes {round(dt_total,2)}s to complete')

The Decision Tree algorithm takes 0.47s to complete


In [16]:
# SVC pipeline
model_svc = make_pipeline(StandardScaler(), MultiOutputClassifier(SVC()))
# fit model 
model_svc.fit(X_train, y_train)
y_pred = model_svc.predict(X_test)
#eval
svc = evaluate_classifier(y_test, y_pred)
#store to df
add_to_df(df_eval_classifiers, "SVC", svc[0], svc[1], svc[2], svc[3])

              precision    recall  f1-score   support

           0       0.80      0.89      0.84      2136
           1       0.81      0.84      0.82      1977
           2       0.86      0.82      0.84      1954

   micro avg       0.82      0.85      0.84      6067
   macro avg       0.82      0.85      0.84      6067
weighted avg       0.82      0.85      0.84      6067
 samples avg       0.70      0.71      0.69      6067

Accuracy = 0.6269 
Recall = 0.8493
Precision = 0.8219
F1-Score = 0.8354


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
t0 = time.time()
# Random Forest pipeline
model_rf = make_pipeline(StandardScaler(), MultiOutputClassifier(RandomForestClassifier()))
# fit model 
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)
#eval
rf = evaluate_classifier(y_test, y_pred)
#store to df
add_to_df(df_eval_classifiers, "Random Forest", rf[0], rf[1], rf[2], rf[3])
t1 = time.time()
rf_total = t1-t0

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2136
           1       1.00      1.00      1.00      1977
           2       1.00      1.00      1.00      1954

   micro avg       1.00      1.00      1.00      6067
   macro avg       1.00      1.00      1.00      6067
weighted avg       1.00      1.00      1.00      6067
 samples avg       0.85      0.85      0.85      6067

Accuracy = 1.0 
Recall = 1.0
Precision = 1.0
F1-Score = 1.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
print(f'The Decision Tree algorithm takes {round(rf_total,2)}s to complete')

The Decision Tree algorithm takes 2.11s to complete


In [19]:
df_eval_classifiers

Unnamed: 0,Classifier,Accuracy,Recall,Precision,F1-score
0,GaussianNB,0.186413,0.451788,0.640121,0.529713
1,Logistic Reg,0.21413,0.723916,0.595687,0.653571
2,KNeighbors Classifier,1.0,1.0,1.0,1.0
3,Decision Tree,1.0,1.0,1.0,1.0
4,SVC,0.626902,0.849349,0.82185,0.835373
5,Random Forest,1.0,1.0,1.0,1.0


# Final Prediction Model 
Below is the function where Canadian Govornment agents will input intervention data to predict which flights are likley to be impacted. 



In [20]:
# function to ask question and deal with incorrect input
def ask_q(question, possible_ans, data_type):
  para = data_type(input(question))
  #handle incorrect inputs 
  if not para in possible_ans:
    print(f'Please try again, possible answers are: \n {possible_ans}.')
    return ask_q(question, possible_ans,data_type)
  else:
    return para

In [21]:
# function to convert Y/N to 1/0
def yn_to_bool(para):
  possible_ans = ['Y', 'y', 'Yes', 'yes','N', 'n', 'No', 'no']
  if not para in possible_ans:
    print(f'ValueError: Acceptable answers are yes and no')
    return None
  #bool conversion
  elif para in possible_ans[0:3]:
    para = 1
  elif para in possible_ans[4:]:
    para = 0
  return para

In [22]:
# function to output intervention/potential solution 
def simulate_intervention(in_date,in_desc,in_type,in_red_flights, in_ncase, in_ndeath, in_totdeath):
  # convert string date to date type 
  use_date = datetime.strptime(in_date, "%Y-%m-%d")

  # INDEXING DF
  df_id = pd.DataFrame({
          cols_df[0] : [use_date],
          cols_df[1] :[in_desc]
      })
  # DECISION FEATURES DF
  X = pd.DataFrame({
      "Intervention type" :  [in_type], 
      "Direct Flight Reduction " :  [in_red_flights], 
      "new_cases" : in_ncase,
      "new_deaths" : in_ndeath,
      "total_deaths" : in_totdeath
  })
  # call classifier(top performer is hardcoded)
  new_intervention = model_rf.predict(X)

  # PREDICTION DF 
  Y = pd.DataFrame(columns = cols_df[7:], data = new_intervention)

  # Output Dialog 
  print(f'\n {in_desc} will likley impact the following types of travel:')
  #print flight type if dataframe shows true 
  print(f'\t - Domestic')       if Y.iloc[0,-3] == 1 else None
  print(f'\t - International')  if Y.iloc[0,-2] == 1 else None
  print(f'\t - Transborder')    if Y.iloc[0,-1] == 1 else None 
  print(f'\nThe intervention line item is shown below.')

  #return observation row in same format as training dataset 
  return df_id.join(X).join(Y)

# User Interface
After seccessful exicution of all the above code, the following cell will run and prompt the user to input information regarding the potential intervention. 

The output will describe which types of travel i.e., international, domestic, and transborder, are likley to be significantly impacted with a confidence interval of 90%.  

In [None]:
#RUN THIS CELL FOR USER INTERFACE

# promt for date, title, information for decriptors 
## covid statistics at time of intervention implimentation
integers = range(0,1000000)
in_ncase       = ask_q(f'What are the new daily cases at time of implimentation?\t\t--> ', integers, int)
in_ndeath      = ask_q(f'What are the new daily deaths at time of implimentation?\t--> ', integers, int)
in_totdeath    = ask_q(f'What are the total deaths at time of implimentation?\t\t--> ', integers, int)

## intervention date and description
in_desc        = input(f'Name/Descraiption of intervention?\t\t\t\t--> ')
in_date        = input(f'What date will it be implemented? (YYYY-MM-DD)\t\t\t--> ')

#intervention features
print('================================================================')
print('INTERVENTION TYPE\n')
print(*interv_type_legend, sep = '\n')
print('================================================================')

in_type        = ask_q(f'What type of intervention is it?\t\t\t\t--> ', list(interv_type_vals_no), int)
in_red_flights = yn_to_bool(ask_q('Does it directly limit travel?(Y/N)\t\t\t\t--> ', ['Y', 'y', 'Yes', 'yes','1','N', 'n', 'No', 'no','2'], str))

# call machine learning model  
interv_1 = simulate_intervention(in_date,in_desc,in_type,in_red_flights, in_ncase, in_ndeath, in_totdeath)
interv_1