# Imports and Extra Data Cleaning

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from datetime import datetime
from datetime import timedelta
%load_ext line_profiler
sns.set(style="darkgrid")
import requests
import pprint
import re
%matplotlib inline

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [10]:
time1 = datetime.now()
filename = '../cleaned_flight_data_updated.csv'
random.seed(234)
p = 0.01  # p% of lines
df = pd.read_csv(
         filename,
         header=0, 
         parse_dates = ['FL_DATE','dep_datetime','arr_datetime'],
         skiprows=lambda i: i>0 and random.random() > p
)
time2 = datetime.now()
print('Time cost: ', time2-time1)

  interactivity=interactivity, compiler=compiler, result=result)


Time cost:  0:01:59.603237


In [11]:
# if DEP_DELAY is NaN, drop because useless for our model
df = df.dropna(subset=['DEP_DELAY'])
df['hour'] =  pd.Series.dt.hour

## Visibility/Wind Speed Cleaning and Imputing NaN values

In [12]:
# visibility column cleaning
time1 = datetime.now()

import math
def spacetonan(x):
    if isinstance(x, str) and x != '':
        return float(x)
    elif x == '':
        return np.nan
    return x

# remove 'V', 'Vs', 's', etc.
df['HOURLYVISIBILITY_origin'] = df['HOURLYVISIBILITY_origin'].str.replace('[^\d(.)]', "")
df['HOURLYVISIBILITY_dest'] = df['HOURLYVISIBILITY_dest'].str.replace('[^\d(.)]', "")
df['HOURLYWindSpeed_origin'] = df['HOURLYWindSpeed_origin'].str.replace('[^\d(.)]', "")
df['HOURLYWindSpeed_dest'] = df['HOURLYWindSpeed_dest'].str.replace('[^\d(.)]', "")
df['HOURLYPrecip_origin'] = df['HOURLYPrecip_origin'].str.replace('[^\d(.)]', "")
df['HOURLYPrecip_dest'] = df['HOURLYPrecip_dest'].str.replace('[^\d(.)]', "")

# for the entries with space, fill with nan
df['HOURLYVISIBILITY_origin'] = list(map(spacetonan, df['HOURLYVISIBILITY_origin']))
df['HOURLYVISIBILITY_dest'] = list(map(spacetonan, df['HOURLYVISIBILITY_dest']))
df['HOURLYWindSpeed_origin'] = list(map(spacetonan, df['HOURLYWindSpeed_origin']))
df['HOURLYWindSpeed_dest'] = list(map(spacetonan, df['HOURLYWindSpeed_dest']))
df['HOURLYPrecip_origin'] = list(map(spacetonan, df['HOURLYPrecip_origin']))
df['HOURLYPrecip_dest'] = list(map(spacetonan, df['HOURLYPrecip_dest']))
# impute the nan values to mean (~9.4 mi visibility)
np.set_printoptions(threshold=np.nan)
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
df['HOURLYVISIBILITY_origin_imp'] = imp_mean.fit_transform(df[['HOURLYVISIBILITY_origin']]).ravel()
df['HOURLYVISIBILITY_dest_imp'] = imp_mean.fit_transform(df[['HOURLYVISIBILITY_dest']]).ravel()
df['HOURLYWindSpeed_origin_imp'] = imp_mean.fit_transform(df[['HOURLYWindSpeed_origin']]).ravel()
df['HOURLYWindSpeed_dest_imp'] = imp_mean.fit_transform(df[['HOURLYWindSpeed_dest']]).ravel()
df['HOURLYPrecip_origin'] = imp_mean.fit_transform(df[['HOURLYPrecip_origin']]).ravel()
df['HOURLYPrecip_dest'] = imp_mean.fit_transform(df[['HOURLYPrecip_dest']]).ravel()
time2 = datetime.now()
print('Time cost: ', time2-time1)

Time cost:  0:00:06.646230


## Parsing "present weather type" and adding column with most salient weather code

In [13]:
# given an entry in the HOURLYPRSENTWEATHERTYPE, give AW xx value
# otherwise return 0
def regex_weather_cond(h_pres_w):
    pattern = re.compile(r'\d{2}')
    if isinstance(h_pres_w, str):
        if re.search(r'\|.*?(([A-Za-z]{2,}:(\d{2})\s)*)\|', h_pres_w):
            string = re.search(r'\|.*?(([A-Za-z]{2,}:(\d{2})\s)*)\|', h_pres_w).group(0)
            match = pattern.findall(string)
        else:
            return 0
    else:
        return 0
    return most_severe_cond(match)

# hard code severe weather condition
# based mostly on avg delay min per weather type
# smoke from Oct 2017 CA forestfire is high cancelled but not delay
def most_severe_cond(lst):
    # freezing rain
    if '64' in lst:
        return 64
    elif '65' in lst:
        return 65
    elif '66' in lst:
        return 66
    # ice pellets
    elif '74' in lst:
        return 74
    elif '75' in lst:
        return 75
    elif '76' in lst:
        return 76
    # thunderstorm with hailen
    elif '93' in lst:
        return 93
    # snow
    elif '71' in lst:
        return 71
    elif '72' in lst:
        return 72
    elif '73' in lst:
        return 73
    # fog
    elif '30' in lst:
        return 30
    elif '31' in lst:
        return 31
    elif '32' in lst:
        return 32
    elif '33' in lst:
        return 33
    elif '34' in lst:
        return 34
    # safety return 0 if somehow pass empty list
    elif len(lst) == 0:
        return 0
    # return max element as int from list of strings
    # after those preceding cases, most severe is higher code
    # also not too many cases of combination of drastically different codes 
    else:
        return max(list(map(int, lst)))
    

# hard code severe weather condition
# based mostly on avg delay min per weather type
# smoke from Oct 2017 CA forestfire is high cancelled but not delay
def ordinal_ordering(x):
    # freezing rain
    if x == 64:
        return 115
    elif x == 65:
        return 116
    elif x == 66:
        return 117
    # ice pellets
    elif x == 74:
        return 112
    elif x == 75:
        return 113
    elif x == 76:
        return 114
    # thunderstorm with hail
    elif x == 93:
        return 113
    # snow
    elif x == 71:
        return 110
    elif x == 72:
        return 111
    elif x == 73:
        return 112
    # fog
    elif x == 30:
        return 105
    elif x == 31:
        return 106
    elif x == 32:
        return 107
    elif x == 33:
        return 108
    elif x == 34:
        return 109
    else:
        return x

In [14]:
# make the present weather type columns into new col
# with most severe weather code selected as integer
time1 = datetime.now()
df['origin_sev_code'] = df['HOURLYPRSENTWEATHERTYPE_origin'].apply(
    lambda x: regex_weather_cond(x))
df['dest_sev_code'] = df['HOURLYPRSENTWEATHERTYPE_dest'].apply(
    lambda x: regex_weather_cond(x))

df['origin_enc_code'] = df['origin_sev_code'].apply(
    lambda x: ordinal_ordering(x))
df['dest_enc_code'] = df['dest_sev_code'].apply(
    lambda x: ordinal_ordering(x))
time2 = datetime.now()
print('Time cost: ', time2-time1)

Time cost:  0:00:01.977712


In [15]:
print(df.origin_sev_code.unique())
print(df.dest_sev_code.unique())
print(df.origin_enc_code.unique())
print(df.dest_enc_code.unique())

[ 0 91 71 61 62  5 51 31 72 35 92 63 73 90 30 67 95 64 74 81 54 93 33 75
 82 52 65 99  4 18 68]
[ 0 92 71 61 31 51 91 62  5 90 72 54 30 63 74 73 35 67 95 89 18 64 52 68
 65 75 33 93 81  4]
[  0  91 110  61  62   5  51 106 111  35  92  63 112  90 105  67  95 115
  81  54 113 108  82  52 116  99   4  18  68]
[  0  92 110  61 106  51  91  62   5  90 111  54 105  63 112  35  67  95
  89  18 115  52  68 116 113 108  81   4]


# Binary Classification

Using "DEP_DEL15":
- 1 means delay >= 15
- 0 means delay < 15 (on-time / early)

Goal is to predict this classification. We will use **F1-score** as the primary evaluation metric, since we are concerned with both classes of classificacation: on-time/early vs. delayed. Since the classes are unbalanced, metrics such as accuracy will give a baseline model where we classify all flights as on-time very well (which is misleading since that would not be a useful model). 

Let's start with a **random forest** weather model, where we predict the DEP_DEL15 using the features:
- weather code - origin
- visibilty - origin
- wind speed at origin
- precipitation at origin
- (possibly temperature)

After weather, we will use flight/airport data:
- number of seats
- departure airport (*need encoding/merging - use passenger count*)
- carrier (*need ordinal encoding*)
- departure time (DEP_TIME)
- duration of flight

Later on if we have time:
- cloud cover (need complicated regex)
- time dependence/delay
- other things

http://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html#sphx-glr-auto-examples-ensemble-plot-feature-transformation-py

http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier

https://roamanalytics.com/2016/10/28/are-categorical-variables-getting-lost-in-your-random-forests/

http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/drf.html

https://en.wikipedia.org/wiki/Receiver_operating_characteristic

http://scikit-learn.org/stable/modules/model_evaluation.html

http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [22]:
# import all these lol
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score

In [71]:
list(df)

['YEAR',
 'QUARTER',
 'MONTH',
 'DAY_OF_MONTH',
 'DAY_OF_WEEK',
 'FL_DATE',
 'OP_UNIQUE_CARRIER',
 'OP_CARRIER_AIRLINE_ID',
 'OP_CARRIER',
 'TAIL_NUM',
 'OP_CARRIER_FL_NUM',
 'ORIGIN_AIRPORT_ID',
 'ORIGIN_AIRPORT_SEQ_ID',
 'ORIGIN_CITY_MARKET_ID',
 'ORIGIN',
 'ORIGIN_CITY_NAME',
 'ORIGIN_STATE_ABR',
 'ORIGIN_STATE_FIPS',
 'ORIGIN_STATE_NM',
 'ORIGIN_WAC',
 'DEST_AIRPORT_ID',
 'DEST_AIRPORT_SEQ_ID',
 'DEST_CITY_MARKET_ID',
 'DEST',
 'DEST_CITY_NAME',
 'DEST_STATE_ABR',
 'DEST_STATE_FIPS',
 'DEST_STATE_NM',
 'DEST_WAC',
 'CRS_DEP_TIME',
 'DEP_TIME',
 'DEP_DELAY',
 'DEP_DELAY_NEW',
 'DEP_DEL15',
 'DEP_DELAY_GROUP',
 'DEP_TIME_BLK',
 'TAXI_OUT',
 'WHEELS_OFF',
 'WHEELS_ON',
 'TAXI_IN',
 'CRS_ARR_TIME',
 'ARR_TIME',
 'ARR_DELAY',
 'ARR_DELAY_NEW',
 'ARR_DEL15',
 'ARR_DELAY_GROUP',
 'ARR_TIME_BLK',
 'CANCELLED',
 'CANCELLATION_CODE',
 'DIVERTED',
 'CRS_ELAPSED_TIME',
 'ACTUAL_ELAPSED_TIME',
 'AIR_TIME',
 'FLIGHTS',
 'DISTANCE',
 'DISTANCE_GROUP',
 'CARRIER_DELAY',
 'WEATHER_DELAY',
 'NAS_DEL

In [112]:
X = df[['origin_enc_code','DEP_TIME','number_of_seats','origin_WBAN','dest_WBAN','HOURLYPrecip_origin','HOURLYPrecip_dest']]
y = df['DEP_DEL15']

#, ratio = 1.0)
# test_size = .1,
sm = SMOTE()
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)
x_train_res, y_train_res = sm.fit_sample(X_train, y_train)
scaler = StandardScaler()
scaler.fit(x_train_res)

# Now apply the transformations to the data:
X_train = scaler.transform(x_train_res)
X_test = scaler.transform(X_test)
mlp = MLPClassifier(hidden_layer_sizes=(100,20,4),max_iter=500)
mlp.fit(X_train,y_train_res.ravel())
predictions = mlp.predict(X_test)  
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print("accuracy score:", accuracy_score(y_test, predictions))
print("matthews corr coef: ", matthews_corrcoef(y_test, predictions))

  


[[18839  4040]
 [ 3066  1789]]
              precision    recall  f1-score   support

         0.0       0.86      0.82      0.84     22879
         1.0       0.31      0.37      0.33      4855

   micro avg       0.74      0.74      0.74     27734
   macro avg       0.58      0.60      0.59     27734
weighted avg       0.76      0.74      0.75     27734

accuracy score: 0.743780197591404
matthews corr coef:  0.17899078084768064


In [110]:
X = df[['number_of_seats','DEP_TIME','number_of_seats','origin_WBAN','dest_WBAN','HOURLYPrecip_origin','HOURLYPrecip_dest']]
y = df['DEP_DEL15']

#, ratio = 1.0)
# test_size = .1,
sm = SMOTE()
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)
x_train_res, y_train_res = sm.fit_sample(X_train, y_train)
scaler = StandardScaler()
scaler.fit(x_train_res)

# Now apply the transformations to the data:
X_train = scaler.transform(x_train_res)
X_test = scaler.transform(X_test)
mlp = MLPClassifier(hidden_layer_sizes=(100,100,100),max_iter=100)
mlp.fit(X_train,y_train_res.ravel())
predictions = mlp.predict(X_test)  
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print("accuracy score:", accuracy_score(y_test, predictions))
print("matthews corr coef: ", matthews_corrcoef(y_test, predictions))

  


[[18837  4011]
 [ 3153  1733]]
              precision    recall  f1-score   support

         0.0       0.86      0.82      0.84     22848
         1.0       0.30      0.35      0.33      4886

   micro avg       0.74      0.74      0.74     27734
   macro avg       0.58      0.59      0.58     27734
weighted avg       0.76      0.74      0.75     27734

accuracy score: 0.7416889017090935
matthews corr coef:  0.16840787241787725


In [None]:
X = df[['origin_enc_code','DEP_TIME','number_of_seats','origin_passenger_count','dest_passenger_count','OP_CARRIER_AIRLINE_ID'
        ,'HOURLYPrecip_origin','DISTANCE']]
y = df['DEP_DEL15']

sm = SMOTE()
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)
x_train_res, y_train_res = sm.fit_sample(X_train, y_train)
scaler = StandardScaler()
scaler.fit(x_train_res)

# Now apply the transformations to the data:
X_train = scaler.transform(x_train_res)
X_test = scaler.transform(X_test)
mlp = MLPClassifier(max_iter=10)
parameter_space = {
    'hidden_layer_sizes': [(5000,1000,300,100,20,5)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd'],
    'alpha': [ 0.01],
    'learning_rate': ['constant','adaptive'],
}
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train, y_train_res.ravel())
# Best paramete set
print('Best parameters found:\n', clf.best_params_)

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    
predictions = clf.predict(X_test)  
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print("accuracy score:", accuracy_score(y_test, predictions))
print("matthews corr coef: ", matthews_corrcoef(y_test, predictions))

  del sys.path[0]


In [119]:
X = df[['origin_enc_code','DEP_TIME','number_of_seats','origin_passenger_count','dest_passenger_count','OP_CARRIER_AIRLINE_ID'
        ,'HOURLYPrecip_origin','DISTANCE']]
y = df['DEP_DEL15']

sm = SMOTE()
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)
x_train_res, y_train_res = sm.fit_sample(X_train, y_train)
scaler = StandardScaler()
scaler.fit(x_train_res)

# Now apply the transformations to the data:
X_train = scaler.transform(x_train_res)
X_test = scaler.transform(X_test)
mlp = MLPClassifier(max_iter=500)
parameter_space = {
    'hidden_layer_sizes': [(20,3)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd'],
    'alpha': [ 0.01],
    'learning_rate': ['constant','adaptive'],
}
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train, y_train_res.ravel())
# Best paramete set
print('Best parameters found:\n', clf.best_params_)

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    
predictions = clf.predict(X_test)  
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print("accuracy score:", accuracy_score(y_test, predictions))
print("matthews corr coef: ", matthews_corrcoef(y_test, predictions))

  del sys.path[0]


Best parameters found:
 {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (20, 3), 'learning_rate': 'constant', 'solver': 'sgd'}
0.660 (+/-0.016) for {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (20, 3), 'learning_rate': 'constant', 'solver': 'sgd'}
0.660 (+/-0.017) for {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (20, 3), 'learning_rate': 'adaptive', 'solver': 'sgd'}
0.666 (+/-0.028) for {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (20, 3), 'learning_rate': 'constant', 'solver': 'sgd'}
0.663 (+/-0.026) for {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (20, 3), 'learning_rate': 'adaptive', 'solver': 'sgd'}
[[14187  8632]
 [ 1732  3183]]
              precision    recall  f1-score   support

         0.0       0.89      0.62      0.73     22819
         1.0       0.27      0.65      0.38      4915

   micro avg       0.63      0.63      0.63     27734
   macro avg       0.58      0.63      0.56     27734
weighted avg  

In [115]:
X = df[['origin_enc_code','DEP_TIME','number_of_seats','origin_WBAN','dest_WBAN','HOURLYPrecip_origin','HOURLYPrecip_dest']]
y = df['DEP_DEL15']

sm = SMOTE()
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)
x_train_res, y_train_res = sm.fit_sample(X_train, y_train)
scaler = StandardScaler()
scaler.fit(x_train_res)

# Now apply the transformations to the data:
X_train = scaler.transform(x_train_res)
X_test = scaler.transform(X_test)
mlp = MLPClassifier(max_iter=100)
parameter_space = {
    'hidden_layer_sizes': [ (50,50,50), (100,50,50), (100), (100,20,4), (20,5)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [ 0.01, 0.1],
    'learning_rate': ['constant','adaptive'],
}
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train, y_train_res.ravel())
# Best paramete set
print('Best parameters found:\n', clf.best_params_)

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    
predictions = clf.predict(X_test)  
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print("accuracy score:", accuracy_score(y_test, predictions))
print("matthews corr coef: ", matthews_corrcoef(y_test, predictions))

  if sys.path[0] == '':


Best parameters found:
 {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (100, 50, 10), 'learning_rate': 'adaptive', 'solver': 'adam'}
0.640 (+/-0.020) for {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
0.716 (+/-0.098) for {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'adam'}
0.639 (+/-0.024) for {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
0.709 (+/-0.093) for {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'adam'}
0.643 (+/-0.018) for {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (100, 50, 10), 'learning_rate': 'constant', 'solver': 'sgd'}
0.707 (+/-0.091) for {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (100, 50, 10), 'learning_rate': 'constant', 'solver': 'adam'}
0.6

matthews corr coef:  0.18969155618293748


In [36]:
X = df[['origin_enc_code','DEP_TIME','number_of_seats','origin_passenger_count','dest_passenger_count','OP_CARRIER_AIRLINE_ID'
        ,'HOURLYPrecip_origin','DISTANCE']]
y = df['DEP_DEL15']
sm = SMOTE()
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)
x_train_res, y_train_res = sm.fit_sample(X_train, y_train)
scaler = StandardScaler()
scaler.fit(x_train_res)

# Now apply the transformations to the data:
X_train = scaler.transform(x_train_res)
X_test = scaler.transform(X_test)
mlp = MLPClassifier(hidden_layer_sizes=(40,20,10,5,5,2),max_iter=100)
mlp.fit(X_train,y_train_res.ravel())
predictions = mlp.predict(X_test)  
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print("matthews corr coef: ", matthews_corrcoef(y_test, predictions))
print("roc auc score:", roc_auc_score(y_test, predictions))

  if sys.path[0] == '':


[[15804  7011]
 [ 2112  2807]]
              precision    recall  f1-score   support

         0.0       0.88      0.69      0.78     22815
         1.0       0.29      0.57      0.38      4919

   micro avg       0.67      0.67      0.67     27734
   macro avg       0.58      0.63      0.58     27734
weighted avg       0.78      0.67      0.71     27734

matthews corr coef:  0.2103508825488134
roc auc score: 0.6316733047760305


