In [441]:
import numpy as np
import pandas as pd

In [442]:
df = pd.read_csv('../Resources/allzipcodes_collision_weather_df.csv')
df.set_index('collision_id',inplace = True)

  df = pd.read_csv('../Resources/allzipcodes_collision_weather_df.csv')


In [443]:
df["pedestrians_accident"] = (df["number_of_pedestrians_injured"] > 0) | (df["number_of_pedestrians_killed"] > 0)
df["cyclists_accident"] = (df["number_of_cyclist_injured"] > 0) | (df["number_of_cyclist_killed"] > 0)
df["motorists_accident"] = (df["number_of_motorist_injured"] > 0) | (df["number_of_motorist_killed"] > 0)

In [444]:
pedestrian_df = df[df["pedestrians_accident"] == 1]
cyclist_df = df[df["cyclists_accident"] == 1]
motorist_df = df[df["motorists_accident"] == 1]


Build a result df to store model results

In [None]:
results_df = pd.DataFrame(np.zeros(shape=(4,6)))
results_df.index=[1,2,3,4]
results_df.columns = ["accuracy","precision","recall","f1_score","AUC","AP"]
results_df.index.rename("Model",inplace=True)
results_df

Split the dataset into train and test data

In [483]:
df.columns

Index(['crash_datetime', 'crash_day_of_week', 'crash_hour_category',
       'street_address', 'number_of_persons_injured',
       'number_of_persons_killed', 'number_of_pedestrians_injured',
       'number_of_pedestrians_killed', 'number_of_cyclist_injured',
       'number_of_cyclist_killed', 'number_of_motorist_injured',
       'number_of_motorist_killed', 'vehicle_type_code_1',
       'vehicle_type_code_2', 'borough', 'zip_code', 'latitude', 'longitude',
       'vehicle_type_code_3', 'vehicle_type_code_4', 'Density', 'Population',
       'contributing_factor', 'number_of_injuries', 'number_of_deaths',
       'injuries_rate', 'death_rate', 'severity_of_accident', 'date', 'hour',
       'tempmax', 'tempmin', 'temp', 'humidity', 'precip', 'preciptype',
       'snow', 'windspeed', 'visibility', 'severerisk', 'sunrise', 'sunset',
       'icon', 'temperature_category', 'precip_category', 'humid_category',
       'wind_level', 'vis_level', 'Density_rank', 'zip_code_count',
       'total_inf

In [570]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelBinarizer


features_cat = ['crash_day_of_week', 'crash_hour_category','contributing_factor','zip_code','icon','vis_level','wind_level']
features_num = ['Population','Density']

y = df['pedestrians_accident']
y_dense = LabelBinarizer().fit_transform(y)


# Encode categorical features
label_encoder = LabelEncoder()
for feature in features_cat:
    df[feature] = label_encoder.fit_transform(df[feature])

# Impute missing values
imputer = SimpleImputer(strategy='mean')
df[features_cat] = imputer.fit_transform(df[features_cat])

# Make a column transformer object that scales (using StandardScaler) the non-categorical columns
# and one hot encodes (using OneHotEncoder) the three categorical columns
preprocess = make_column_transformer(
    (StandardScaler(),features_num, ),
    (OneHotEncoder(categories="auto",drop="first"),features_cat, )
)

#Generate the independent variable df
X = preprocess.fit_transform(df).toarray()
X = imputer.fit_transform(X)


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Now we are going to perform some modeling to predict whether a pedestrian accident will occur 

Model 1 - SDGClassifier

In [495]:
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
#Set up the hyperparameter options in param_grid
param_grid = {
    'class_weight': ["balanced",{},{0:1,1:10}], 
    'random_state':[42],
    'max_iter': [1000, 2000],
    'loss': ["log_loss"],
    "penalty": ["l1","l2"],
}

#Do the search
gs_clf = GridSearchCV(SGDClassifier(),param_grid, n_jobs = -1, verbose = 3)
gs_clf.fit(X_train, np.ravel(y_train))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 2/5] END class_weight=balanced, loss=log_loss, max_iter=1000, penalty=l2, random_state=42;, score=0.598 total time=   5.9s
[CV 3/5] END class_weight=balanced, loss=log_loss, max_iter=1000, penalty=l2, random_state=42;, score=0.595 total time=   6.4s
[CV 1/5] END class_weight=balanced, loss=log_loss, max_iter=1000, penalty=l2, random_state=42;, score=0.662 total time=   6.4s
[CV 1/5] END class_weight=balanced, loss=log_loss, max_iter=1000, penalty=l1, random_state=42;, score=0.702 total time=  10.3s
[CV 4/5] END class_weight=balanced, loss=log_loss, max_iter=1000, penalty=l2, random_state=42;, score=0.636 total time=   5.6s
[CV 5/5] END class_weight=balanced, loss=log_loss, max_iter=1000, penalty=l1, random_state=42;, score=0.623 total time=  12.5s
[CV 5/5] END class_weight=balanced, loss=log_loss, max_iter=1000, penalty=l2, random_state=42;, score=0.567 total time=   6.0s
[CV 4/5] END class_weight=balanced, loss=log_loss,

In [516]:
model_1 = SGDClassifier(penalty = 'l1',random_state=42, max_iter=1000,loss="log_loss", class_weight = {})
model_1.fit(X_train, np.ravel(y_train))

In [539]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score,recall_score,precision_score
from sklearn.metrics import average_precision_score,roc_auc_score

y_test_pred = model_1.predict(X_test)
cfm = confusion_matrix(y_test,y_test_pred)

accuracy_training = model_1.score(X_train,y_train)
accuracy_testing = model_1.score(X_test,y_test)
recall = recall_score(y_test,y_test_pred)
precision = precision_score(y_test,y_test_pred) 
f1 = f1_score(y_test,y_test_pred)

prediction_probabilities_test = model_1.predict_proba(X_test)
y_score = prediction_probabilities_test[:,1]
ap = average_precision_score(y_test, y_score)
auc = roc_auc_score(y_test,y_score)

print("Confusion Matrix: \n",cfm)
print("Training accuracy: ",accuracy_training)
print("Testing  accuracy: ",accuracy_testing)
print("Precision: ",precision)
print("Recall: ",recall)
print("F1-Score: ",f1)
print("AUC: ",auc)
print("Average Precision: ",ap)

Confusion Matrix: 
 [[19606   125]
 [ 1025     1]]
Training accuracy:  0.9468593797049082
Testing  accuracy:  0.9445970034205329
Precision:  0.007936507936507936
Recall:  0.0009746588693957114
F1-Score:  0.0017361111111111108
AUC:  0.6102700226427515
Average Precision:  0.0762891829237002


In [540]:
model1_results = [accuracy_testing, precision, recall, f1, auc, ap]
results_df.iloc[0,:] = model1_results
results_df

Unnamed: 0_level_0,accuracy,precision,recall,f1_score,AUC,AP
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.944597,0.007937,0.000975,0.001736,0.61027,0.076289
2,0.884665,0.120843,0.212476,0.154064,0.625599,0.081058
3,0.888471,0.115683,0.189084,0.143544,0.633997,0.081028
4,0.894493,0.871752,0.894493,0.881544,0.561068,0.582809


Model 2 - Random Forest Classifier

In [571]:
from sklearn.ensemble import RandomForestClassifier
model_2 = RandomForestClassifier(random_state=42,n_estimators=30,max_depth=6,min_samples_leaf=500,min_samples_split=4000, class_weight={0:1,1:10})
model_2.fit(X_train,y_train)

In [572]:
feature_importances = model_2.feature_importances_
feature_names = features_cat + features_num
feature_importance_dict = dict(zip(feature_names, feature_importances))
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
print("Feature Importance:")
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")


Feature Importance:
Density: 0.053676115846625655
Population: 0.04538313687035923
crash_hour_category: 0.03416675077749989
crash_day_of_week: 0.011248910613544948
vis_level: 0.002872326449690214
icon: 0.0006437265081096342
wind_level: 0.0004969830598444295
zip_code: 0.00012495849317650231
contributing_factor: 6.736382949897684e-05


In [574]:
data_feature_p = pd.DataFrame(sorted_feature_importance, columns = ['features','importance'])
data_feature_p

Unnamed: 0,features,importance
0,Density,0.053676
1,Population,0.045383
2,crash_hour_category,0.034167
3,crash_day_of_week,0.011249
4,vis_level,0.002872
5,icon,0.000644
6,wind_level,0.000497
7,zip_code,0.000125
8,contributing_factor,6.7e-05


In [537]:
y_test_pred = model_2.predict(X_test)
cfm = confusion_matrix(y_test,y_test_pred)

accuracy_training = model_2.score(X_train,y_train)
accuracy_testing = model_2.score(X_test,y_test)
recall = recall_score(y_test,y_test_pred)
precision = precision_score(y_test,y_test_pred) 
f1 = f1_score(y_test,y_test_pred)

prediction_probabilities_test = model_2.predict_proba(X_test)
y_score = prediction_probabilities_test[:,1]
ap = average_precision_score(y_test, y_score)
auc = roc_auc_score(y_test,y_score)

print("Confusion Matrix: \n",cfm)
print("Training accuracy: ",accuracy_training)
print("Testing  accuracy: ",accuracy_testing)
print("Precision: ",precision)
print("Recall: ",recall)
print("F1-Score: ",f1)
print("AUC: ",auc)
print("Average Precision: ",ap)

Confusion Matrix: 
 [[18145  1586]
 [  808   218]]
Training accuracy:  0.8855886781090033
Testing  accuracy:  0.8846654140771788
Precision:  0.12084257206208426
Recall:  0.2124756335282651
F1-Score:  0.1540636042402827
AUC:  0.6255992761511728
Average Precision:  0.08105792487069163


In [538]:
model2_results = [accuracy_testing, precision, recall, f1, auc, ap]
results_df.iloc[1,:] = model2_results
results_df

Unnamed: 0_level_0,accuracy,precision,recall,f1_score,AUC,AP
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.884665,0.120843,0.212476,0.154064,0.625599,0.081058
3,0.888471,0.115683,0.189084,0.143544,0.633997,0.081028
4,0.894493,0.871752,0.894493,0.881544,0.561068,0.582809


Model 3 - Gradient Boosting Classifier

In [523]:
from sklearn.ensemble import GradientBoostingClassifier

sample_weight = np.array([4 if i == 1 else 1 for i in y_train])


model_3 = GradientBoostingClassifier(min_samples_split=100,
                                     max_depth=8,
                                 min_samples_leaf=100,
                                 n_estimators=400,
                                 subsample=0.6)


model_3.fit(X_train,y_train,sample_weight=sample_weight)

In [535]:
#Calculate and print metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score,recall_score,precision_score
from sklearn.metrics import average_precision_score,roc_auc_score

y_test_pred = model_3.predict(X_test)
cfm = confusion_matrix(y_test,y_test_pred)

accuracy_training = model_3.score(X_train,y_train)
accuracy_testing = model_3.score(X_test,y_test)
recall = recall_score(y_test,y_test_pred)
precision = precision_score(y_test,y_test_pred) 
f1 = f1_score(y_test,y_test_pred)

prediction_probabilities_test = model_3.predict_proba(X_test)
y_score = prediction_probabilities_test[:,1]
ap = average_precision_score(y_test, y_score)
auc = roc_auc_score(y_test,y_score)


print("Training accuracy: ",accuracy_training)
print("Testing  accuracy: ",accuracy_testing)
print("confusion matrix:")
print(cfm)

print("precision: ",precision)
print("recall: ",recall)
print("f1 score: ",f1)
print("auc",auc)
print("ap",ap)

Training accuracy:  0.8889732008431196
Testing  accuracy:  0.8884713590595944
confusion matrix:
[[18248  1483]
 [  832   194]]
precision:  0.11568276684555755
recall:  0.18908382066276802
f1 score:  0.14354421013688493
auc 0.6339969964442809
ap 0.0810284556024716


In [536]:
model3_results = [accuracy_testing, precision, recall, f1, auc, ap]
results_df.iloc[2,:] = model3_results
results_df

Unnamed: 0_level_0,accuracy,precision,recall,f1_score,AUC,AP
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.888471,0.115683,0.189084,0.143544,0.633997,0.081028
4,0.894493,0.871752,0.894493,0.881544,0.561068,0.582809


Model 4 - Neural Network

In [526]:
from sklearn.neural_network import MLPClassifier

model4 = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=60, max_iter = 500)

In [527]:
#Fit the model
model4.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [528]:
y_train_oh = np.zeros((len(y_train),2),dtype="int")
y_train_oh[np.arange(len(y_train)), y_train.to_numpy(dtype="int")] = 1
y_test_oh = np.zeros((len(y_test),2),dtype="int")
y_test_oh[np.arange(len(y_test)), y_test.to_numpy(dtype="int")] = 1

In [541]:
from sklearn.metrics import confusion_matrix,f1_score,precision_score,recall_score
from sklearn.metrics import roc_auc_score,average_precision_score
test_pred = model4.predict(X_test)
accuracy_training =model4.score(X_train,y_train)
accuracy_testing =model4.score(X_test,y_test)
f1 = f1_score(y_test,test_pred,average='weighted')
precision = precision_score(y_test,test_pred,average='weighted')
recall = recall_score(y_test,test_pred,average='weighted')
auc = roc_auc_score(y_test,test_pred)
test_probs = model4.predict_proba(X_test)
ap = average_precision_score(y_test_oh,test_probs)

print("Training accuracy: ",accuracy_training)
print("Testing  accuracy: ",accuracy_testing)
print("precision: ",precision)
print("recall: ",recall)
print("f1 score: ",f1)
print("auc",auc)
print("ap",ap)


Training accuracy:  0.9118458295694069
Testing  accuracy:  0.9095726742785566
precision:  0.908941830234497
recall:  0.9095726742785566
f1 score:  0.9092569110018608
auc 0.5153940381167641
ap 0.5828088096646373


In [542]:
model4_results = [accuracy_testing, precision, recall, f1, auc, ap]
results_df.iloc[3,:] = model4_results
results_df

Unnamed: 0_level_0,accuracy,precision,recall,f1_score,AUC,AP
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.944597,0.007937,0.000975,0.001736,0.61027,0.076289
2,0.884665,0.120843,0.212476,0.154064,0.625599,0.081058
3,0.888471,0.115683,0.189084,0.143544,0.633997,0.081028
4,0.909573,0.908942,0.909573,0.909257,0.515394,0.582809


In [543]:
Ped_results = results_df.copy()
Ped_results

Unnamed: 0_level_0,accuracy,precision,recall,f1_score,AUC,AP
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.944597,0.007937,0.000975,0.001736,0.61027,0.076289
2,0.884665,0.120843,0.212476,0.154064,0.625599,0.081058
3,0.888471,0.115683,0.189084,0.143544,0.633997,0.081028
4,0.909573,0.908942,0.909573,0.909257,0.515394,0.582809


Now let's build predictive models on cyclists accidents

In [544]:
# build a results df
results_df = pd.DataFrame(np.zeros(shape=(4,6)))
results_df.index=[1,2,3,4]
results_df.columns = ["accuracy","precision","recall","f1_score","AUC","AP"]
results_df.index.rename("Model",inplace=True)
results_df

Unnamed: 0_level_0,accuracy,precision,recall,f1_score,AUC,AP
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0


Split the data into train and test

In [606]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelBinarizer


features_cat = ['crash_day_of_week', 'crash_hour_category','contributing_factor','zip_code','icon','vis_level','wind_level']
features_num = ['Population','Density']

y = df['cyclists_accident']
y_dense = LabelBinarizer().fit_transform(y)


# Encode categorical features
label_encoder = LabelEncoder()
for feature in features_cat:
    df[feature] = label_encoder.fit_transform(df[feature])

# Impute missing values
imputer = SimpleImputer(strategy='mean')
df[features_cat] = imputer.fit_transform(df[features_cat])

# Make a column transformer object that scales (using StandardScaler) the non-categorical columns
# and one hot encodes (using OneHotEncoder) the three categorical columns
preprocess = make_column_transformer(
    (StandardScaler(),features_num, ),
    (OneHotEncoder(categories="auto",drop="first"),features_cat, )
)

#Generate the independent variable df
X = preprocess.fit_transform(df).toarray()
X = imputer.fit_transform(X)


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Model 1 - SDGClassifier

In [545]:
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
#Set up the hyperparameter options in param_grid
param_grid = {
    'class_weight': ["balanced",{},{0:1,1:10}], 
    'random_state':[42],
    'max_iter': [1000, 2000],
    'loss': ["log_loss"],
    "penalty": ["l1","l2"],
}

#Do the search
gs_clf = GridSearchCV(SGDClassifier(),param_grid, n_jobs = -1, verbose = 3)
gs_clf.fit(X_train, np.ravel(y_train))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 2/5] END class_weight=balanced, loss=log_loss, max_iter=1000, penalty=l2, random_state=42;, score=0.646 total time=   9.0s
[CV 1/5] END class_weight=balanced, loss=log_loss, max_iter=1000, penalty=l2, random_state=42;, score=0.668 total time=   9.4s
[CV 3/5] END class_weight=balanced, loss=log_loss, max_iter=1000, penalty=l2, random_state=42;, score=0.727 total time=   9.6s
[CV 4/5] END class_weight=balanced, loss=log_loss, max_iter=1000, penalty=l2, random_state=42;, score=0.561 total time=   5.8s
[CV 1/5] END class_weight=balanced, loss=log_loss, max_iter=1000, penalty=l1, random_state=42;, score=0.745 total time=  16.4s
[CV 2/5] END class_weight=balanced, loss=log_loss, max_iter=1000, penalty=l1, random_state=42;, score=0.698 total time=  16.7s
[CV 5/5] END class_weight=balanced, loss=log_loss, max_iter=1000, penalty=l1, random_state=42;, score=0.637 total time=  17.5s
[CV 3/5] END class_weight=balanced, loss=log_loss,

In [549]:
gs_clf.best_score_,gs_clf.best_params_

(0.952652815417043,
 {'class_weight': {},
  'loss': 'log_loss',
  'max_iter': 1000,
  'penalty': 'l2',
  'random_state': 42})

In [550]:
model_1 = SGDClassifier(penalty = 'l1',random_state=42, max_iter=1000,loss="log_loss", class_weight = {})
model_1.fit(X_train, np.ravel(y_train))

In [560]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score,recall_score,precision_score
from sklearn.metrics import average_precision_score,roc_auc_score

y_test_pred = model_1.predict(X_test)
cfm = confusion_matrix(y_test,y_test_pred)

accuracy_training = model_1.score(X_train,y_train)
accuracy_testing = model_1.score(X_test,y_test)
recall = recall_score(y_test,y_test_pred)
precision = precision_score(y_test,y_test_pred) 
f1 = f1_score(y_test,y_test_pred)

prediction_probabilities_test = model_1.predict_proba(X_test)
y_score = prediction_probabilities_test[:,1]
ap = average_precision_score(y_test, y_score)
auc = roc_auc_score(y_test,y_score)

print("Confusion Matrix: \n",cfm)
print("Training accuracy: ",accuracy_training)
print("Testing  accuracy: ",accuracy_testing)
print("Precision: ",precision)
print("Recall: ",recall)
print("F1-Score: ",f1)
print("AUC: ",auc)
print("Average Precision: ",ap)

Confusion Matrix: 
 [[19711    20]
 [ 1001    25]]
Training accuracy:  0.9527130382414936
Testing  accuracy:  0.9508117743411861
Precision:  0.5555555555555556
Recall:  0.024366471734892786
F1-Score:  0.046685340802987856
AUC:  0.7327810760380133
Average Precision:  0.17684184485509474


In [561]:
model1_results = [accuracy_testing, precision, recall, f1, auc, ap]
results_df.iloc[0,:] = model1_results
results_df

Unnamed: 0_level_0,accuracy,precision,recall,f1_score,AUC,AP
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.950812,0.555556,0.024366,0.046685,0.732781,0.176842
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0


Model 2 - Random Forest

In [552]:
from sklearn.ensemble import RandomForestClassifier
model_2 = RandomForestClassifier(random_state=42,n_estimators=30,max_depth=6,min_samples_leaf=500,min_samples_split=4000, class_weight={0:1,1:10})
model_2.fit(X_train,y_train)

In [553]:
feature_importances = model_2.feature_importances_
feature_names = features_cat + features_num
feature_importance_dict = dict(zip(feature_names, feature_importances))
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
print("Feature Importance:")
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")

Feature Importance:
crash_hour_category: 0.22976347023354687
crash_day_of_week: 0.034914440201963506
Density: 0.008720484349183214
wind_level: 0.0016411531458441755
contributing_factor: 0.0008943263021852622
vis_level: 0.0003710656709158801
icon: 0.0002512550698244199
zip_code: 0.00011013352065377203
Population: 0.0


In [562]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score,recall_score,precision_score
from sklearn.metrics import average_precision_score,roc_auc_score

y_test_pred = model_2.predict(X_test)
cfm = confusion_matrix(y_test,y_test_pred)

accuracy_training = model_2.score(X_train,y_train)
accuracy_testing = model_2.score(X_test,y_test)
recall = recall_score(y_test,y_test_pred)
precision = precision_score(y_test,y_test_pred) 
f1 = f1_score(y_test,y_test_pred)

prediction_probabilities_test = model_2.predict_proba(X_test)
y_score = prediction_probabilities_test[:,1]
ap = average_precision_score(y_test, y_score)
auc = roc_auc_score(y_test,y_score)

print("Confusion Matrix: \n",cfm)
print("Training accuracy: ",accuracy_training)
print("Testing  accuracy: ",accuracy_testing)
print("Precision: ",precision)
print("Recall: ",recall)
print("F1-Score: ",f1)
print("AUC: ",auc)
print("Average Precision: ",ap)

  _warn_prf(average, modifier, msg_start, len(result))


Confusion Matrix: 
 [[19731     0]
 [ 1026     0]]
Training accuracy:  0.952652815417043
Testing  accuracy:  0.9505708917473623
Precision:  0.0
Recall:  0.0
F1-Score:  0.0
AUC:  0.7023332980636343
Average Precision:  0.11145433198382476


In [563]:
model2_results = [accuracy_testing, precision, recall, f1, auc, ap]
results_df.iloc[1,:] = model2_results
results_df

Unnamed: 0_level_0,accuracy,precision,recall,f1_score,AUC,AP
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.950812,0.555556,0.024366,0.046685,0.732781,0.176842
2,0.950571,0.0,0.0,0.0,0.702333,0.111454
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0


Model 3 - Gradient Boosting Classifier

In [555]:
from sklearn.ensemble import GradientBoostingClassifier

sample_weight = np.array([4 if i == 1 else 1 for i in y_train])


model_3 = GradientBoostingClassifier(min_samples_split=100,
                                     max_depth=8,
                                 min_samples_leaf=100,
                                 n_estimators=400,
                                 subsample=0.6)


model_3.fit(X_train,y_train,sample_weight=sample_weight)

In [564]:
#Calculate and print metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score,recall_score,precision_score
from sklearn.metrics import average_precision_score,roc_auc_score

y_test_pred = model_3.predict(X_test)
cfm = confusion_matrix(y_test,y_test_pred)

accuracy_training = model_3.score(X_train,y_train)
accuracy_testing = model_3.score(X_test,y_test)
recall = recall_score(y_test,y_test_pred)
precision = precision_score(y_test,y_test_pred) 
f1 = f1_score(y_test,y_test_pred)

prediction_probabilities_test = model_3.predict_proba(X_test)
y_score = prediction_probabilities_test[:,1]
ap = average_precision_score(y_test, y_score)
auc = roc_auc_score(y_test,y_score)


print("Training accuracy: ",accuracy_training)
print("Testing  accuracy: ",accuracy_testing)
print("confusion matrix:")
print(cfm)

print("precision: ",precision)
print("recall: ",recall)
print("f1 score: ",f1)
print("auc",auc)
print("ap",ap)

Training accuracy:  0.9464739536284251
Testing  accuracy:  0.9397793515440575
confusion matrix:
[[19350   381]
 [  869   157]]
precision:  0.29182156133828996
recall:  0.1530214424951267
f1 score:  0.20076726342710993
auc 0.7381357721391706
ap 0.163168547688434


In [565]:
model3_results = [accuracy_testing, precision, recall, f1, auc, ap]
results_df.iloc[2,:] = model3_results
results_df

Unnamed: 0_level_0,accuracy,precision,recall,f1_score,AUC,AP
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.950812,0.555556,0.024366,0.046685,0.732781,0.176842
2,0.950571,0.0,0.0,0.0,0.702333,0.111454
3,0.939779,0.291822,0.153021,0.200767,0.738136,0.163169
4,0.0,0.0,0.0,0.0,0.0,0.0


Model 4 - Neural Network

In [605]:
from sklearn.neural_network import MLPClassifier

model4 = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=60, max_iter = 500)

In [608]:
model4.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [609]:
y_train_oh = np.zeros((len(y_train),2),dtype="int")
y_train_oh[np.arange(len(y_train)), y_train.to_numpy(dtype="int")] = 1
y_test_oh = np.zeros((len(y_test),2),dtype="int")
y_test_oh[np.arange(len(y_test)), y_test.to_numpy(dtype="int")] = 1

In [610]:
from sklearn.metrics import confusion_matrix,f1_score,precision_score,recall_score
from sklearn.metrics import roc_auc_score,average_precision_score
test_pred = model4.predict(X_test)
accuracy_training =model4.score(X_train,y_train)
accuracy_testing =model4.score(X_test,y_test)
f1 = f1_score(y_test,test_pred,average='weighted')
precision = precision_score(y_test,test_pred,average='weighted')
recall = recall_score(y_test,test_pred,average='weighted')
auc = roc_auc_score(y_test,test_pred)
test_probs = model4.predict_proba(X_test)
ap = average_precision_score(y_test_oh,test_probs)

print("Training accuracy: ",accuracy_training)
print("Testing  accuracy: ",accuracy_testing)
print("precision: ",precision)
print("recall: ",recall)
print("f1 score: ",f1)
print("auc",auc)
print("ap",ap)

Training accuracy:  0.9715386931647094
Testing  accuracy:  0.9258081611022787
precision:  0.9140377960989616
recall:  0.9258081611022787
f1 score:  0.9196882017256343
auc 0.5354836142609324
ap 0.5390408456228272


In [611]:
model4_results = [accuracy_testing, precision, recall, f1, auc, ap]
results_df.iloc[3,:] = model4_results
results_df

Unnamed: 0_level_0,accuracy,precision,recall,f1_score,AUC,AP
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.950812,0.555556,0.024366,0.046685,0.732781,0.176842
2,0.950571,0.0,0.0,0.0,0.702333,0.111454
3,0.939779,0.291822,0.153021,0.200767,0.738136,0.163169
4,0.925808,0.914038,0.925808,0.919688,0.535484,0.539041


In [612]:
results_df_c = results_df.copy()