In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import cross_val_predict

In [24]:
# read the processed dataset and, in the same chain, expand the categorical
# 'ph' and 'rainfall' columns into one-hot indicator columns
df = (
    pd.read_csv('data/processed-data.csv')
    .pipe(pd.get_dummies, columns = ['ph', 'rainfall'])
)

# the 'label' column is the target (y); every other column is a feature (x)
y = df['label']
x = df.drop('label', axis = 1)

In [25]:
# first pass: hold back 10% of the rows as an unseen set, keep 90% as "seen" data
x_seen, x_unseen, y_seen, y_unseen = train_test_split(
    x,
    y,
    test_size = 0.10,
    random_state = 42,
)

# second pass: carve the seen data into 80% training and 20% testing
x_train, x_test, y_train, y_test = train_test_split(
    x_seen,
    y_seen,
    test_size = 0.20,
    random_state = 42,
)

In [26]:
# gaussian naive bayes estimator used for cross-validation
model = GaussianNB()

# 10 shuffled folds; the fixed seed makes both prediction passes below use the same splits
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)

# cross-validated predictions on the training partition: class labels, then class probabilities
y_pred_cv = cross_val_predict(model, x_train, y_train, cv = kf)
y_proba_cv = cross_val_predict(model, x_train, y_train, cv = kf, method = 'predict_proba')

# metrics computed from the predicted labels (precision / recall use average = 'weighted')
conf_matrix = confusion_matrix(y_train, y_pred_cv)
accuracy = accuracy_score(y_train, y_pred_cv)
precision = precision_score(y_train, y_pred_cv, average = 'weighted')
recall = recall_score(y_train, y_pred_cv, average = 'weighted')

# metric computed from the predicted probabilities (one-vs-rest ROC AUC)
roc_auc = roc_auc_score(y_train, y_proba_cv, multi_class = 'ovr')

# report the cross-validation results: matrix first, then each scalar to 4 decimals
print('\nCross Validation Evaluation Results:')
print('Confusion Matrix: \n', conf_matrix)
for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('ROC AUC', roc_auc),
):
    print(f'{metric_name}: {metric_value:.4f}')


Cross Validation Evaluation Results:
Confusion Matrix: 
 [[75  0  0  1  0  0  0  0  0  0]
 [ 2 40  0  0  0  0 26  0  0  1]
 [ 0  0 76  0  0  0  0  0  0  0]
 [ 2  0  0 56  0  0  0  0  5  0]
 [ 0  0  0  0 75  0  0  0  0  0]
 [ 0  0  0  0  0 75  0  0  0  0]
 [ 0  0  0  0  0 14 67  0  0  0]
 [ 0  0  0  0  0  0  0 74  0  0]
 [ 0  0  0 16  0  0  0  0 47  0]
 [ 0  0  0  0  0  0  0  0  0 68]]
Accuracy: 0.9069
Precision: 0.9167
Recall: 0.9069
ROC AUC: 0.9964


In [27]:
# fit on the training partition, then predict labels for the test partition
y_pred_test = model.fit(x_train, y_train).predict(x_test)

# class probabilities for the test partition (needed for ROC AUC)
y_proba_test = model.predict_proba(x_test)

# metrics from predicted labels (precision / recall use average = 'weighted')
conf_matrix = confusion_matrix(y_test, y_pred_test)
accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test, average = 'weighted')
recall = recall_score(y_test, y_pred_test, average = 'weighted')

# one-vs-rest ROC AUC from the predicted probabilities
roc_auc = roc_auc_score(y_test, y_proba_test, multi_class = 'ovr')

# report the test-set results: matrix first, then each scalar to 4 decimals
print('\nTest Set Evaluation Results:')
print('Confusion Matrix: \n', conf_matrix)
for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('ROC AUC', roc_auc),
):
    print(f'{metric_name}: {metric_value:.4f}')


Test Set Evaluation Results:
Confusion Matrix: 
 [[15  0  0  0  0  0  0  0  0  0]
 [ 1 12  0  0  0  0  9  0  0  1]
 [ 0  0 18  0  0  0  0  0  0  0]
 [ 0  0  0 24  0  0  0  0  4  0]
 [ 0  0  0  0 12  0  0  0  0  0]
 [ 0  0  0  0  0 15  0  0  0  0]
 [ 0  0  0  0  0  1 12  0  0  0]
 [ 0  0  0  0  0  0  0 13  0  0]
 [ 0  0  0  9  0  0  0  0 15  0]
 [ 0  0  0  0  0  0  0  0  0 19]]
Accuracy: 0.8611
Precision: 0.8829
Recall: 0.8611
ROC AUC: 0.9943


In [28]:
# label predictions and class probabilities for the held-out unseen partition;
# `model` is used as-is here (it is fitted in an earlier cell, not in this one)
y_pred_unseen = model.predict(x_unseen)
y_proba_unseen = model.predict_proba(x_unseen)

# metrics from predicted labels (precision / recall use average = 'weighted')
conf_matrix = confusion_matrix(y_unseen, y_pred_unseen)
accuracy = accuracy_score(y_unseen, y_pred_unseen)
precision = precision_score(y_unseen, y_pred_unseen, average = 'weighted')
recall = recall_score(y_unseen, y_pred_unseen, average = 'weighted')

# one-vs-rest ROC AUC from the predicted probabilities
roc_auc = roc_auc_score(y_unseen, y_proba_unseen, multi_class = 'ovr')

# report the unseen-set results: matrix first, then each scalar to 4 decimals
print('\nUnseen Set Evaluation Results:')
print('Confusion Matrix: \n', conf_matrix)
for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('ROC AUC', roc_auc),
):
    print(f'{metric_name}: {metric_value:.4f}')


Unseen Set Evaluation Results:
Confusion Matrix: 
 [[ 9  0  0  0  0  0  0  0  0  0]
 [ 0  5  0  0  0  0  3  0  0  0]
 [ 0  0  6  0  0  0  0  0  0  0]
 [ 0  0  0  8  0  0  0  0  1  0]
 [ 0  0  0  0 13  0  0  0  0  0]
 [ 0  0  0  0  0 10  0  0  0  0]
 [ 0  0  0  0  0  3  3  0  0  0]
 [ 0  0  0  0  0  0  0 13  0  0]
 [ 0  0  0  2  0  0  0  0 11  0]
 [ 0  0  0  0  0  0  0  0  0 13]]
Accuracy: 0.9100
Precision: 0.9181
Recall: 0.9100
ROC AUC: 0.9960
