In [4]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import cross_val_predict

In [7]:
# Load the crop-recommendation dataset.
# The CSV location can be overridden through the CROP_DATA_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original local path is used as the fallback.
DATA_CSV = os.environ.get(
    'CROP_DATA_CSV',
    'C:/Users/jerwi/Midterm-Summative-Activity/data/raw-data-crop-recommendation.csv',
)
df = pd.read_csv(DATA_CSV)

# Split the dataframe into features (x: every column except 'label')
# and the target labels (y: the 'label' column).
x = df.drop(columns=['label'])
y = df['label']

In [9]:
# Splitter for ten-fold cross-validation (shuffled, seeded). This only builds
# the splitter object; the folds are consumed by later cells.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# First split (90/10): keep 10% of the rows aside as unseen data and use the
# remaining 90% for training and testing.
x_train_test, x_unseen, y_train_test, y_unseen = train_test_split(
    x,
    y,
    test_size=0.10,
    random_state=42,
)

# Second split (80/20) of the 90% portion: 80% for training, 20% for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x_train_test,
    y_train_test,
    test_size=0.20,
    random_state=42,
)

In [10]:
# Build a Gaussian Naive Bayes classifier and fit it on the training split
# (fit returns the estimator itself, so it can be chained on the constructor).
model = GaussianNB().fit(x_train, y_train)

# Predicted labels for the test split.
y_pred = model.predict(x_test)

print(y_pred)

['rice' 'maize' 'lentil' 'watermelon' 'mungbean' 'watermelon' 'pigeonpeas'
 'kidneybeans' 'pigeonpeas' 'chickpea' 'mothbeans' 'coconut' 'maize'
 'grapes' 'mungbean' 'lentil' 'mango' 'coconut' 'muskmelon' 'muskmelon'
 'lentil' 'cotton' 'mango' 'cotton' 'mango' 'orange' 'pomegranate' 'jute'
 'coffee' 'watermelon' 'pigeonpeas' 'banana' 'watermelon' 'cotton'
 'coffee' 'mango' 'orange' 'cotton' 'coffee' 'watermelon' 'banana'
 'lentil' 'jute' 'lentil' 'banana' 'pomegranate' 'grapes' 'pomegranate'
 'muskmelon' 'mango' 'mothbeans' 'coffee' 'pigeonpeas' 'coconut' 'jute'
 'rice' 'maize' 'muskmelon' 'chickpea' 'grapes' 'orange' 'papaya'
 'pigeonpeas' 'pigeonpeas' 'pomegranate' 'coffee' 'rice' 'banana'
 'watermelon' 'lentil' 'watermelon' 'apple' 'muskmelon' 'papaya' 'papaya'
 'chickpea' 'coconut' 'maize' 'pomegranate' 'maize' 'rice' 'muskmelon'
 'kidneybeans' 'chickpea' 'blackgram' 'orange' 'watermelon' 'pigeonpeas'
 'pomegranate' 'lentil' 'pigeonpeas' 'apple' 'blackgram' 'coconut'
 'pigeonpeas' '

In [13]:
# Cross-validated label predictions on the training split, using the
# ten-fold splitter `kf`.
y_pred_cv = cross_val_predict(
    estimator=model,
    X=x_train,
    y=y_train,
    cv=kf,
)
print(y_pred_cv)

['grapes' 'papaya' 'grapes' ... 'cotton' 'kidneybeans' 'rice']


In [16]:
# Confusion matrix: true training labels against the cross-validated predictions.
conf_matrix_cv = confusion_matrix(y_true=y_train, y_pred=y_pred_cv)
print("Confusion Matrix:\n", conf_matrix_cv)

Confusion Matrix:
 [[72  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 71  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 73  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 71  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 72  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 69  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 74  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 74  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 72  0  0  0  0  0  0  0  0  0  0  0  1  0]
 [ 0  0  0  0  0  0  0  0  0 65  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 70  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  0  0 73  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 81  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0 76  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  

In [17]:
# Accuracy of the cross-validated predictions against the true training labels.
accuracy = accuracy_score(y_true=y_train, y_pred=y_pred_cv)
print("Accuracy:", accuracy)

Accuracy: 0.9955808080808081


In [18]:
# Precision of the cross-validated predictions, averaged across classes
# with the 'weighted' scheme.
precision = precision_score(
    y_true=y_train,
    y_pred=y_pred_cv,
    average='weighted',
)
print("Precision:", precision)

Precision: 0.9957186105255843


In [19]:
# Recall of the cross-validated predictions, averaged across classes
# with the 'weighted' scheme.
recall = recall_score(
    y_true=y_train,
    y_pred=y_pred_cv,
    average='weighted',
)
print("Recall:", recall)

Recall: 0.9955808080808081


In [28]:
# ROC-AUC (one-vs-rest) on the training split.

# ROC-AUC is computed from scores, so request cross-validated class
# probabilities (predict_proba) rather than hard label predictions.
y_pred_cv_proba = cross_val_predict(
    estimator=model,
    X=x_train,
    y=y_train,
    cv=kf,
    method="predict_proba",
)

# Multi-class ROC-AUC using the one-vs-rest ('ovr') strategy.
roc_auc = roc_auc_score(
    y_true=y_train,
    y_score=y_pred_cv_proba,
    multi_class='ovr',
)
print("ROC-AUC Score:", roc_auc)

ROC-AUC Score: 0.999961376578484
