In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv('iris.csv')

In [5]:
df

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


In [6]:
df.columns =['sepal_length', 'sepal_width','petal_length','petal_width','flower_class']
df.loc[len(df.index)]=[5.1,3.5,1.4,0.2,'Iris-setosa'] #Since 1st row was replaced by column names.
new_df = pd.DataFrame(df)
new_df['flower_class'] = df.flower_class.astype('category')
new_df['flower_class']= new_df['flower_class'].cat.codes

In [7]:
new_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,flower_class
0,4.9,3.0,1.4,0.2,0
1,4.7,3.2,1.3,0.2,0
2,4.6,3.1,1.5,0.2,0
3,5.0,3.6,1.4,0.2,0
4,5.4,3.9,1.7,0.4,0
...,...,...,...,...,...
145,6.3,2.5,5.0,1.9,2
146,6.5,3.0,5.2,2.0,2
147,6.2,3.4,5.4,2.3,2
148,5.9,3.0,5.1,1.8,2


In [8]:
features = new_df.drop(columns=['flower_class']).values
target = new_df['flower_class'].values

In [9]:
print(target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 0]


In [10]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(features,target, test_size=.2, )

In [11]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(x_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [12]:
pred = gnb.predict(x_test)

In [13]:
from sklearn.metrics import accuracy_score

acc  = accuracy_score(pred,y_test)
print(f"Accuracy : {acc*100}")

Accuracy : 93.33333333333333


In [14]:
from sklearn.metrics import precision_score, confusion_matrix,recall_score,f1_score

cm = confusion_matrix(y_test, pred)
precision = precision_score(y_test, pred,average='macro')
recall = recall_score(y_test, pred,average='macro')
f1 = f1_score(y_test,pred,average='macro')

In [15]:
print(cm)
print(precision)
print(recall)
print(f1)

[[12  0  0]
 [ 0  7  1]
 [ 0  1  9]]
0.9249999999999999
0.9249999999999999
0.9249999999999999


In [16]:
from sklearn.metrics import roc_curve

In [17]:
proba = gnb.predict_proba(x_test)

In [18]:
setosa, virginica, versicolor = new_df.groupby('flower_class')


In [19]:
ytrue_arr= np.array(y_test).reshape(-1,1)
prob_arr= np.array(proba).reshape(-1,1)

In [20]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,flower_class
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica
148,5.9,3.0,5.1,1.8,Iris-virginica


In [21]:
X,y  = np.array( df.drop(columns=['flower_class'])), np.array(df['flower_class'])

In [22]:
Xtrain,Xtest, Ytrain,Ytest = train_test_split(X,y, test_size=0.2)

In [23]:
classifier = GaussianNB()
y_score = classifier.fit(Xtrain,Ytrain).predict_proba(Xtest)

In [24]:
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer().fit(Ytrain)
y_onehot_test = lb.transform(Ytest)


In [25]:
y_onehot_test.shape

(30, 3)

In [26]:
lb.transform(["Iris-versicolor"])

array([[0, 1, 0]])

In [27]:
class_of_interest = "Iris-virginica"
class_id = np.flatnonzero(lb.classes_ == class_of_interest)[0]
class_id

2

In [28]:
y_score

array([[0.00000000e+000, 1.00381559e-013, 1.00000000e+000],
       [3.22266998e-124, 9.87023317e-001, 1.29766832e-002],
       [2.33875304e-288, 3.21734308e-012, 1.00000000e+000],
       [1.02346209e-138, 2.03112478e-001, 7.96887522e-001],
       [1.74900418e-083, 9.99901800e-001, 9.82002924e-005],
       [1.74554135e-095, 9.98306294e-001, 1.69370555e-003],
       [3.42196771e-121, 9.27417687e-001, 7.25823132e-002],
       [3.97921724e-067, 9.99997879e-001, 2.12119007e-006],
       [2.91626634e-130, 3.85153225e-001, 6.14846775e-001],
       [1.00000000e+000, 9.53843779e-015, 6.91185408e-023],
       [1.00000000e+000, 1.30056011e-017, 1.51729195e-027],
       [1.71679555e-232, 1.12253047e-006, 9.99998877e-001],
       [4.77207024e-076, 9.99985902e-001, 1.40976994e-005],
       [1.00000000e+000, 2.82694648e-014, 2.80901009e-023],
       [7.95902449e-253, 3.16688575e-010, 1.00000000e+000],
       [9.99999999e-001, 1.03238247e-009, 1.66125097e-019],
       [5.52307149e-119, 9.82294536e-001

In [42]:
# from sklearn.metrics import RocCurveDisplay
# import matplotlib.pyplot as plt

# # Assuming y_true contains the true labels in one-hot encoded format
# # and y_score contains the predicted probabilities for each class

# # Create a RocCurveDisplay object
# display = RocCurveDisplay.from_predictions(Ytest, y_score)

# # Plot the ROC curve
# display.plot()
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve for Multiclass Classification')
# plt.grid(True)
# plt.show()
