In [1]:
# importing required packages and models and algorithms

import pandas as pd 
import numpy as np

from sklearn.preprocessing import LabelEncoder  # requires to convert string type of data into int
from typing import Dict

from sklearn.model_selection import train_test_split   # requires to split the data into train and test

from sklearn.linear_model import LogisticRegression # this are algorithms to classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB

from sklearn.metrics import accuracy_score,precision_score,confusion_matrix  # to check the performance of model
from sklearn.model_selection import cross_val_score  # to check the accuracy score of model by using average

import pickle  # to dump the selected model

In [2]:
data = pd.read_csv("Iris.csv")  #load data into 'data'

In [3]:
data.head()   #shows the first 5 rows of data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
data.isnull().sum()   # return the sum of null values in all columns

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [5]:
data.describe() # it describes the overall mathematical relationships

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [6]:
data.info()  #returns the dtype and null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [7]:
data.shape  # returns the shape of data

(150, 5)

In [9]:
label_encoder = LabelEncoder()  # assigns the label encoder class to varibale

In [11]:
X_train,X_test,y_train,y_test = train_test_split(data.drop(columns=['species']),
                                                 data['species'],
                                                 test_size=0.2,
                                                random_state=17)   # split the date into train and test

In [12]:
X_train

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
29,4.7,3.2,1.6,0.2
98,5.1,2.5,3.0,1.1
37,4.9,3.1,1.5,0.1
5,5.4,3.9,1.7,0.4
81,5.5,2.4,3.7,1.0
...,...,...,...,...
22,4.6,3.6,1.0,0.2
57,4.9,2.4,3.3,1.0
134,6.1,2.6,5.6,1.4
143,6.8,3.2,5.9,2.3


In [13]:
y_train

29         Iris-setosa
98     Iris-versicolor
37         Iris-setosa
5          Iris-setosa
81     Iris-versicolor
            ...       
22         Iris-setosa
57     Iris-versicolor
134     Iris-virginica
143     Iris-virginica
111     Iris-virginica
Name: species, Length: 120, dtype: object

In [14]:
y_train = label_encoder.fit_transform(y_train)  #converts object type of target data into int

In [15]:
y_test = label_encoder.fit_transform(y_test)

In [78]:
# to store the int number for each category assigned by label encoder
def get_label_encoder_mapping(label_encoder: LabelEncoder) -> Dict[int, str]:
    try:
        # Create a dictionary mapping of encoded values to original categories
        mapping = dict(zip(range(len(label_encoder.classes_)), 
                         label_encoder.classes_))
        return mapping
    except AttributeError:
        raise AttributeError("LabelEncoder must be fitted before getting mapping")

mapping = get_label_encoder_mapping(label_encoder)


In [18]:
# creating object of the imported class
knc = KNeighborsClassifier()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear',penalty='l1')
rfc = RandomForestClassifier(n_estimators=50,random_state=2)
gnb = GaussianNB()
mnb = MultinomialNB()

In [19]:
# to train and store and predict the model performaace on test dataset
accuracy_bar = []
precision_bar = []
cross_val_score_list = [] 
def train_classifier(clf,X_train,y_train,X_test,y_test):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    
    # cross validation using cross_val_score
    cvs = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy').mean()
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred,average='micro')
    accuracy_bar.append(accuracy)
    precision_bar.append(precision)
    cross_val_score_list.append(cvs)
    confusion = confusion_matrix(y_test,y_pred)
    return cvs,accuracy,precision,confusion

In [20]:
train_classifier(knc,X_train,y_train,X_test,y_test) #call the function

(0.9666666666666668,
 0.9333333333333333,
 0.9333333333333333,
 array([[ 7,  0,  0],
        [ 0, 11,  0],
        [ 0,  2, 10]], dtype=int64))

In [21]:
train_classifier(dtc,X_train,y_train,X_test,y_test)  #call the function

(0.95,
 0.9666666666666667,
 0.9666666666666667,
 array([[ 7,  0,  0],
        [ 0, 11,  0],
        [ 0,  1, 11]], dtype=int64))

In [22]:
train_classifier(lrc,X_train,y_train,X_test,y_test)   #call the function

(0.9666666666666668,
 0.9333333333333333,
 0.9333333333333333,
 array([[ 7,  0,  0],
        [ 0,  9,  2],
        [ 0,  0, 12]], dtype=int64))

In [23]:
train_classifier(rfc,X_train,y_train,X_test,y_test)   #call the function

(0.9416666666666668,
 0.9666666666666667,
 0.9666666666666667,
 array([[ 7,  0,  0],
        [ 0, 11,  0],
        [ 0,  1, 11]], dtype=int64))

In [24]:
train_classifier(gnb,X_train,y_train,X_test,y_test)   #call the function

(0.95,
 0.9666666666666667,
 0.9666666666666667,
 array([[ 7,  0,  0],
        [ 0, 11,  0],
        [ 0,  1, 11]], dtype=int64))

In [25]:
train_classifier(mnb,X_train,y_train,X_test,y_test)  #call the function

(0.9666666666666668,
 0.9666666666666667,
 0.9666666666666667,
 array([[ 7,  0,  0],
        [ 0, 11,  0],
        [ 0,  1, 11]], dtype=int64))

In [129]:
'''Testing................'''

'Testing................'

In [27]:
pickle.dump(mnb,open('Multinomial.pkl','wb'))   #dump the seleted model

In [56]:
mnb_model = pickle.load(open('Multinomial.pkl','rb'))    #read the selected model for prediction

In [139]:
inp = np.array([2.1,3.5,1.4,5.2]).reshape(1,4)  

In [141]:
op = tuple(mnb_model.predict(inp))  #test the given input on model



In [143]:
mapping[op[0]]  #print the output

'Iris-virginica'