# Minimum Distance Classifier (MDC)

## Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the Iris dataset from a CSV file in the working directory.
df = pd.read_csv('Iris.csv')
# A cell only displays the value of its last expression, so show a single
# preview: five randomly sampled rows (unseeded, so they differ between runs).
df.sample(5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
85,86,6.0,3.4,4.5,1.6,Iris-versicolor
22,23,4.6,3.6,1.0,0.2,Iris-setosa
148,149,6.2,3.4,5.4,2.3,Iris-virginica
57,58,4.9,2.4,3.3,1.0,Iris-versicolor
20,21,5.4,3.4,1.7,0.2,Iris-setosa


In [3]:
# Count how many rows belong to each species.
df.value_counts('Species')

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

## Assign codes to different classes

In [31]:
# Encode the species name as an integer code:
# 'Iris-setosa' -> 0, 'Iris-versicolor' -> 1, every other value -> 2.
species = df['Species']
df['class'] = np.select(
    [species == 'Iris-setosa', species == 'Iris-versicolor'],
    [0, 1],
    default=2,
)
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,class
0,1,5.1,3.5,1.4,0.2,Iris-setosa,0
1,2,4.9,3.0,1.4,0.2,Iris-setosa,0
2,3,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5,5.0,3.6,1.4,0.2,Iris-setosa,0


## Using entire dataset for training

### Classwise Centroid Calculation

In [39]:
# Convert to NumPy: drop the identifier and the species name, keeping the four
# measurement columns followed by the integer class code in the last column.
numeric_frame = df.drop(columns=['Species', 'Id'])
ndata = numeric_frame.to_numpy()
ndata

array([[5.1, 3.5, 1.4, 0.2, 0. ],
       [4.9, 3. , 1.4, 0.2, 0. ],
       [4.7, 3.2, 1.3, 0.2, 0. ],
       [4.6, 3.1, 1.5, 0.2, 0. ],
       [5. , 3.6, 1.4, 0.2, 0. ],
       [5.4, 3.9, 1.7, 0.4, 0. ],
       [4.6, 3.4, 1.4, 0.3, 0. ],
       [5. , 3.4, 1.5, 0.2, 0. ],
       [4.4, 2.9, 1.4, 0.2, 0. ],
       [4.9, 3.1, 1.5, 0.1, 0. ],
       [5.4, 3.7, 1.5, 0.2, 0. ],
       [4.8, 3.4, 1.6, 0.2, 0. ],
       [4.8, 3. , 1.4, 0.1, 0. ],
       [4.3, 3. , 1.1, 0.1, 0. ],
       [5.8, 4. , 1.2, 0.2, 0. ],
       [5.7, 4.4, 1.5, 0.4, 0. ],
       [5.4, 3.9, 1.3, 0.4, 0. ],
       [5.1, 3.5, 1.4, 0.3, 0. ],
       [5.7, 3.8, 1.7, 0.3, 0. ],
       [5.1, 3.8, 1.5, 0.3, 0. ],
       [5.4, 3.4, 1.7, 0.2, 0. ],
       [5.1, 3.7, 1.5, 0.4, 0. ],
       [4.6, 3.6, 1. , 0.2, 0. ],
       [5.1, 3.3, 1.7, 0.5, 0. ],
       [4.8, 3.4, 1.9, 0.2, 0. ],
       [5. , 3. , 1.6, 0.2, 0. ],
       [5. , 3.4, 1.6, 0.4, 0. ],
       [5.2, 3.5, 1.5, 0.2, 0. ],
       [5.2, 3.4, 1.4, 0.2, 0. ],
       [4.7, 3

In [40]:
# m = number of rows (samples); n = number of columns, i.e. the measurement
# columns plus the trailing class-code column.
m,n=ndata.shape
m,n

(150, 5)

In [41]:
# Number of classes = number of distinct species names in the data
k=df['Species'].nunique()
k

3

In [None]:
# Placeholder list with one slot per class; each slot is later replaced by
# that class's centroid vector.
centroids = [0 for _ in range(k)]

In [48]:
# Centroids calculation
# Built in a single expression so this cell does not rely on a separately
# initialised placeholder list and gives the same result when re-run.
# Row i is the column-wise mean of every row whose class code (last column)
# equals i; the code column is averaged too, so the last entry of each
# centroid equals its class code.
centroids = np.array([np.mean(ndata[ndata[:, -1] == i], axis=0) for i in range(k)])
centroids

array([[5.006, 3.418, 1.464, 0.244, 0.   ],
       [5.936, 2.77 , 4.26 , 1.326, 1.   ],
       [6.588, 2.974, 5.552, 2.026, 2.   ]])

### Compute Euclidean distances of each row from the centroids

In [44]:
# Euclidean distance helper used by every prediction cell below

def e_dist(a, b, axis=None):
    """Return the Euclidean (L2) distance between ``a`` and ``b``.

    Parameters
    ----------
    a, b : array-like
        Points (or broadcast-compatible arrays of points). Both are converted
        to float arrays first, so plain lists/tuples are accepted and integer
        dtypes cannot wrap around during the subtraction or squaring.
    axis : int or None, optional
        Axis along which the squared differences are summed. The default
        ``None`` sums over all elements and yields a single scalar, which is
        the behaviour callers passing two 1-D vectors rely on.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar when ``axis`` is ``None``; otherwise an array with ``axis``
        reduced away.
    """
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return np.sqrt(np.sum(np.square(diff), axis=axis))

### Prediction Phase: Calculate the distance of every point from each centroid and assign it to the nearest class

In [49]:
# Assign every sample to the class of its nearest centroid.
# Only the first n-1 columns are compared; the last column is the class code.
# np.argmin returns the first minimum, so a tie resolves to the lowest class
# code (as an if/elif chain over the classes in order would), and the code
# works for any number of centroids rather than exactly three.
prediction = np.array([
    np.argmin([e_dist(sample[:n-1], centroid[:n-1]) for centroid in centroids])
    for sample in ndata
])
prediction

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### Training Accuracy

In [51]:
# Share of samples whose predicted code matches the label column,
# displayed as a percentage.
is_correct = prediction == ndata[:, -1]
accuracy = is_correct.mean()
accuracy * 100

92.66666666666666

## Splitting the data into train and test sets

### 80-20 Split

In [56]:
# Number of training rows: 80% of the samples, truncated to a whole number.
train_fraction = 0.8
train_size = int(train_fraction * m)
train_size

120

In [57]:
# Every row that is not used for training is left for testing.
test_size=m-train_size
test_size

30

In [70]:
# Choosing the training set
np.random.seed(13)  # Fixed seed so the same rows are drawn on every run
# Draw train_size distinct row indices out of the m available rows.
rand_ints = np.random.choice(m, size=train_size, replace=False)
# Integer-array indexing gathers the selected rows (in drawn order) into a new
# array in one step, with no Python-level loop.
train = ndata[rand_ints]
train

array([[5.7, 2.8, 4.5, 1.3, 1. ],
       [5.6, 2.9, 3.6, 1.3, 1. ],
       [5.1, 3.7, 1.5, 0.4, 0. ],
       [5.8, 2.7, 5.1, 1.9, 2. ],
       [6.8, 3. , 5.5, 2.1, 2. ],
       [5.5, 4.2, 1.4, 0.2, 0. ],
       [7.7, 3. , 6.1, 2.3, 2. ],
       [6.3, 2.7, 4.9, 1.8, 2. ],
       [5.1, 3.8, 1.6, 0.2, 0. ],
       [5.5, 2.6, 4.4, 1.2, 1. ],
       [6. , 3. , 4.8, 1.8, 2. ],
       [6.3, 2.8, 5.1, 1.5, 2. ],
       [6.3, 2.3, 4.4, 1.3, 1. ],
       [5.1, 3.8, 1.9, 0.4, 0. ],
       [5.8, 2.8, 5.1, 2.4, 2. ],
       [4.8, 3.1, 1.6, 0.2, 0. ],
       [6.1, 3. , 4.9, 1.8, 2. ],
       [6.5, 3. , 5.8, 2.2, 2. ],
       [6.1, 3. , 4.6, 1.4, 1. ],
       [4.4, 2.9, 1.4, 0.2, 0. ],
       [6. , 2.9, 4.5, 1.5, 1. ],
       [5.7, 4.4, 1.5, 0.4, 0. ],
       [5.1, 3.5, 1.4, 0.3, 0. ],
       [7.4, 2.8, 6.1, 1.9, 2. ],
       [5.4, 3. , 4.5, 1.5, 1. ],
       [6.7, 3.3, 5.7, 2.5, 2. ],
       [5.8, 4. , 1.2, 0.2, 0. ],
       [7.7, 2.6, 6.9, 2.3, 2. ],
       [6.1, 2.9, 4.7, 1.4, 1. ],
       [6.4, 2

In [72]:
# True for every row index that was NOT drawn for training.
test_mask = np.isin(np.arange(len(ndata)), rand_ints, invert=True)
# Boolean indexing keeps the held-out rows in their original order.
test = ndata[test_mask]
test

array([[4.9, 3. , 1.4, 0.2, 0. ],
       [4.7, 3.2, 1.3, 0.2, 0. ],
       [4.8, 3.4, 1.6, 0.2, 0. ],
       [5.4, 3.9, 1.3, 0.4, 0. ],
       [4.6, 3.6, 1. , 0.2, 0. ],
       [4.8, 3.4, 1.9, 0.2, 0. ],
       [4.9, 3.1, 1.5, 0.1, 0. ],
       [5. , 3.5, 1.3, 0.3, 0. ],
       [4.8, 3. , 1.4, 0.3, 0. ],
       [5. , 3.3, 1.4, 0.2, 0. ],
       [6.4, 3.2, 4.5, 1.5, 1. ],
       [6.9, 3.1, 4.9, 1.5, 1. ],
       [6.6, 2.9, 4.6, 1.3, 1. ],
       [6.1, 2.8, 4.7, 1.2, 1. ],
       [6.4, 2.9, 4.3, 1.3, 1. ],
       [6.6, 3. , 4.4, 1.4, 1. ],
       [6.8, 2.8, 4.8, 1.4, 1. ],
       [5.8, 2.7, 3.9, 1.2, 1. ],
       [6. , 3.4, 4.5, 1.6, 1. ],
       [5.6, 3. , 4.1, 1.3, 1. ],
       [5.1, 2.5, 3. , 1.1, 1. ],
       [5.7, 2.5, 5. , 2. , 2. ],
       [7.7, 3.8, 6.7, 2.2, 2. ],
       [6.9, 3.2, 5.7, 2.3, 2. ],
       [7.7, 2.8, 6.7, 2. , 2. ],
       [6.2, 2.8, 4.8, 1.8, 2. ],
       [6.3, 3.4, 5.6, 2.4, 2. ],
       [6.9, 3.1, 5.4, 2.1, 2. ],
       [5.8, 2.7, 5.1, 1.9, 2. ],
       [6.2, 3

### Calculating centroids using 80% dataset

In [73]:
# Per-class centroids estimated from the training rows only.
# Row `label` is the column-wise mean of the training rows whose class code
# (last column) equals `label`.
centroids = np.array([
    train[train[:, -1] == label].mean(axis=0)
    for label in range(k)
])
centroids

array([[5.035     , 3.4375    , 1.4775    , 0.2475    , 0.        ],
       [5.85897436, 2.72307692, 4.23846154, 1.32051282, 1.        ],
       [6.58536585, 2.95121951, 5.54146341, 2.00731707, 2.        ]])

### Calculating Training Accuracy

In [75]:
# Classify each training row by its nearest centroid (first n-1 columns only).
# np.argmin picks the first minimum, so ties go to the lowest class code, and
# any number of centroids is supported instead of exactly three.
prediction = np.array([
    np.argmin([e_dist(sample[:n-1], centroid[:n-1]) for centroid in centroids])
    for sample in train
])
# Percentage of training rows whose prediction matches the label column.
accuracy = np.mean(train[:, -1] == prediction)
accuracy*100

93.33333333333333

### Calculating Test Accuracy

In [76]:
# Classify each held-out test row by its nearest centroid (first n-1 columns
# only). np.argmin picks the first minimum, so ties go to the lowest class
# code, and any number of centroids is supported instead of exactly three.
prediction = np.array([
    np.argmin([e_dist(sample[:n-1], centroid[:n-1]) for centroid in centroids])
    for sample in test
])
# Percentage of test rows whose prediction matches the label column.
accuracy = np.mean(test[:, -1] == prediction)
accuracy*100

86.66666666666667

### 70-30 Split

In [77]:
# Number of training rows: 70% of the samples, truncated to a whole number.
train_fraction = 0.7
train_size = int(train_fraction * m)
train_size

105

In [78]:
# Every row that is not used for training is left for testing.
test_size=m-train_size
test_size

45

In [79]:
# Choosing the training set
np.random.seed(13)  # Fixed seed so the same rows are drawn on every run
# Draw train_size distinct row indices out of the m available rows.
rand_ints = np.random.choice(m, size=train_size, replace=False)
# Integer-array indexing gathers the selected rows (in drawn order) into a new
# array in one step, with no Python-level loop.
train = ndata[rand_ints]
train

array([[5.7, 2.8, 4.5, 1.3, 1. ],
       [5.6, 2.9, 3.6, 1.3, 1. ],
       [5.1, 3.7, 1.5, 0.4, 0. ],
       [5.8, 2.7, 5.1, 1.9, 2. ],
       [6.8, 3. , 5.5, 2.1, 2. ],
       [5.5, 4.2, 1.4, 0.2, 0. ],
       [7.7, 3. , 6.1, 2.3, 2. ],
       [6.3, 2.7, 4.9, 1.8, 2. ],
       [5.1, 3.8, 1.6, 0.2, 0. ],
       [5.5, 2.6, 4.4, 1.2, 1. ],
       [6. , 3. , 4.8, 1.8, 2. ],
       [6.3, 2.8, 5.1, 1.5, 2. ],
       [6.3, 2.3, 4.4, 1.3, 1. ],
       [5.1, 3.8, 1.9, 0.4, 0. ],
       [5.8, 2.8, 5.1, 2.4, 2. ],
       [4.8, 3.1, 1.6, 0.2, 0. ],
       [6.1, 3. , 4.9, 1.8, 2. ],
       [6.5, 3. , 5.8, 2.2, 2. ],
       [6.1, 3. , 4.6, 1.4, 1. ],
       [4.4, 2.9, 1.4, 0.2, 0. ],
       [6. , 2.9, 4.5, 1.5, 1. ],
       [5.7, 4.4, 1.5, 0.4, 0. ],
       [5.1, 3.5, 1.4, 0.3, 0. ],
       [7.4, 2.8, 6.1, 1.9, 2. ],
       [5.4, 3. , 4.5, 1.5, 1. ],
       [6.7, 3.3, 5.7, 2.5, 2. ],
       [5.8, 4. , 1.2, 0.2, 0. ],
       [7.7, 2.6, 6.9, 2.3, 2. ],
       [6.1, 2.9, 4.7, 1.4, 1. ],
       [6.4, 2

In [80]:
# True for every row index that was NOT drawn for training.
test_mask = np.isin(np.arange(len(ndata)), rand_ints, invert=True)
# Boolean indexing keeps the held-out rows in their original order.
test = ndata[test_mask]
test

array([[4.9, 3. , 1.4, 0.2, 0. ],
       [4.7, 3.2, 1.3, 0.2, 0. ],
       [4.6, 3.4, 1.4, 0.3, 0. ],
       [5. , 3.4, 1.5, 0.2, 0. ],
       [5.4, 3.7, 1.5, 0.2, 0. ],
       [4.8, 3.4, 1.6, 0.2, 0. ],
       [5.4, 3.9, 1.3, 0.4, 0. ],
       [4.6, 3.6, 1. , 0.2, 0. ],
       [4.8, 3.4, 1.9, 0.2, 0. ],
       [4.9, 3.1, 1.5, 0.1, 0. ],
       [5.5, 3.5, 1.3, 0.2, 0. ],
       [4.4, 3. , 1.3, 0.2, 0. ],
       [5. , 3.5, 1.3, 0.3, 0. ],
       [4.8, 3. , 1.4, 0.3, 0. ],
       [5. , 3.3, 1.4, 0.2, 0. ],
       [6.4, 3.2, 4.5, 1.5, 1. ],
       [6.9, 3.1, 4.9, 1.5, 1. ],
       [6.6, 2.9, 4.6, 1.3, 1. ],
       [5.9, 3. , 4.2, 1.5, 1. ],
       [6. , 2.2, 4. , 1. , 1. ],
       [6.1, 2.8, 4. , 1.3, 1. ],
       [6.1, 2.8, 4.7, 1.2, 1. ],
       [6.4, 2.9, 4.3, 1.3, 1. ],
       [6.6, 3. , 4.4, 1.4, 1. ],
       [6.8, 2.8, 4.8, 1.4, 1. ],
       [5.7, 2.6, 3.5, 1. , 1. ],
       [5.8, 2.7, 3.9, 1.2, 1. ],
       [6. , 2.7, 5.1, 1.6, 1. ],
       [6. , 3.4, 4.5, 1.6, 1. ],
       [6.7, 3

### Calculating centroids using 70% dataset

In [81]:
# Per-class centroids estimated from the training rows only.
# Row `label` is the column-wise mean of the training rows whose class code
# (last column) equals `label`.
centroids = np.array([
    train[train[:, -1] == label].mean(axis=0)
    for label in range(k)
])
centroids

array([[5.04285714, 3.44285714, 1.48857143, 0.25142857, 0.        ],
       [5.82121212, 2.72121212, 4.23636364, 1.32121212, 1.        ],
       [6.53513514, 2.91891892, 5.52702703, 2.0027027 , 2.        ]])

### Calculating Training Accuracy

In [82]:
# Classify each training row by its nearest centroid (first n-1 columns only).
# np.argmin picks the first minimum, so ties go to the lowest class code, and
# any number of centroids is supported instead of exactly three.
prediction = np.array([
    np.argmin([e_dist(sample[:n-1], centroid[:n-1]) for centroid in centroids])
    for sample in train
])
# Percentage of training rows whose prediction matches the label column.
accuracy = np.mean(train[:, -1] == prediction)
accuracy*100

94.28571428571428

### Calculating Test Accuracy

In [83]:
# Classify each held-out test row by its nearest centroid (first n-1 columns
# only). np.argmin picks the first minimum, so ties go to the lowest class
# code, and any number of centroids is supported instead of exactly three.
prediction = np.array([
    np.argmin([e_dist(sample[:n-1], centroid[:n-1]) for centroid in centroids])
    for sample in test
])
# Percentage of test rows whose prediction matches the label column.
accuracy = np.mean(test[:, -1] == prediction)
accuracy*100

88.88888888888889

### 60-40 Split

In [84]:
# Number of training rows: 60% of the samples, truncated to a whole number.
train_fraction = 0.6
train_size = int(train_fraction * m)
train_size

90

In [85]:
# Every row that is not used for training is left for testing.
test_size=m-train_size
test_size

60

In [86]:
# Choosing the training set
np.random.seed(13)  # Fixed seed so the same rows are drawn on every run
# Draw train_size distinct row indices out of the m available rows.
rand_ints = np.random.choice(m, size=train_size, replace=False)
# Integer-array indexing gathers the selected rows (in drawn order) into a new
# array in one step, with no Python-level loop.
train = ndata[rand_ints]
train

array([[5.7, 2.8, 4.5, 1.3, 1. ],
       [5.6, 2.9, 3.6, 1.3, 1. ],
       [5.1, 3.7, 1.5, 0.4, 0. ],
       [5.8, 2.7, 5.1, 1.9, 2. ],
       [6.8, 3. , 5.5, 2.1, 2. ],
       [5.5, 4.2, 1.4, 0.2, 0. ],
       [7.7, 3. , 6.1, 2.3, 2. ],
       [6.3, 2.7, 4.9, 1.8, 2. ],
       [5.1, 3.8, 1.6, 0.2, 0. ],
       [5.5, 2.6, 4.4, 1.2, 1. ],
       [6. , 3. , 4.8, 1.8, 2. ],
       [6.3, 2.8, 5.1, 1.5, 2. ],
       [6.3, 2.3, 4.4, 1.3, 1. ],
       [5.1, 3.8, 1.9, 0.4, 0. ],
       [5.8, 2.8, 5.1, 2.4, 2. ],
       [4.8, 3.1, 1.6, 0.2, 0. ],
       [6.1, 3. , 4.9, 1.8, 2. ],
       [6.5, 3. , 5.8, 2.2, 2. ],
       [6.1, 3. , 4.6, 1.4, 1. ],
       [4.4, 2.9, 1.4, 0.2, 0. ],
       [6. , 2.9, 4.5, 1.5, 1. ],
       [5.7, 4.4, 1.5, 0.4, 0. ],
       [5.1, 3.5, 1.4, 0.3, 0. ],
       [7.4, 2.8, 6.1, 1.9, 2. ],
       [5.4, 3. , 4.5, 1.5, 1. ],
       [6.7, 3.3, 5.7, 2.5, 2. ],
       [5.8, 4. , 1.2, 0.2, 0. ],
       [7.7, 2.6, 6.9, 2.3, 2. ],
       [6.1, 2.9, 4.7, 1.4, 1. ],
       [6.4, 2

In [87]:
# True for every row index that was NOT drawn for training.
test_mask = np.isin(np.arange(len(ndata)), rand_ints, invert=True)
# Boolean indexing keeps the held-out rows in their original order.
test = ndata[test_mask]
test

array([[5.1, 3.5, 1.4, 0.2, 0. ],
       [4.9, 3. , 1.4, 0.2, 0. ],
       [4.7, 3.2, 1.3, 0.2, 0. ],
       [5.4, 3.9, 1.7, 0.4, 0. ],
       [4.6, 3.4, 1.4, 0.3, 0. ],
       [5. , 3.4, 1.5, 0.2, 0. ],
       [5.4, 3.7, 1.5, 0.2, 0. ],
       [4.8, 3.4, 1.6, 0.2, 0. ],
       [5.4, 3.9, 1.3, 0.4, 0. ],
       [5.1, 3.8, 1.5, 0.3, 0. ],
       [4.6, 3.6, 1. , 0.2, 0. ],
       [4.8, 3.4, 1.9, 0.2, 0. ],
       [5.2, 3.4, 1.4, 0.2, 0. ],
       [4.9, 3.1, 1.5, 0.1, 0. ],
       [5.5, 3.5, 1.3, 0.2, 0. ],
       [4.9, 3.1, 1.5, 0.1, 0. ],
       [4.4, 3. , 1.3, 0.2, 0. ],
       [5. , 3.5, 1.3, 0.3, 0. ],
       [4.5, 2.3, 1.3, 0.3, 0. ],
       [4.8, 3. , 1.4, 0.3, 0. ],
       [4.6, 3.2, 1.4, 0.2, 0. ],
       [5. , 3.3, 1.4, 0.2, 0. ],
       [6.4, 3.2, 4.5, 1.5, 1. ],
       [6.9, 3.1, 4.9, 1.5, 1. ],
       [6.6, 2.9, 4.6, 1.3, 1. ],
       [5.2, 2.7, 3.9, 1.4, 1. ],
       [5.9, 3. , 4.2, 1.5, 1. ],
       [6. , 2.2, 4. , 1. , 1. ],
       [6.1, 2.8, 4. , 1.3, 1. ],
       [6.3, 2

### Calculating centroids using 60% dataset

In [88]:
# Per-class centroids estimated from the training rows only.
# Row `label` is the column-wise mean of the training rows whose class code
# (last column) equals `label`.
centroids = np.array([
    train[train[:, -1] == label].mean(axis=0)
    for label in range(k)
])
centroids

array([[5.06071429, 3.475     , 1.49642857, 0.25357143, 0.        ],
       [5.85714286, 2.75      , 4.25357143, 1.325     , 1.        ],
       [6.52941176, 2.90588235, 5.51764706, 1.98529412, 2.        ]])

### Calculating Training Accuracy

In [89]:
# Classify each training row by its nearest centroid (first n-1 columns only).
# np.argmin picks the first minimum, so ties go to the lowest class code, and
# any number of centroids is supported instead of exactly three.
prediction = np.array([
    np.argmin([e_dist(sample[:n-1], centroid[:n-1]) for centroid in centroids])
    for sample in train
])
# Percentage of training rows whose prediction matches the label column.
accuracy = np.mean(train[:, -1] == prediction)
accuracy*100

93.33333333333333

### Calculating Test Accuracy

In [90]:
# Classify each held-out test row by its nearest centroid (first n-1 columns
# only). np.argmin picks the first minimum, so ties go to the lowest class
# code, and any number of centroids is supported instead of exactly three.
prediction = np.array([
    np.argmin([e_dist(sample[:n-1], centroid[:n-1]) for centroid in centroids])
    for sample in test
])
# Percentage of test rows whose prediction matches the label column.
accuracy = np.mean(test[:, -1] == prediction)
accuracy*100

90.0

### 50-50 Split

In [91]:
# Number of training rows: 50% of the samples, truncated to a whole number.
train_fraction = 0.5
train_size = int(train_fraction * m)
train_size

75

In [92]:
# Every row that is not used for training is left for testing.
test_size=m-train_size
test_size

75

In [93]:
# Choosing the training set
np.random.seed(13)  # Fixed seed so the same rows are drawn on every run
# Draw train_size distinct row indices out of the m available rows.
rand_ints = np.random.choice(m, size=train_size, replace=False)
# Integer-array indexing gathers the selected rows (in drawn order) into a new
# array in one step, with no Python-level loop.
train = ndata[rand_ints]
train

array([[5.7, 2.8, 4.5, 1.3, 1. ],
       [5.6, 2.9, 3.6, 1.3, 1. ],
       [5.1, 3.7, 1.5, 0.4, 0. ],
       [5.8, 2.7, 5.1, 1.9, 2. ],
       [6.8, 3. , 5.5, 2.1, 2. ],
       [5.5, 4.2, 1.4, 0.2, 0. ],
       [7.7, 3. , 6.1, 2.3, 2. ],
       [6.3, 2.7, 4.9, 1.8, 2. ],
       [5.1, 3.8, 1.6, 0.2, 0. ],
       [5.5, 2.6, 4.4, 1.2, 1. ],
       [6. , 3. , 4.8, 1.8, 2. ],
       [6.3, 2.8, 5.1, 1.5, 2. ],
       [6.3, 2.3, 4.4, 1.3, 1. ],
       [5.1, 3.8, 1.9, 0.4, 0. ],
       [5.8, 2.8, 5.1, 2.4, 2. ],
       [4.8, 3.1, 1.6, 0.2, 0. ],
       [6.1, 3. , 4.9, 1.8, 2. ],
       [6.5, 3. , 5.8, 2.2, 2. ],
       [6.1, 3. , 4.6, 1.4, 1. ],
       [4.4, 2.9, 1.4, 0.2, 0. ],
       [6. , 2.9, 4.5, 1.5, 1. ],
       [5.7, 4.4, 1.5, 0.4, 0. ],
       [5.1, 3.5, 1.4, 0.3, 0. ],
       [7.4, 2.8, 6.1, 1.9, 2. ],
       [5.4, 3. , 4.5, 1.5, 1. ],
       [6.7, 3.3, 5.7, 2.5, 2. ],
       [5.8, 4. , 1.2, 0.2, 0. ],
       [7.7, 2.6, 6.9, 2.3, 2. ],
       [6.1, 2.9, 4.7, 1.4, 1. ],
       [6.4, 2

In [94]:
# True for every row index that was NOT drawn for training.
test_mask = np.isin(np.arange(len(ndata)), rand_ints, invert=True)
# Boolean indexing keeps the held-out rows in their original order.
test = ndata[test_mask]
test

array([[5.1, 3.5, 1.4, 0.2, 0. ],
       [4.9, 3. , 1.4, 0.2, 0. ],
       [4.7, 3.2, 1.3, 0.2, 0. ],
       [5.4, 3.9, 1.7, 0.4, 0. ],
       [4.6, 3.4, 1.4, 0.3, 0. ],
       [5. , 3.4, 1.5, 0.2, 0. ],
       [5.4, 3.7, 1.5, 0.2, 0. ],
       [4.8, 3.4, 1.6, 0.2, 0. ],
       [4.3, 3. , 1.1, 0.1, 0. ],
       [5.4, 3.9, 1.3, 0.4, 0. ],
       [5.1, 3.8, 1.5, 0.3, 0. ],
       [4.6, 3.6, 1. , 0.2, 0. ],
       [4.8, 3.4, 1.9, 0.2, 0. ],
       [5. , 3. , 1.6, 0.2, 0. ],
       [5. , 3.4, 1.6, 0.4, 0. ],
       [5.2, 3.4, 1.4, 0.2, 0. ],
       [4.9, 3.1, 1.5, 0.1, 0. ],
       [5.5, 3.5, 1.3, 0.2, 0. ],
       [4.9, 3.1, 1.5, 0.1, 0. ],
       [4.4, 3. , 1.3, 0.2, 0. ],
       [5. , 3.5, 1.3, 0.3, 0. ],
       [4.5, 2.3, 1.3, 0.3, 0. ],
       [4.8, 3. , 1.4, 0.3, 0. ],
       [4.6, 3.2, 1.4, 0.2, 0. ],
       [5. , 3.3, 1.4, 0.2, 0. ],
       [6.4, 3.2, 4.5, 1.5, 1. ],
       [6.9, 3.1, 4.9, 1.5, 1. ],
       [5.5, 2.3, 4. , 1.3, 1. ],
       [6.6, 2.9, 4.6, 1.3, 1. ],
       [5.2, 2

### Calculating centroids using 50% dataset

In [95]:
# Per-class centroids estimated from the training rows only.
# Row `label` is the column-wise mean of the training rows whose class code
# (last column) equals `label`.
centroids = np.array([
    train[train[:, -1] == label].mean(axis=0)
    for label in range(k)
])
centroids

array([[5.096     , 3.516     , 1.504     , 0.256     , 0.        ],
       [5.84782609, 2.76086957, 4.2826087 , 1.34347826, 1.        ],
       [6.54074074, 2.93703704, 5.52222222, 2.00740741, 2.        ]])

### Calculating Training Accuracy

In [96]:
# Classify each training row by its nearest centroid (first n-1 columns only).
# np.argmin picks the first minimum, so ties go to the lowest class code, and
# any number of centroids is supported instead of exactly three.
prediction = np.array([
    np.argmin([e_dist(sample[:n-1], centroid[:n-1]) for centroid in centroids])
    for sample in train
])
# Percentage of training rows whose prediction matches the label column.
accuracy = np.mean(train[:, -1] == prediction)
accuracy*100

94.66666666666667

### Calculating Test Accuracy

In [97]:
# Classify each held-out test row by its nearest centroid (first n-1 columns
# only). np.argmin picks the first minimum, so ties go to the lowest class
# code, and any number of centroids is supported instead of exactly three.
prediction = np.array([
    np.argmin([e_dist(sample[:n-1], centroid[:n-1]) for centroid in centroids])
    for sample in test
])
# Percentage of test rows whose prediction matches the label column.
accuracy = np.mean(test[:, -1] == prediction)
accuracy*100

89.33333333333333