# Minimum Distance Classifier (MDC)

## Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the vowel dataset and preview five randomly chosen rows.
# (A bare `df.head()` in the middle of a cell is never displayed, so it was dropped.)
df = pd.read_csv('Vowel.csv')
df.sample(5)

Unnamed: 0,Class,F1,F2,F3
815,6,400,850,2300
715,6,500,1000,2300
478,4,350,800,2630
541,5,500,1550,2450
599,5,400,2000,2600


In [3]:
# Number of samples per original class label, most frequent first
df.value_counts(subset='Class')

Class
5    207
6    180
3    172
4    151
2     89
1     72
Name: count, dtype: int64

## Assign codes to different classes

In [5]:
# Zero-based class code: original label minus one (labels 1..6 become codes 0..5)
df['class'] = df['Class'].sub(1)
df.sample(5)

Unnamed: 0,Class,F1,F2,F3,class
352,4,450,1000,1800,3
470,4,300,1000,2400,3
492,5,600,1900,2400,4
637,5,500,1400,2500,4
593,5,550,2100,2700,4


## Using entire dataset for training

### Classwise Centroid Calculation

In [6]:
# Drop the original label and convert to a NumPy array.
# The remaining columns are the features followed by the zero-based class code (last column).
ndata = df.drop('Class', axis=1).to_numpy()
ndata

array([[ 700, 1500, 2600,    0],
       [ 550, 1550, 2400,    0],
       [ 700, 1500, 2600,    0],
       ...,
       [ 500, 1000, 2800,    5],
       [ 500,  900, 2800,    5],
       [ 500,  950, 2700,    5]], dtype=int64)

In [7]:
# m = number of rows (samples), n = number of columns (features + class code)
m, n = ndata.shape[0], ndata.shape[1]
(m, n)

(871, 4)

In [8]:
# Number of distinct class codes, i.e. how many centroids are needed
k = df['class'].nunique(dropna=True)
k

6

In [9]:
# Placeholder list with one slot per class; filled with the class means in the next cell
centroids = [0 for _ in range(k)]

In [10]:
# One centroid per class: the column-wise mean of all rows carrying that class code.
# The label column is averaged too, so the last entry of each centroid equals its class code.
centroids = np.array([ndata[ndata[:, -1] == label].mean(axis=0) for label in range(k)])
centroids

array([[6.03472222e+02, 1.46805556e+03, 2.37930556e+03, 0.00000000e+00],
       [6.98314607e+02, 1.24044944e+03, 2.33820225e+03, 1.00000000e+00],
       [3.42209302e+02, 2.20203488e+03, 2.80505814e+03, 2.00000000e+00],
       [3.58145695e+02, 9.78476821e+02, 2.49450331e+03, 3.00000000e+00],
       [5.04830918e+02, 1.86657005e+03, 2.61743961e+03, 4.00000000e+00],
       [4.81944444e+02, 1.05727778e+03, 2.50161111e+03, 5.00000000e+00]])

### Compute Euclidean distances of each row from the centroids

In [11]:
#defining euclidean distance first

def e_dist(a,b):
    """Return the Euclidean (L2) distance between `a` and `b`.

    The element-wise difference `a - b` is squared, summed over all
    elements and square-rooted, so the result is a single scalar.
    """
    delta = a - b
    squared = np.square(delta)
    return np.sqrt(np.sum(squared))

### Testing Phase: Calculate distance of every point from each centroid and assign it to the nearest class

In [14]:
# Euclidean distance from every row to every centroid -> array of shape (rows, classes).
# The last column of both arrays holds the class code, so it is excluded from the features.
# Broadcasting replaces the six hard-coded distance variables, so any number of classes works.
dists = np.sqrt(np.sum(np.square(ndata[:, None, :-1] - centroids[None, :, :-1]), axis=2))
# Assign each row to its nearest centroid; argmin keeps the first minimum,
# so ties resolve to the lowest class code exactly like the former if/elif chain.
prediction = np.argmin(dists, axis=1)
prediction

array([0, 0, 0, 4, 0, 5, 5, 4, 0, 0, 4, 4, 4, 4, 0, 0, 0, 4, 0, 0, 0, 0,
       0, 5, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 4, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 4, 4, 1,
       0, 0, 1, 4, 0, 1, 1, 1, 1, 5, 1, 5, 1, 1, 5, 1, 5, 1, 5, 1, 0, 5,
       1, 5, 5, 5, 5, 1, 1, 1, 1, 5, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 0, 4,
       4, 4, 4, 4, 0, 4, 4, 4, 4, 4, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

### Training Accuracy

In [15]:
# Share of rows whose predicted code matches the true code (last column), shown as a percentage
accuracy = np.mean(prediction == ndata[:, -1])
accuracy * 100

71.4121699196326

## Splitting Train test data

### 80-20 Split

In [17]:
# Features: every column except the two label columns; target: the zero-based class code
X = df.drop(['Class', 'class'], axis=1)
Y = df['class']
# 80% train, 20% test; stratified on the class code, fixed seed for a repeatable split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

In [22]:
# Build NumPy arrays with the feature columns first and the class code as the last column
train = np.column_stack((X_train.to_numpy(), Y_train.to_numpy()))
test = np.column_stack((X_test.to_numpy(), Y_test.to_numpy()))

In [31]:
# (rows, columns) of the training array
np.shape(train)

(696, 4)

In [32]:
# (rows, columns) of the test array
np.shape(test)

(175, 4)

### Calculating centroids using 80% dataset

In [23]:
# One centroid per class, computed from the training rows only:
# the column-wise mean of all rows carrying that class code (label column included).
centroids = np.array([train[train[:, -1] == label].mean(axis=0) for label in range(k)])
centroids

array([[6.07758621e+02, 1.47155172e+03, 2.39293103e+03, 0.00000000e+00],
       [6.97887324e+02, 1.24718310e+03, 2.32676056e+03, 1.00000000e+00],
       [3.43138686e+02, 2.19343066e+03, 2.79313869e+03, 2.00000000e+00],
       [3.57272727e+02, 9.76446281e+02, 2.49644628e+03, 3.00000000e+00],
       [5.06060606e+02, 1.86757576e+03, 2.61309091e+03, 4.00000000e+00],
       [4.85069444e+02, 1.05875000e+03, 2.49729167e+03, 5.00000000e+00]])

### Calculating Training Accuracy

In [33]:
# Euclidean distance from every training row to every centroid -> shape (rows, classes).
# The last column of both arrays is the class code and is excluded from the features.
# Broadcasting replaces the six hard-coded distance variables, so any number of classes works.
dists = np.sqrt(np.sum(np.square(train[:, None, :-1] - centroids[None, :, :-1]), axis=2))
# Nearest centroid wins; argmin keeps the first minimum, so ties go to the lowest class code
prediction = np.argmin(dists, axis=1)
accuracy = np.mean(train[:, -1] == prediction)
accuracy * 100

71.4080459770115

### Calculating Test Accuracy

In [34]:
# Euclidean distance from every test row to every centroid -> shape (rows, classes).
# The last column of both arrays is the class code and is excluded from the features.
# Broadcasting replaces the six hard-coded distance variables, so any number of classes works.
dists = np.sqrt(np.sum(np.square(test[:, None, :-1] - centroids[None, :, :-1]), axis=2))
# Nearest centroid wins; argmin keeps the first minimum, so ties go to the lowest class code
prediction = np.argmin(dists, axis=1)
accuracy = np.mean(test[:, -1] == prediction)
accuracy * 100

68.57142857142857

### 70-30 Split

In [35]:
# 70% train, 30% test; stratified on the class code, fixed seed for a repeatable split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
    stratify=Y,
)

In [36]:
# Build NumPy arrays with the feature columns first and the class code as the last column
train = np.column_stack((X_train.to_numpy(), Y_train.to_numpy()))
test = np.column_stack((X_test.to_numpy(), Y_test.to_numpy()))

### Calculating centroids using 70% dataset

In [37]:
# One centroid per class, computed from the training rows only:
# the column-wise mean of all rows carrying that class code (label column included).
centroids = np.array([train[train[:, -1] == label].mean(axis=0) for label in range(k)])
centroids

array([[6.09000000e+02, 1.47500000e+03, 2.40060000e+03, 0.00000000e+00],
       [6.96774194e+02, 1.25322581e+03, 2.33145161e+03, 1.00000000e+00],
       [3.42583333e+02, 2.19291667e+03, 2.78991667e+03, 2.00000000e+00],
       [3.61603774e+02, 9.75943396e+02, 2.49122642e+03, 3.00000000e+00],
       [5.04827586e+02, 1.86793103e+03, 2.62110345e+03, 4.00000000e+00],
       [4.83333333e+02, 1.05523810e+03, 2.48873016e+03, 5.00000000e+00]])

### Calculating Training Accuracy

In [38]:
# Euclidean distance from every training row to every centroid -> shape (rows, classes).
# The last column of both arrays is the class code and is excluded from the features.
# Broadcasting replaces the six hard-coded distance variables, so any number of classes works.
dists = np.sqrt(np.sum(np.square(train[:, None, :-1] - centroids[None, :, :-1]), axis=2))
# Nearest centroid wins; argmin keeps the first minimum, so ties go to the lowest class code
prediction = np.argmin(dists, axis=1)
accuracy = np.mean(train[:, -1] == prediction)
accuracy * 100

70.44334975369459

### Calculating Test Accuracy

In [40]:
# Euclidean distance from every test row to every centroid -> shape (rows, classes).
# The last column of both arrays is the class code and is excluded from the features.
# Broadcasting replaces the six hard-coded distance variables, so any number of classes works.
dists = np.sqrt(np.sum(np.square(test[:, None, :-1] - centroids[None, :, :-1]), axis=2))
# Nearest centroid wins; argmin keeps the first minimum, so ties go to the lowest class code
prediction = np.argmin(dists, axis=1)
accuracy = np.mean(test[:, -1] == prediction)
accuracy * 100

70.99236641221374

### 60-40 Split

In [42]:
# 60% train, 40% test; stratified on the class code, fixed seed for a repeatable split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=42,
    stratify=Y,
)

In [45]:
# Build NumPy arrays with the feature columns first and the class code as the last column
train = np.column_stack((X_train.to_numpy(), Y_train.to_numpy()))
test = np.column_stack((X_test.to_numpy(), Y_test.to_numpy()))

### Calculating centroids using 60% dataset

In [46]:
# One centroid per class, computed from the training rows only:
# the column-wise mean of all rows carrying that class code (label column included).
centroids = np.array([train[train[:, -1] == label].mean(axis=0) for label in range(k)])
centroids

array([[6.16279070e+02, 1.47674419e+03, 2.39790698e+03, 0.00000000e+00],
       [6.99056604e+02, 1.24716981e+03, 2.33867925e+03, 1.00000000e+00],
       [3.42815534e+02, 2.18786408e+03, 2.78155340e+03, 2.00000000e+00],
       [3.58791209e+02, 9.69780220e+02, 2.49494505e+03, 3.00000000e+00],
       [5.08870968e+02, 1.85564516e+03, 2.62209677e+03, 4.00000000e+00],
       [4.82407407e+02, 1.05333333e+03, 2.46805556e+03, 5.00000000e+00]])

### Calculating Training Accuracy

In [47]:
# Euclidean distance from every training row to every centroid -> shape (rows, classes).
# The last column of both arrays is the class code and is excluded from the features.
# Broadcasting replaces the six hard-coded distance variables, so any number of classes works.
dists = np.sqrt(np.sum(np.square(train[:, None, :-1] - centroids[None, :, :-1]), axis=2))
# Nearest centroid wins; argmin keeps the first minimum, so ties go to the lowest class code
prediction = np.argmin(dists, axis=1)
accuracy = np.mean(train[:, -1] == prediction)
accuracy * 100

71.26436781609196

### Calculating Test Accuracy

In [None]:
# Euclidean distance from every test row to every centroid -> shape (rows, classes).
# The last column of both arrays is the class code and is excluded from the features.
# Broadcasting replaces the six hard-coded distance variables, so any number of classes works.
dists = np.sqrt(np.sum(np.square(test[:, None, :-1] - centroids[None, :, :-1]), axis=2))
# Nearest centroid wins; argmin keeps the first minimum, so ties go to the lowest class code
prediction = np.argmin(dists, axis=1)
accuracy = np.mean(test[:, -1] == prediction)
accuracy * 100

72.77936962750717

### 50-50 Split

In [49]:
# 50% train, 50% test; stratified on the class code, fixed seed for a repeatable split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    random_state=42,
    stratify=Y,
)

In [52]:
# Build NumPy arrays with the feature columns first and the class code as the last column
train = np.column_stack((X_train.to_numpy(), Y_train.to_numpy()))
test = np.column_stack((X_test.to_numpy(), Y_test.to_numpy()))

### Calculating centroids using 50% dataset

In [53]:
# One centroid per class, computed from the training rows only:
# the column-wise mean of all rows carrying that class code (label column included).
centroids = np.array([train[train[:, -1] == label].mean(axis=0) for label in range(k)])
centroids

array([[6.22222222e+02, 1.47222222e+03, 2.38500000e+03, 0.00000000e+00],
       [6.97777778e+02, 1.24333333e+03, 2.34222222e+03, 1.00000000e+00],
       [3.41395349e+02, 2.19244186e+03, 2.79848837e+03, 2.00000000e+00],
       [3.56666667e+02, 9.67333333e+02, 2.48946667e+03, 3.00000000e+00],
       [5.12135922e+02, 1.84320388e+03, 2.61349515e+03, 4.00000000e+00],
       [4.81111111e+02, 1.04400000e+03, 2.46588889e+03, 5.00000000e+00]])

### Calculating Training Accuracy

In [54]:
# Euclidean distance from every training row to every centroid -> shape (rows, classes).
# The last column of both arrays is the class code and is excluded from the features.
# Broadcasting replaces the six hard-coded distance variables, so any number of classes works.
dists = np.sqrt(np.sum(np.square(train[:, None, :-1] - centroids[None, :, :-1]), axis=2))
# Nearest centroid wins; argmin keeps the first minimum, so ties go to the lowest class code
prediction = np.argmin(dists, axis=1)
accuracy = np.mean(train[:, -1] == prediction)
accuracy * 100

72.8735632183908

### Calculating Test Accuracy

In [55]:
# Euclidean distance from every test row to every centroid -> shape (rows, classes).
# The last column of both arrays is the class code and is excluded from the features.
# Broadcasting replaces the six hard-coded distance variables, so any number of classes works.
dists = np.sqrt(np.sum(np.square(test[:, None, :-1] - centroids[None, :, :-1]), axis=2))
# Nearest centroid wins; argmin keeps the first minimum, so ties go to the lowest class code
prediction = np.argmin(dists, axis=1)
accuracy = np.mean(test[:, -1] == prediction)
accuracy * 100

73.39449541284404