#  Data Classification : K-Nearest Neighbour Classifier and Bayes Classifier with Unimodal Gaussian Density

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
import numpy as np

Using Steel Plates Faults Data Set as a csv file

In [2]:
df = pd.read_csv('SteelPlateFaults-2class.csv')
df.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Class
0,1325,1339,30207,30238,268,29,31,25809,79,124,...,0.4828,1.0,1.0,2.4281,1.1461,1.4914,0.5484,-0.2476,0.7065,1
1,1,16,55572,55629,370,48,62,39293,27,119,...,0.3125,0.9194,1.0,2.5682,1.1761,1.7559,0.7368,-0.1703,0.9755,1
2,1323,1333,68445,68506,330,48,61,33449,90,119,...,0.2083,1.0,1.0,2.5185,1.0,1.7853,0.8361,-0.2081,0.8861,1
3,1324,1333,75642,75681,207,25,39,21354,93,124,...,0.36,1.0,1.0,2.316,0.9542,1.5911,0.7692,-0.1941,0.5805,1
4,1324,1335,97132,97213,594,55,81,61608,93,125,...,0.2,1.0,1.0,2.7738,1.0414,1.9085,0.8642,-0.1897,0.9806,1


Splitting the data of each class from SteelPlateFaults-2class.csv into train data and test data

In [9]:
#In class '0'
df_0 = df[df["Class"]==0]

[X_train_0, X_test_0,X_label_train_0,X_label_test_0] = train_test_split(df_0,
                                   df_0['Class'], test_size=0.3,
                                   random_state=42, shuffle=True)

#In class '1'
df_1 = df[df["Class"]==1]

[X_train_1, X_test_1,
  X_label_train_1,
  X_label_test_1] = train_test_split(df_1,
                                   df_1['Class'], test_size=0.3,
                                   random_state=42, shuffle=True)                            
## Joining the training of class 0 and 1
# and testing data of class 0 and 1
[X_train, X_test, X_label_train,
 X_label_test] = [X_train_0.append(X_train_1),
                  X_test_0.append(X_test_1),
                  X_label_train_0.append(X_label_train_1),
                  X_label_test_0.append(X_label_test_1)]

Saving the training and testing data in CSV files

In [10]:
X_train.to_csv('SteelPlateFaults-2class-train.csv', index=False)
X_test.to_csv('SteelPlateFaults-2class-test.csv', index=False)

# KNN Classifier

Using the KNN Classifier to classify given dataset

def knn_classifier(x_train, x_test, x_label_test, x_label_train):
    for i in range(1, 6, 2):
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(x_train, x_label_train)
        
        # Printing the Accuracies and Confusion matrix for each K
        print(' K = {:}'.format(i))
        print(' Accuracy : {:.3f}'
             .format(knn.score(x_test, x_label_test)))
        print(' Confusion Matrix :\n')
        print(confusion_matrix(x_label_test, knn.predict(x_test)),'\n')
        if(i == 5):
            return knn.score(x_test, x_label_test)
    return 0

Performing the KNN classification technique

In [12]:
best_accuracy_knn = knn_classifier(X_train[list(df)[:-1]], X_test[list(df)[:-1]], X_label_test, X_label_train)

 K = 1
 Accuracy : 0.869
 Confusion Matrix :

[[ 93  25]
 [ 19 200]] 

 K = 3
 Accuracy : 0.896
 Confusion Matrix :

[[ 92  26]
 [  9 210]] 

 K = 5
 Accuracy : 0.893
 Confusion Matrix :

[[ 92  26]
 [ 10 209]] 



Using KNN Classifier after normalizing all the attributes (except class attribute) of SteelPlateFaults-train.csv
using Min-Max normalization to transform the data in the range [0-1]

In [13]:
min_max_scaler = MinMaxScaler()
X_train_normalised = min_max_scaler.fit_transform(X_train)
X_train_normalised = pd.DataFrame(X_train_normalised)
X_train_normalised.rename(columns={i: list(df)[i] for i in range(len(list(df)))}, inplace=True)
X_train_normalised.to_csv('SteelPlateFaults-2class-train-normalised.csv', index=False)

# Dropping the tuples having out of bound values
# (As compared with the min. and max. from training data)
drop_tuple_indexes = set()
for i in range(len(list(df))):
    for j in X_test.index:
        if(X_test[list(X_test)[i]][j] < min_max_scaler.data_min_[i]):
            drop_tuple_indexes.add(j)
        if(X_test[list(X_test)[i]][j] > min_max_scaler.data_max_[i]):
            drop_tuple_indexes.add(j)

X_test_normalised = min_max_scaler.fit_transform(X_test.drop(list(drop_tuple_indexes), axis=0))
X_test_normalised = pd.DataFrame(X_test_normalised)
X_test_normalised.rename(columns={i: list(df)[i] for i in range(len(list(df)))}, inplace=True)
X_test_normalised.to_csv('SteelPlateFaults-2class-test-normalised.csv', index=False)

 Appying the KNN classfication technique

In [14]:
best_accuracy_knn_normalised = knn_classifier(X_train_normalised[list(df)[:-1]],
                                              X_test_normalised[list(df)[:-1]],
                                              X_test_normalised['Class'],
                                              X_train_normalised['Class'])

 K = 1
 Accuracy : 0.964
 Confusion Matrix :

[[109   7]
 [  5 208]] 

 K = 3
 Accuracy : 0.976
 Confusion Matrix :

[[111   5]
 [  3 210]] 

 K = 5
 Accuracy : 0.979
 Confusion Matrix :

[[111   5]
 [  2 211]] 



#  Bayes Classifier

Building a Bayes Classifer with given training data
and testing on the testing data

In [15]:
# Dimension of the training and testing data
d = 27

# Function to calculate likelihood of a class for given test sample
def likelihood(x, mean, cov_matrix):
    x = np.array(x)
    mean = np.array(mean)
    cov_matrix = np.array(cov_matrix)
    val = (1/(((2*np.pi)**(d/2))*(np.linalg.det(cov_matrix)**0.5)))
    val *= np.exp(-0.5*np.dot(np.dot((x - mean).T, np.linalg.inv(cov_matrix)), (x - mean)))
    return val

# Priors of each class from the training data
prior_0 = list(X_train['Class']).count(0)/len(X_train['Class'])
prior_1 = list(X_train['Class']).count(1)/len(X_train['Class'])



df_0 = df_0[list(df_0)[:-1]]
df_1 = df_1[list(df_1)[:-1]]




# Mean matrices for each class
mean_0 = df_0.mean().to_numpy()
mean_1 = df_1.mean().to_numpy()


# Covariance matrices for each class
cov_matrix_0 = df_0.cov().to_numpy()
cov_matrix_1 = df_1.cov().to_numpy()



# Predicted test labels
X_label_test_predicted = []
for i in np.array(X_test[list(X_test)[:-1]]):
    likl_0 = likelihood(i, mean_0, cov_matrix_0)
    likl_1 = likelihood(i, mean_1, cov_matrix_1)
    posterior_0 = (likl_0 * prior_0)/ (likl_0 * prior_0 + likl_1 * prior_1)
    posterior_1 = (likl_1 * prior_1)/ (likl_0 * prior_0 + likl_1 * prior_1)
    if(posterior_0 > posterior_1):
        X_label_test_predicted.append(0)
    else:
        X_label_test_predicted.append(1)

  val = (1/(((2*np.pi)**(d/2))*(np.linalg.det(cov_matrix)**0.5)))
  val *= np.exp(-0.5*np.dot(np.dot((x - mean).T, np.linalg.inv(cov_matrix)), (x - mean)))


In [17]:
print(confusion_matrix(X_label_test, X_label_test_predicted))
print('\n Accuracy: %.2f'%(accuracy_score(X_label_test, X_label_test_predicted)))

[[  0 118]
 [  0 219]]

 Accuracy: 0.65


# Tablulating the best results of each Classifier

In [18]:
res = pd.DataFrame({'KNN':best_accuracy_knn,
                    'KNN Normalised':best_accuracy_knn_normalised,
                    'Bayes':accuracy_score(X_label_test, X_label_test_predicted)}.items(), columns=['Classifier', 'Accuracy'])
print('\n',res)


        Classifier  Accuracy
0             KNN  0.893175
1  KNN Normalised  0.978723
2           Bayes  0.649852


Here we can see KNN Classifier when used on normalized dataset gives the best result than the other two.