In [5]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Part 1: per-class 70/30 split of the steel-plate data, then KNN on the raw features.
df = pd.read_csv('SteelPlateFaults-2class.csv')  # reading the csv file

# Split each class on its own so both train and test keep the class proportions.
data_0 = df[df["Class"] == 0]  # rows where class = 0
data_1 = df[df["Class"] == 1]  # rows where class = 1

df_class_0 = data_0.Class  # labels passed to train_test_split for class 0
df_class_1 = data_1.Class  # labels passed to train_test_split for class 1

# drop() returns new frames instead of deleting a column from a filtered slice of df.
data_0 = data_0.drop(columns="Class")
data_1 = data_1.drop(columns="Class")

[X0_train, X0_test, X0_label_train, X0_label_test] = train_test_split(
    data_0, df_class_0, test_size=0.3, random_state=42, shuffle=True)  # class 0 train/test
[X1_train, X1_test, X1_label_train, X1_label_test] = train_test_split(
    data_1, df_class_1, test_size=0.3, random_state=42, shuffle=True)  # class 1 train/test

# Feature-only matrices: these (and only these) are what the classifier sees.
X_train_features = pd.concat((X0_train, X1_train), axis=0)
X_label_train = pd.concat((X0_label_train, X1_label_train), axis=0)

X_test_features = pd.concat((X0_test, X1_test), axis=0)
X_label_test = pd.concat((X0_label_test, X1_label_test), axis=0)

# Full frames (features + Class) are kept under the original names and saved for the later cells.
X_train = pd.concat((X_train_features, X_label_train), axis=1)
X_test = pd.concat((X_test_features, X_label_test), axis=1)

X_train.to_csv("SteelPlateFaults-train.csv", index=False)  # creating a csv for the training data
X_test.to_csv("SteelPlateFaults-test.csv", index=False)  # creating a csv for the testing data

k = [1, 3, 5]
Accuracy = []  # accuracies are appended here in run order and consumed by the summary cell

for i in k:
    neigh = KNeighborsClassifier(n_neighbors=i)  # specifying the k nearest neighbours
    # Fit/predict on the feature-only frames: X_train / X_test contain the Class
    # column, so using them here would leak the label into the distance computation.
    neigh.fit(X_train_features, X_label_train)

    predict_data = neigh.predict(X_test_features)
    con_m = confusion_matrix(X_label_test, predict_data)  # creating the confusion matrix
    accuracy = accuracy_score(X_label_test, predict_data)  # computed once, printed and stored

    print('Confusion Matrix for k = ',i,':')
    print(con_m)
    print()
    print('Accuracy score for k = ',i,' :')
    print(accuracy)
    Accuracy.append(accuracy)  # appending to the accuracy list for final use

Confusion Matrix for k =  1 :
[[ 93  25]
 [ 19 200]]

Accuracy score for k =  1  :
0.8694362017804155
Confusion Matrix for k =  3 :
[[ 92  26]
 [  9 210]]

Accuracy score for k =  3  :
0.8961424332344213
Confusion Matrix for k =  5 :
[[ 92  26]
 [ 10 209]]

Accuracy score for k =  5  :
0.8931750741839762


In [2]:
# Part 2: min-max normalise the features with TRAIN statistics, then repeat KNN.
df_train = pd.read_csv('SteelPlateFaults-train.csv')  # reading the train dataset created earlier
df_test = pd.read_csv('SteelPlateFaults-test.csv')  # reading the test dataset created earlier

# Only the features are rescaled; the Class label column is left untouched.
feature_columns = [column for column in df_train.columns if column != "Class"]

# Statistics are taken from the training data before anything is modified, so the
# test set is scaled with exactly the same min/range as the training set.
train_min = df_train[feature_columns].min()
train_range = df_train[feature_columns].max() - train_min
# A feature that is constant in the training data has range 0; dividing by 1
# instead maps it to 0 rather than producing NaN/inf.
train_range = train_range.where(train_range != 0, 1)

for column in feature_columns:
    df_test[column] = (df_test[column] - train_min[column]) / train_range[column]  # normalizing the test dataset
    df_train[column] = (df_train[column] - train_min[column]) / train_range[column]  # normalizing the train dataset

df_train.to_csv("SteelPlateFaults-train-Normalised.csv", index=False)  # creating a csv for the normalized training data
df_test.to_csv("SteelPlateFaults-test-Normalised.csv", index=False)  # creating a csv for the normalized testing data

# Feature-only views for the classifier, taken from memory rather than re-reading the CSVs.
df_train_1 = df_train.drop(columns="Class")
df_test_1 = df_test.drop(columns="Class")

k = [1, 3, 5]
for i in k:
    neigh = KNeighborsClassifier(n_neighbors=i)  # specifying the k nearest neighbours
    neigh.fit(df_train_1, df_train.Class)  # fitting on the normalized training features

    predict_data = neigh.predict(df_test_1)  # predicting the class of the normalized test rows
    con_m = confusion_matrix(df_test.Class, predict_data)  # creating the confusion matrix
    accuracy = accuracy_score(df_test.Class, predict_data)  # computed once, printed and stored

    print('Confusion Matrix for k = ',i,':')
    print(con_m)
    print()
    print('Accuracy score for k = ',i,' :')
    print(accuracy)
    # NOTE: `Accuracy` is the list created in the first cell; this cell appends
    # to it, so re-running this cell alone adds duplicate entries.
    Accuracy.append(accuracy)  # appending to the accuracy list for final use
    

Confusion Matrix for k =  1 :
[[111   7]
 [  6 213]]

Accuracy score for k =  1  :
0.9614243323442137
Confusion Matrix for k =  3 :
[[112   6]
 [  4 215]]

Accuracy score for k =  3  :
0.9703264094955489
Confusion Matrix for k =  5 :
[[112   6]
 [  3 216]]

Accuracy score for k =  5  :
0.973293768545994


In [3]:
# Part 3: Bayes classifier with one full-covariance multivariate Gaussian per class.
X_train = pd.read_csv('SteelPlateFaults-train.csv')
X_test = pd.read_csv('SteelPlateFaults-test.csv')

# These two columns are excluded from the Gaussian model
# (presumably because they make the covariance matrices singular - TODO confirm).
dropped_columns = ['TypeOfSteel_A400', 'TypeOfSteel_A300']
X_train = X_train.drop(columns=dropped_columns)
X_test = X_test.drop(columns=dropped_columns)

df_test_class = X_test['Class']
X_test = X_test.drop(columns='Class')

X_train_class0 = X_train[X_train['Class'] == 0].drop(columns='Class')
X_train_class1 = X_train[X_train['Class'] == 1].drop(columns='Class')

Mean_C0 = X_train_class0.mean().values  # Getting the mean of the values with class 0
Covariance_Class0 = np.cov(X_train_class0.T)  # Getting the covariance matrix with class 0
Mean_C1 = X_train_class1.mean().values  # Getting the mean of the values with class 1
Covariance_Class1 = np.cov(X_train_class1.T)  # Getting the covariance matrix with class 1

P_C0 = len(X_train_class0) / (len(X_train_class0) + len(X_train_class1))  # prior probability of C = 0
P_C1 = len(X_train_class1) / (len(X_train_class0) + len(X_train_class1))  # prior probability of C = 1

# Dimensionality of the feature vectors. Class has already been removed from
# X_test above, so no further "- 1" is needed.
d = len(X_test.columns)


def _log_joint_gaussian(samples, mean, covariance, prior):
    """Return log(p(x | C) * P(C)) for every row of `samples`.

    p(x | C) is the multivariate normal density with the given `mean` and
    `covariance`. Working with logarithms avoids the underflow of
    exp(-0.5 * distance) (and the resulting 0/0 posterior) for rows that lie
    far from a class mean.

    Raises:
        ValueError: if the determinant of `covariance` is not positive.
    """
    sign, log_det = np.linalg.slogdet(covariance)
    if sign <= 0:
        raise ValueError("covariance matrix must have a positive determinant")
    inverse = np.linalg.inv(covariance)  # computed once, not once per test row
    centered = samples - mean
    # Row-wise quadratic form (x - mean)^T . inverse . (x - mean)
    mahalanobis = np.sum(np.dot(centered, inverse) * centered, axis=1)
    n_features = samples.shape[1]
    log_likelihood = -0.5 * (n_features * np.log(2 * np.pi) + log_det + mahalanobis)
    return log_likelihood + np.log(prior)


# A diagonal-covariance variant can be obtained by zeroing the off-diagonal
# entries of each covariance matrix, e.g. np.diag(np.diag(Covariance_Class0)).
test_values = X_test.values
log_joint_C0 = _log_joint_gaussian(test_values, Mean_C0, Covariance_Class0, P_C0)
log_joint_C1 = _log_joint_gaussian(test_values, Mean_C1, Covariance_Class1, P_C1)

# The evidence P(x) is common to both posteriors, so comparing the joint
# log-probabilities gives the same decision; ties go to class 1 as before.
Predicted_class = np.where(log_joint_C0 > log_joint_C1, 0, 1).tolist()


print('Confusion Matrix :')
print(confusion_matrix(df_test_class,Predicted_class))
print('Accuracy score :')
bayes_accuracy = accuracy_score(df_test_class, Predicted_class)
print(bayes_accuracy)
# NOTE: `Accuracy` is the list created in the first cell; the summary cell
# expects this value to be its 7th entry.
Accuracy.append(bayes_accuracy)

Confusion Matrix :
[[102  16]
 [  2 217]]
Accuracy score :
0.9465875370919882


In [4]:
# Part 4: summary table.
# `Accuracy` is read positionally: entries 0-2 are the raw-data KNN runs
# (k = 1, 3, 5), entries 3-5 the normalized-data KNN runs and entry 6 the
# Bayes classifier, i.e. the order in which the cells above append to it.
Accuracy_1 = max(Accuracy[position] for position in range(0, 3))
Accuracy_2 = max(Accuracy[position] for position in range(3, 6))
Accuracy_3 = Accuracy[6]

process_names = ['original knn = 5', 'normalized knn = 5', 'bayesian classifier']
reported_accuracies = [Accuracy[position] for position in (2, 5, 6)]  # the k = 5 runs and Bayes

data = dict(Process=process_names, Accuracy=reported_accuracies)

q4 = pd.DataFrame(data)

print(q4)

               Process  Accuracy
0     original knn = 5  0.893175
1   normalized knn = 5  0.973294
2  bayesian classifier  0.946588
