In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import the dataset (Excel workbook)

In [16]:
# Read the banknote measurements from the workbook in the working directory
# and show the resulting frame as the cell output.
df = pd.read_excel(io='BankNote_Authentication.xlsx')
df

Unnamed: 0,variance,skewness,curtosis,entropy,class
0,3.62160,8.66610,-2.8073,-0.44699,authentic
1,4.54590,8.16740,-2.4586,-1.46210,authentic
2,3.86600,-2.63830,1.9242,0.10645,authentic
3,3.45660,9.52280,-4.0112,-3.59440,authentic
4,0.32924,-4.45520,4.5718,-0.98880,authentic
...,...,...,...,...,...
1357,0.40614,1.34920,-1.4501,-0.55949,counterfeit
1358,-1.38870,-4.87730,6.4774,0.34179,counterfeit
1359,-3.75030,-13.45860,17.5932,-2.77710,counterfeit
1360,-3.56370,-8.38270,12.3930,-1.28230,counterfeit


## Data Preprocessing

Check whether there are any missing values

In [17]:
# Count the missing entries in each column (isna is the alias of isnull).
df.isna().sum()

variance    0
skewness    0
curtosis    0
entropy     0
class       0
dtype: int64

Check whether every column holds numerical data

In [18]:
# Show the dtype of every column so non-numeric columns stand out.
df.dtypes

variance    float64
skewness    float64
curtosis    float64
entropy     float64
class        object
dtype: object

The `class` column is not numerical (its dtype is `object`), so we need to encode it as integers

In [19]:
# Encode the target as integers: 0 is authentic, 1 is counterfeit.
CLASS_CODES = {'authentic': 0, 'counterfeit': 1}

# Only encode while the column still holds text labels. Without this guard,
# re-running the cell after the conversion fails, because the .str accessor
# is not available on an integer column.
if not pd.api.types.is_numeric_dtype(df['class']):
    # Exact-match lookup instead of substring replacement. A label that is not
    # in CLASS_CODES becomes NaN, which makes astype(int) raise instead of
    # letting an unexpected value slip through.
    df['class'] = df['class'].str.strip().map(CLASS_CODES).astype(int)

df

Unnamed: 0,variance,skewness,curtosis,entropy,class
0,3.62160,8.66610,-2.8073,-0.44699,0
1,4.54590,8.16740,-2.4586,-1.46210,0
2,3.86600,-2.63830,1.9242,0.10645,0
3,3.45660,9.52280,-4.0112,-3.59440,0
4,0.32924,-4.45520,4.5718,-0.98880,0
...,...,...,...,...,...
1357,0.40614,1.34920,-1.4501,-0.55949,1
1358,-1.38870,-4.87730,6.4774,0.34179,1
1359,-3.75030,-13.45860,17.5932,-2.77710,1
1360,-3.56370,-8.38270,12.3930,-1.28230,1


Check whether the two features have similar means

In [None]:
# The frame has no "V1"/"V2" columns (they are named variance, skewness,
# curtosis, entropy and class), so indexing by "V1" raised KeyError.
# NOTE(review): assumes V1/V2 meant the first two features - confirm.
v1_mean = df["variance"].mean()
v2_mean = df["skewness"].mean()
print("variance mean: " + str(v1_mean))
print("skewness mean: " + str(v2_mean))

Check the linear correlation between the columns

In [None]:
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap
# with two-decimal cell labels.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap="magma")

In [None]:
# Report the size of the frame: number of rows, then number of columns.
n_rows, n_columns = df.shape
print("Banks : ", n_rows)
print("features : ", n_columns)

In [None]:
# Select the features by name rather than by position: df.iloc[:, 1:3] picked
# columns 1 and 2 (skewness and curtosis) and skipped the first feature.
# NOTE(review): assumes the first two measurements are the intended pair,
# since the notebook refers to them as V1 and V2 - confirm.
X = df[['variance', 'skewness']]
# Target: the class label column (the last column of the frame).
y = df['class']

In [None]:
from sklearn.model_selection import GridSearchCV, KFold, train_test_split

# Hold out 20% of the rows as a test set; the seed is fixed so the split is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

## Training

Find a suitable K for K-means with the elbow method

In [None]:
from sklearn.cluster import KMeans

# Candidate cluster counts for the elbow method and the matching inertia
# (within-cluster sum of squared distances) of each fitted model.
k_meansclus = range(1, 10)
sse = []

for k in k_meansclus:
    # Fix the seed and the number of initialisations so the elbow curve is
    # identical on every Restart & Run All; n_init is set explicitly because
    # its default differs between scikit-learn versions.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(X)
    sse.append(km.inertia_)

In [None]:
# Plot the inertia against the number of clusters; the bend in the curve
# suggests the K to use.
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(k_meansclus, sse)
elbow_ax.set_title('The Elbow Method')
elbow_ax.set_xlabel('K')
elbow_ax.set_ylabel('Sum of the Squared Euclidean')
plt.show()

In [None]:
# Final model with 3 clusters. The seed and n_init are fixed so the cluster
# numbering (and therefore the colours in later plots) does not change between
# runs.
km1 = KMeans(n_clusters=3, n_init=10, random_state=42)
# fit_predict fits the model and returns the cluster index of every row of X.
y_cluster = km1.fit_predict(X)
y_cluster

In [None]:
fig = plt.figure(figsize=(14, 6))
# predicted
# Left panel of a 1x2 grid; this cell draws nothing in the right-hand slot.
ax1 = fig.add_subplot(121)
colours = ['blue', 'orange', 'green']

# Plot the two columns the model was fitted on. The frame has no 'V1'/'V2'
# columns, so looking them up on df raised KeyError; taking the names from X
# keeps the points, the centroids and the axis labels consistent.
x_name, y_name = X.columns[:2]
for idx, colour in enumerate(colours):
    members = X[y_cluster == idx]
    ax1.scatter(members[x_name], members[y_name], c=colour, edgecolor='k')

# cluster_centers_ has one row per cluster and one column per feature:
# column 0 is the x coordinate, column 1 the y coordinate. (Slicing [:3] for
# both arguments plotted the whole array against itself.)
centres = km1.cluster_centers_
ax1.scatter(centres[:, 0], centres[:, 1], s=50, c='red', label='Centroids', edgecolor='k')

ax1.set_xlabel(x_name)
ax1.set_ylabel(y_name)
ax1.set_title('Predicted Clusters with the Cluster Centers')
# Without a legend the 'Centroids' label is never shown.
ax1.legend()
plt.show()

## Model Evaluation

In [None]:
# Preparing the data for the model: an 80/20 train/test split with a fixed seed.
# This rebinds X_train, X_test, y_train and y_test, which an earlier cell also
# assigned using random_state=42.
from sklearn.model_selection import GridSearchCV, KFold, train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=99,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Base estimator; GridSearchCV clones it for every fit, so knn itself stays
# unfitted.
knn = KNeighborsClassifier()

# Try every neighbourhood size from 1 to 19.
grid_params_knn = {'n_neighbors': range(1, 20)}

# 10-fold cross-validation with shuffling and a fixed seed.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

gs = GridSearchCV(estimator=knn, param_grid=grid_params_knn, cv=cv)
gs.fit(X_train, y_train)
gs.best_params_