In [2]:
# import the necessary packages
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.datasets import load_iris
import argparse

In [3]:
#giving the dictionary of models
models = {
	"knn": KNeighborsClassifier(n_neighbors=4),
	"naive_bayes": GaussianNB(),
	"logit": LogisticRegression(solver="lbfgs", multi_class="auto"),
	"svm": SVC(kernel="rbf", gamma="auto"),
	"decision_tree": DecisionTreeClassifier(),
	"random_forest": RandomForestClassifier(n_estimators=100),
	"mlp": MLPClassifier()
}

In [4]:
#Import pandas
import pandas as pd

In [5]:
#load data ecoli
dataset = pd.read_csv("ecoli.csv")

In [6]:
dataset.head()

Unnamed: 0,Sequence Name,mcg,gvh,lip,chg,aac,alm1,alm2,class
0,AAT_ECOLI,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,ACEA_ECOLI,0.07,0.4,0.48,0.5,0.54,0.35,0.44,cp
2,ACEK_ECOLI,0.56,0.4,0.48,0.5,0.49,0.37,0.46,cp
3,ACKA_ECOLI,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,ADI_ECOLI,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp


In [7]:
dataset.keys()

Index(['Sequence Name', 'mcg', 'gvh', 'lip', 'chg', 'aac', 'alm1', 'alm2',
       'class'],
      dtype='object')

In [8]:
X = dataset[['mcg', 'gvh', 'lip', 'chg', 'aac', 'alm1', 'alm2']] .values  #.astype(float)
X[0:5]

array([[0.49, 0.29, 0.48, 0.5 , 0.56, 0.24, 0.35],
       [0.07, 0.4 , 0.48, 0.5 , 0.54, 0.35, 0.44],
       [0.56, 0.4 , 0.48, 0.5 , 0.49, 0.37, 0.46],
       [0.59, 0.49, 0.48, 0.5 , 0.52, 0.45, 0.36],
       [0.23, 0.32, 0.48, 0.5 , 0.55, 0.25, 0.35]])

In [9]:
y = dataset['class'].values
y[0:5]

array(['cp', 'cp', 'cp', 'cp', 'cp'], dtype=object)

In [10]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.25, random_state=1)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (252, 7) (252,)
Test set: (84, 7) (84,)


In [11]:
k = 4
#Train Model and Predict  
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
neigh

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=4, p=2,
           weights='uniform')

In [12]:
yhat = neigh.predict(X_test)
yhat[0:5]

array(['cp', 'cp', 'im', 'im', 'im'], dtype=object)

In [13]:
from sklearn import metrics
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

Train set Accuracy:  0.876984126984127
Test set Accuracy:  0.8928571428571429


In [14]:
import numpy as np
Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
ConfustionMx = [];
for n in range(1,Ks):
    
    #Train Model and Predict  
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    yhat=neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)

    
    std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])

mean_acc

array([0.83333333, 0.86904762, 0.89285714, 0.89285714, 0.89285714,
       0.88095238, 0.89285714, 0.89285714, 0.89285714])

In [15]:
import matplotlib.pyplot as plt
plt.plot(range(1,Ks),mean_acc,'g')
plt.fill_between(range(1,Ks),mean_acc - 1 * std_acc,mean_acc + 1 * std_acc, alpha=0.10)
plt.legend(('Accuracy ', '+/- 3xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Nabors (K)')
plt.tight_layout()
plt.show()

<Figure size 640x480 with 1 Axes>

In [16]:
print( "The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax()+1) 

The best accuracy was with 0.8928571428571429 with k= 3


In [17]:
test = [0.49, 0.29, 0.48, 0.5, 0.56, 0.24, 0.35] #masukan yang ingin diprediksi kelasnya
test = np.asarray(test)
test = test.reshape(1,-1)
tespredict = neigh.predict(test)
print("Hasil Prediksi :", tespredict)

Hasil Prediksi : ['cp']


In [18]:
dataset['class'].value_counts()

cp     143
im      77
pp      52
imU     35
om      20
omL      5
imS      2
imL      2
Name: class, dtype: int64

In [19]:
target_names = ["cp", "im", "pp", "imU", "om", "omL"]

In [20]:
predictions = neigh.predict(X_test)
print(classification_report(y_test, predictions, target_names=target_names))


              precision    recall  f1-score   support

          cp       0.97      0.97      0.97        39
          im       0.88      0.78      0.82        18
          pp       0.57      0.67      0.62         6
         imU       1.00      0.75      0.86         4
          om       0.00      0.00      0.00         0
         omL       0.89      0.94      0.91        17

   micro avg       0.89      0.89      0.89        84
   macro avg       0.72      0.68      0.70        84
weighted avg       0.91      0.89      0.90        84



  'recall', 'true', average, warn_for)
