In [41]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# Load the iris data from a CSV file in the current working directory.
# An earlier revision switched the working directory to
# '/content/drive/My Drive/datasets' before reading (presumably a Colab/Drive
# setup); change `filename` if the file lives somewhere else.
filename = 'iris.csv'
dataset = pd.read_csv(filename)

In [92]:
# Separate the target from the predictors: 'Species' is the class label and
# every column other than 'Id' and 'Species' is used as a feature.
label = dataset['Species']
feature_used = dataset.drop(columns=['Id', 'Species'])

# Standardize the feature matrix.
# NOTE(review): the scaler is fitted on the full dataset *before* the split,
# so the test rows contribute to the scaling statistics. Consider fitting the
# scaler on the training split only.
scaler = preprocessing.StandardScaler()
feature_scaled = scaler.fit_transform(feature_used.values)

# Hold out 15% of the rows for testing (85% for training).
# random_state=3 pins the shuffle so the split is the same on every run.
(feature_train, feature_test,
 label_train, label_test) = train_test_split(
    feature_scaled,
    label,
    test_size=0.15,
    random_state=3,
)

# **No. 1 Pergantian RandomForest Estimator**

In [93]:
# Compare random forests that differ only in the number of trees.
# No random_state is passed to the classifier, so the fitted forest (and
# therefore the scores) can vary from one run to the next.
report_line = 'Accuracy of Random Forest with estimators {} on test set: {:.2f}'
macro_metrics = (f1_score, precision_score, recall_score)

estimator = [50, 100, 200]
for n_trees in estimator:
    rf_model = RandomForestClassifier(
        n_estimators=n_trees,
        bootstrap=True,
        max_features="sqrt",
    )
    rf_model.fit(feature_train, label_train)
    prediction = rf_model.predict(feature_test)

    # First line: accuracy on the test split, as returned by .score().
    test_accuracy = rf_model.score(feature_test, label_test)
    print(report_line.format(n_trees, test_accuracy))

    # Next three lines: macro-averaged F1, precision and recall, in that order.
    for metric in macro_metrics:
        print(metric(label_test, prediction, average="macro"))

Accuracy of Random Forest with estimators 50 on test set: 1.00
1.0
1.0
1.0
Accuracy of Random Forest with estimators 100 on test set: 1.00
1.0
1.0
1.0
Accuracy of Random Forest with estimators 200 on test set: 1.00
1.0
1.0
1.0


Berdasarkan hasil di atas, tidak ada perbedaan antara estimator 50, 100, dan 200. Hal ini dapat disebabkan oleh dataset yang kita gunakan sangat baik (kelas-kelasnya mudah dipisahkan) sehingga tidak terlihat bias sama sekali.

Tetapi bila dijalankan berulang-ulang, estimator 50 dapat menunjukkan perubahan accuracy menjadi sekitar 0.95, karena random forest adalah algoritma yang bersifat acak: setiap pohon dilatih pada sampel bootstrap dan subset fitur yang dipilih secara acak, dan pada kode di atas `random_state` tidak ditetapkan. Karena itu, untuk estimator 50 hasilnya masih bisa berubah meskipun tidak terlalu signifikan.

# **No. 2 menggunakan Kernel berbeda dalam SVC**

In [94]:
# Train one SVC per kernel type and report how each performs on the test split.
report_line = 'Accuracy of SVC with Kernel {} on test set: {:.2f}'

kernels = ["linear", "poly"]
for kernel_name in kernels:
    svc_classifier = SVC(kernel=kernel_name)
    svc_classifier.fit(feature_train, label_train)

    # Predicted labels for the held-out rows.
    prediction = svc_classifier.predict(feature_test)

    # Results: accuracy (via .score()) followed by macro-averaged
    # F1, precision and recall, one value per line.
    test_accuracy = svc_classifier.score(feature_test, label_test)
    print(report_line.format(kernel_name, test_accuracy))
    for metric in (f1_score, precision_score, recall_score):
        print(metric(label_test, prediction, average='macro'))

Accuracy of SVC with Kernel linear on test set: 0.96
0.9500891265597149
0.9629629629629629
0.9444444444444445
Accuracy of SVC with Kernel poly on test set: 0.96
0.952136752136752
0.9523809523809524
0.9583333333333334


In [95]:
# Fit a LinearSVC with its default settings for comparison with the SVC runs.
linearsvc_classifier = LinearSVC()
linearsvc_classifier.fit(feature_train, label_train)

# Predicted labels for the held-out rows.
prediction = linearsvc_classifier.predict(feature_test)

# Results: accuracy (via .score()) first, then macro-averaged
# F1, precision and recall, one value per line.
test_accuracy = linearsvc_classifier.score(feature_test, label_test)
print('Accuracy of LinearSVC on test set: {:.2f}'.format(test_accuracy))
for metric in (f1_score, precision_score, recall_score):
    print(metric(label_test, prediction, average='macro'))

Accuracy of LinearSVC on test set: 1.00
1.0
1.0
1.0


Hasil di atas menunjukkan bahwa akurasi LinearSVC lebih tinggi dibandingkan SVC dengan kernel linear atau poly.

Hal ini dapat terjadi karena data iris.csv itu sendiri tidak terlalu rumit/kompleks, fiturnya bisa dibilang sedikit, dan jumlah datanya juga terbilang sedikit. Maka, meskipun akurasinya masih terbilang tinggi, algoritma yang kompleks sangat tidak cocok untuk data ini yang hanya memiliki sedikit fitur.

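LinearSVC memang mirip dengan SVC dengan kernel=linear, tetapi hasilnya berbeda karena implementasinya berbeda: LinearSVC menggunakan library liblinear, sedangkan SVC dengan kernel=linear menggunakan library libsvm. Selain itu, pengaturan bawaannya juga berbeda; misalnya, LinearSVC memakai loss squared hinge dan skema one-vs-rest, sedangkan SVC memakai hinge loss dan skema one-vs-one untuk klasifikasi multikelas.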

# **No. 3 KNN dengan nilai k yang berbeda**

In [96]:
# Evaluate k-nearest-neighbours for every odd k from 1 through 15 on the
# current train/test split.
report_line = 'Accuracy of KNN with k={} on test set: {:.2f}'

k = list(range(1, 16, 2))  # 1, 3, 5, ..., 15
for n_neighbors in k:
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(feature_train, label_train)
    prediction = knn.predict(feature_test)

    # Accuracy line first, then macro-averaged F1, precision and recall.
    test_accuracy = knn.score(feature_test, label_test)
    print(report_line.format(n_neighbors, test_accuracy))
    for metric in (f1_score, precision_score, recall_score):
        print(metric(label_test, prediction, average="macro"))

Accuracy of KNN with k=1 on test set: 0.96
0.9500891265597149
0.9629629629629629
0.9444444444444445
Accuracy of KNN with k=3 on test set: 0.96
0.9500891265597149
0.9629629629629629
0.9444444444444445
Accuracy of KNN with k=5 on test set: 0.96
0.9500891265597149
0.9629629629629629
0.9444444444444445
Accuracy of KNN with k=7 on test set: 0.96
0.9500891265597149
0.9629629629629629
0.9444444444444445
Accuracy of KNN with k=9 on test set: 0.96
0.9500891265597149
0.9629629629629629
0.9444444444444445
Accuracy of KNN with k=11 on test set: 0.96
0.9500891265597149
0.9629629629629629
0.9444444444444445
Accuracy of KNN with k=13 on test set: 0.96
0.9500891265597149
0.9629629629629629
0.9444444444444445
Accuracy of KNN with k=15 on test set: 0.96
0.9500891265597149
0.9629629629629629
0.9444444444444445


Tidak ada perubahan untuk data ini kemungkinan karena pada pembagian data training dan testing dengan random_state=3 sebaran datanya mirip, sehingga hasilnya sama untuk semua nilai k. Oleh karena itu, saya definisikan ulang training set dan testing set dengan random_state=10 untuk melihat perubahannya, seperti di bawah ini:

In [98]:
# Re-split the scaled data, this time with random_state set to 10.
# This overwrites feature_train / feature_test / label_train / label_test, so
# any cell executed after this one works on the new split.
split_options = dict(test_size=0.15, random_state=10)
(feature_train, feature_test,
 label_train, label_test) = train_test_split(feature_scaled, label, **split_options)

In [99]:
# Repeat the KNN sweep over odd k (1 through 15) using whatever train/test
# split is currently held in feature_train / feature_test.
k = [1, 3, 5, 7, 9, 11, 13, 15]
macro_metrics = (f1_score, precision_score, recall_score)

for neighbours in k:
    knn = KNeighborsClassifier(neighbours)
    knn.fit(feature_train, label_train)
    prediction = knn.predict(feature_test)

    # Accuracy as reported by .score(), formatted to two decimals.
    summary = 'Accuracy of KNN with k={} on test set: {:.2f}'.format(
        neighbours, knn.score(feature_test, label_test)
    )
    print(summary)

    # Macro-averaged F1, precision and recall, one per line, in that order.
    for metric in macro_metrics:
        print(metric(label_test, prediction, average="macro"))

Accuracy of KNN with k=1 on test set: 0.96
0.9547511312217195
0.9523809523809524
0.9629629629629629
Accuracy of KNN with k=3 on test set: 0.96
0.9547511312217195
0.9523809523809524
0.9629629629629629
Accuracy of KNN with k=5 on test set: 0.96
0.9547511312217195
0.9523809523809524
0.9629629629629629
Accuracy of KNN with k=7 on test set: 0.96
0.9547511312217195
0.9523809523809524
0.9629629629629629
Accuracy of KNN with k=9 on test set: 0.91
0.9107142857142857
0.9166666666666666
0.9259259259259259
Accuracy of KNN with k=11 on test set: 0.91
0.9107142857142857
0.9166666666666666
0.9259259259259259
Accuracy of KNN with k=13 on test set: 0.96
0.9547511312217195
0.9523809523809524
0.9629629629629629
Accuracy of KNN with k=15 on test set: 1.00
1.0
1.0
1.0


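Maka dapat terlihat perubahannya: nilai k=15 paling baik dibandingkan nilai k lainnya pada hasil di atas, karena menghasilkan akurasi yang paling tinggi pada dataset tersebut. Perlu dicatat bahwa kesimpulan ini hanya berlaku untuk pembagian data dengan random_state=10; pada pembagian dengan random_state=3, semua nilai k memberikan hasil yang sama.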