In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')

### Importing Data

In [4]:
df = pd.read_csv('supervised_ml_homework_data.csv', delimiter = ',')

In [5]:
df

Unnamed: 0,initial_rate,length,width,height,speed,y
0,0.12,5.87,7.31,9.61,24.47,0
1,0.68,5.49,5.46,9.19,20.33,0
2,1.86,6.03,5.89,10.75,22.60,0
3,0.04,5.25,5.57,10.04,20.00,0
4,0.28,6.63,6.17,9.90,21.74,0
...,...,...,...,...,...,...
4995,2.00,3.97,8.12,10.11,28.39,0
4996,1.39,5.84,5.41,6.83,20.90,0
4997,-0.00,4.39,5.38,12.19,19.48,0
4998,0.09,5.29,6.38,11.75,22.08,0


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
df_train, df_test = train_test_split(df, 
                                     test_size = 0.25, random_state = 1) 

In [8]:
X_train_data = df_train.loc[:, df_train.columns!='y']
y_train_data = df_train.loc[:, df_train.columns=='y']

X_test_data = df_test.loc[:, df_test.columns!='y']
y_test_data = df_test.loc[:, df_test.columns=='y']

### Dealing With Imbalanced Data

In [9]:
# import random undersampling library 
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# summarize class distribution
print(Counter(y_train_data['y']))

# define undersample strategy
undersample = RandomUnderSampler(sampling_strategy='majority')

# fit and apply the transform
X_train_under, y_train_under = undersample.fit_resample(X_train_data, 
                                                        y_train_data)

# summarize class distribution
print(Counter(y_train_under['y']))

Counter({0: 3371, 1: 379})
Counter({0: 379, 1: 379})


### Soal

Lakukan KNN, Logistic Regression, dan Decision Tree pada dataset yang telah diberikan.

Jawablah:
1. Jika kita menganggap '1' sebagai 'positive', maka algoritma apa yang memiliki nilai F1-Score terbaik?
2. Pada algoritma 'knn', berapa 'n_neighbors' yang ideal jika kita ingin memaksimalkan nilai 'recall'? 
3. Jika kita menganggap '1' sebagai 'positive', maka algoritma apa yang memiliki nilai F1-Score terburuk?

In [10]:
### Silakan Mengerjakan

### Jawaban

In [11]:
def evaluate(model, x_test = X_test_data, y_test = y_test_data):
    y_pred = model.predict(x_test)
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('------')
    print('Confusion Matrix:')
    print()
    print(classification_report(y_test, y_pred))
    print('------')
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print('Assuming 1 as positive, we get:')
    print('')
    print('True Positive:', tp)
    print('True Negative:', tn)
    print('False Positive:', fp)
    print('False Negative:', fn)
    
    print('')
    
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1-Score:', 2 * (precision * recall)/(precision + recall))

### K-Nearest Neighbor

In [12]:
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train_under, y_train_under)

KNeighborsClassifier()

In [13]:
y_prediction = knn.predict(X_test_data)

In [14]:
y_prediction

array([0, 0, 0, ..., 0, 0, 1])

In [15]:
evaluate(knn)

Accuracy: 0.8752
------
Confusion Matrix:

              precision    recall  f1-score   support

           0       0.99      0.87      0.93      1130
           1       0.43      0.96      0.60       120

    accuracy                           0.88      1250
   macro avg       0.71      0.91      0.76      1250
weighted avg       0.94      0.88      0.89      1250

------
Assuming 1 as positive, we get:

True Positive: 115
True Negative: 979
False Positive: 151
False Negative: 5

Precision: 0.4323308270676692
Recall: 0.9583333333333334
F1-Score: 0.5958549222797928


i will try `n_neighbors` random value from 3 to 15 and see if we can improve the accuracy.

Let's try to find the `n_neighbors` that give us the best `recall`.

In [16]:
def evaluate_recall(model, x_test = X_test_data, y_test = y_test_data):
    y_pred = model.predict(x_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    
    return recall

In [17]:
recall_dict = {
    'n':[],
    'recall':[]
}
for n in [3,5,7,9,11,13,15,18,20,22,25,30,32,35,37,40]:
    knn = KNeighborsClassifier(n_neighbors = n)
    knn.fit(X_train_under, y_train_under)
    recall = evaluate_recall(knn)
    recall_dict['n'].append(n)
    recall_dict['recall'].append(recall)
    
recall_df = pd.DataFrame(recall_dict)

In [18]:
recall_df

Unnamed: 0,n,recall
0,3,0.916667
1,5,0.958333
2,7,0.958333
3,9,0.958333
4,11,0.958333
5,13,0.966667
6,15,0.958333
7,18,0.958333
8,20,0.958333
9,22,0.966667


### Logistic Regression

In [19]:
logreg = LogisticRegression()
logreg.fit(X_train_under, y_train_under)

LogisticRegression()

In [20]:
evaluate(logreg)

Accuracy: 0.8968
------
Confusion Matrix:

              precision    recall  f1-score   support

           0       1.00      0.89      0.94      1130
           1       0.48      0.97      0.64       120

    accuracy                           0.90      1250
   macro avg       0.74      0.93      0.79      1250
weighted avg       0.95      0.90      0.91      1250

------
Assuming 1 as positive, we get:

True Positive: 116
True Negative: 1005
False Positive: 125
False Negative: 4

Precision: 0.48132780082987553
Recall: 0.9666666666666667
F1-Score: 0.6426592797783934


### Decision Tree

In [21]:
dt = DecisionTreeClassifier()
dt.fit(X_train_under, y_train_under)
evaluate(dt)

Accuracy: 0.8888
------
Confusion Matrix:

              precision    recall  f1-score   support

           0       0.99      0.89      0.94      1130
           1       0.46      0.89      0.61       120

    accuracy                           0.89      1250
   macro avg       0.72      0.89      0.77      1250
weighted avg       0.94      0.89      0.90      1250

------
Assuming 1 as positive, we get:

True Positive: 107
True Negative: 1004
False Positive: 126
False Negative: 13

Precision: 0.4592274678111588
Recall: 0.8916666666666667
F1-Score: 0.6062322946175638


###Jawaban

1. F1-Score terbaik didapatkan oleh Logistic Regression
2. Pada KNN, n_neighbors 22, 25, 30, 32, 35, 37, 40 memungkinkan kita mendapat nilai recall tertinggi
3. F1-Score terburuk didapatkan oleh K-Nearest Neighbor 