<a href="https://colab.research.google.com/github/Seungkyu-Han/colab_ml/blob/main/accuracy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Accuracy (정확도)

In [19]:
import numpy as np
from sklearn.base import BaseEstimator

class MyDummyClassifier(BaseEstimator):

  def fit(self, X, y = None):
    pass

  def predict(self, X):
    pred = np.zeros((X.shape[0], 1))
    for i in range(X.shape[0]):
      if X['Sex'].iloc[i] == 1:
        pred[i] = 0
      else:
        pred[i] = 1

    return pred

In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def fillna(df):
  df['Age'] = df['Age'].fillna(df['Age'].mean())
  df['Cabin'] = df['Cabin'].fillna('N')
  df['Embarked'] = df['Embarked'].fillna('N')
  df['Fare'] = df['Fare'].fillna(0)
  return df

def drop_features(df):
  df = df.drop(['PassengerId', 'Name', 'Ticket'], axis = 1)
  return df

def format_features(df):
  df['Cabin'] = df['Cabin'].str[1:]
  features = ['Cabin', 'Sex', 'Embarked']
  for feature in features:
    le = LabelEncoder()
    le = le.fit(df[feature])
    df[feature] = le.transform(df[feature])

  return df

def transform_features(df):
  df = fillna(df)
  df = drop_features(df)
  df = format_features(df)
  return df

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

titanic_df = pd.read_csv('train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis = 1)
X_titanic_df = transform_features(X_titanic_df)
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size = 0.2, random_state = 0
)

myclf = MyDummyClassifier()
myclf.fit(X_train, y_train)

mypredictions = myclf.predict(X_test)
print('Dummy Classifier의 정확도는: {0:.4f}'.format(accuracy_score(y_test, mypredictions)))


Dummy Classifier의 정확도는: 0.7877


In [22]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score

import numpy as np
import pandas as pd

class MyFakeClassifier(BaseEstimator):
  def fit(self, X, y):
    pass

  def predict(self, X):
    return np.zeros((len(X), 1), dtype=bool)

digits = load_digits()

print(digits.data)
print('### digits.data.shape:', digits.data.shape)
print(digits.target)
print('### digits.target.shape:', digits.target.shape)

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]
### digits.data.shape: (1797, 64)
[0 1 2 ... 8 9 8]
### digits.target.shape: (1797,)


In [23]:
y = (digits.target == 7).astype(int)
X_train, X_test, y_train, y_test = train_test_split(digits.data, y, random_state=11)

In [24]:
print('레이블 테스트 세트 크기: ', y_test.shape)
print('테스트 세트 레이블 0과 1의 분포도')
print(pd.Series(y_test).value_counts())

fakeclf = MyFakeClassifier()
fakeclf.fit(X_train, y_train)
fakepred = fakeclf.predict(X_test)
print('모든 예측을 0으로 하여도 정확도는: ', accuracy_score(y_test, fakepred))

레이블 테스트 세트 크기:  (450,)
테스트 세트 레이블 0과 1의 분포도
0    405
1     45
Name: count, dtype: int64
모든 예측을 0으로 하여도 정확도는:  0.9


In [25]:
print(y.sum())

179


### Confusion Matrix

In [26]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, fakepred)

array([[405,   0],
       [ 45,   0]])

### 정밀도와 재현율

In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("정밀도: ", precision_score(y_test, fakepred))
print("재현율: ", recall_score(y_test, fakepred))

정밀도:  0.0
재현율:  0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Precision Recall trade off

In [28]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

titanic_df = pd.read_csv('train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis = 1)
X_titanic_df = transform_features(X_titanic_df)

X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size=0.2, random_state=11
)

lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)

In [31]:
pred_proba = lr_clf.predict_proba(X_test)
print('pred_proba() 결과 Shape: {0}'.format(pred_proba.shape))
print('pred_proba에서 3개만 추출', pred_proba[:3])

pred_proba() 결과 Shape: (179, 2)
pred_proba에서 3개만 추출 [[0.46770504 0.53229496]
 [0.8805795  0.1194205 ]
 [0.87814077 0.12185923]]


In [33]:
pred_proba_result = np.concatenate([pred_proba, pred.reshape(-1, 1)], axis=1)

print(pred_proba_result[:3])

[[0.46770504 0.53229496 1.        ]
 [0.8805795  0.1194205  0.        ]
 [0.87814077 0.12185923 0.        ]]


### Binarizer 활용

In [34]:
from sklearn.preprocessing import Binarizer

X = [
    [1, -1, 2],
    [2, 0, 0],
    [0, 1.1, 1.2]
]

binarizer = Binarizer(threshold=1.1)
print(binarizer.fit_transform(X))

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]


In [37]:
# 임계값을 사용해 예측값 반환


custom_threshold = 0.5

pred_proba_1 = pred_proba[:1].reshape(-1, 1)

binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_1)
custom_predict = binarizer.transform(pred_proba_1)

print(custom_predict)

[[0.]
 [1.]]


In [38]:
from sklearn.metrics import accuracy_score, precision_score , recall_score , confusion_matrix

def get_clf_eval(y_test , pred):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    print('오차 행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}'.format(accuracy , precision ,recall))

In [39]:
thresholds = [0.4, 0.45, 0.5, 0.55, 0.6]

def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
  for custom_threshold in thresholds:
    binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
    custom_predict = binarizer.transform(pred_proba_c1)
    get_clf_eval(y_test, custom_predict)

get_eval_by_threshold(y_test, pred_proba[:, 1].reshape(-1, 1), thresholds)

오차 행렬
[[98 20]
 [12 49]]
정확도: 0.8212, 정밀도: 0.7101, 재현율: 0.8033
오차 행렬
[[103  15]
 [ 12  49]]
정확도: 0.8492, 정밀도: 0.7656, 재현율: 0.8033
오차 행렬
[[108  10]
 [ 14  47]]
정확도: 0.8659, 정밀도: 0.8246, 재현율: 0.7705
오차 행렬
[[110   8]
 [ 15  46]]
정확도: 0.8715, 정밀도: 0.8519, 재현율: 0.7541
오차 행렬
[[112   6]
 [ 17  44]]
정확도: 0.8715, 정밀도: 0.8800, 재현율: 0.7213


In [42]:
from sklearn.metrics import precision_recall_curve

pred_proba_class1 = lr_clf.predict_proba(X_test)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba_class1)

print(thresholds.shape)
print(precisions.shape)
print(recalls.shape)

(165,)
(166,)
(166,)


### F1 Score

In [43]:
from sklearn.metrics import f1_score

f1 = f1_score(y_test, pred)

print("F1 스코어: {0:.4f}".format(f1))

F1 스코어: 0.7966


In [45]:
from sklearn.metrics import accuracy_score, precision_score , recall_score , confusion_matrix

def get_clf_eval(y_test , pred):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)

    f1 = f1_score(y_test, pred)
    print('오차 행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3: .4f}'.format(accuracy , precision ,recall, f1))

thresholds = [0.4, 0.45, 0.5, 0.55, 0.6]
pred_proba = lr_clf.predict_proba(X_test)
get_eval_by_threshold(y_test, pred_proba[:,1].reshape(-1,1), thresholds)

오차 행렬
[[98 20]
 [12 49]]
정확도: 0.8212, 정밀도: 0.7101, 재현율: 0.8033, F1:  0.7538
오차 행렬
[[103  15]
 [ 12  49]]
정확도: 0.8492, 정밀도: 0.7656, 재현율: 0.8033, F1:  0.7840
오차 행렬
[[108  10]
 [ 14  47]]
정확도: 0.8659, 정밀도: 0.8246, 재현율: 0.7705, F1:  0.7966
오차 행렬
[[110   8]
 [ 15  46]]
정확도: 0.8715, 정밀도: 0.8519, 재현율: 0.7541, F1:  0.8000
오차 행렬
[[112   6]
 [ 17  44]]
정확도: 0.8715, 정밀도: 0.8800, 재현율: 0.7213, F1:  0.7928


### ROC Curve와 AUC

In [51]:
from sklearn.metrics import roc_curve
pred_proba_class1 = lr_clf.predict_proba(X_test)[:, 1]

fprs, tprs, thresholds = roc_curve(y_test, pred_proba_class1)

thr_index = np.arange(1, thresholds.shape[0], 5)

print('샘플을 추출하기 위한 임곗값의 배열: ', thr_index)
print('샘플 index로 추출한 임곗값: ', np.round(thresholds[thr_index], 2))

print('샘플 임곗값별 FPR: ', np.round(fprs[thr_index], 3))
print('샘플 임곗값별 TPR: ', np.round(tprs[thr_index], 3))

샘플을 추출하기 위한 임곗값의 배열:  [ 1  6 11 16 21 26 31 36 41]
샘플 index로 추출한 임곗값:  [0.96 0.65 0.6  0.49 0.38 0.34 0.13 0.12 0.11]
샘플 임곗값별 FPR:  [0.    0.017 0.051 0.102 0.186 0.203 0.619 0.661 0.797]
샘플 임곗값별 TPR:  [0.033 0.656 0.721 0.787 0.836 0.902 0.902 0.967 0.984]


In [52]:
print(thresholds)

[       inf 0.95613009 0.89689672 0.89688814 0.76870551 0.74903889
 0.64756857 0.62916073 0.62580429 0.62577114 0.61277048 0.60286125
 0.57859181 0.56339954 0.53229496 0.4944692  0.49392712 0.48287347
 0.45335139 0.4079414  0.38448715 0.37695163 0.36806119 0.36657112
 0.36371156 0.3558598  0.33997505 0.25488941 0.25272453 0.14879696
 0.14879634 0.12573602 0.12442465 0.12187126 0.12186132 0.12174954
 0.1194205  0.11085484 0.11078555 0.11004311 0.10995133 0.1092364
 0.10260473 0.10260024 0.01189434]
