## Accuracy(정확도)

In [6]:
import numpy as np
from sklearn.base import BaseEstimator

class MyDummyClassifier(BaseEstimator):
    """Rule-based baseline that predicts from the 'Sex' column alone.

    Nothing is learned from the training data: rows whose 'Sex' value
    equals 1 are predicted as 0 and every other row is predicted as 1.
    """

    def fit(self, X, y=None):
        """Accept training data without learning anything.

        Returns:
            The estimator itself, following the scikit-learn convention
            that ``fit`` returns ``self`` so calls can be chained.
        """
        return self

    def predict(self, X):
        """Predict 0 where ``X['Sex'] == 1`` and 1 elsewhere.

        Args:
            X: Table-like object exposing a 'Sex' column via ``X['Sex']``
                (e.g. a pandas DataFrame).

        Returns:
            numpy.ndarray: float array of shape ``(n_rows, 1)`` holding
            the 0/1 predictions.
        """
        # One vectorised comparison replaces the per-row ``.iloc`` loop.
        sex_is_one = np.asarray(X['Sex'] == 1)
        return np.where(sex_is_one, 0.0, 1.0).reshape(-1, 1)

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder 

def fillna(df):
    """Fill missing values in the 'Age', 'Cabin' and 'Embarked' columns.

    'Age' gaps are replaced with the column mean; 'Cabin' and 'Embarked'
    gaps are replaced with the placeholder string 'N'.

    Args:
        df: DataFrame containing the 'Age', 'Cabin' and 'Embarked' columns.

    Returns:
        The same DataFrame object, with the three columns updated.
    """
    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: that chained in-place form
    # operates on a temporary Series and does not update ``df`` when pandas
    # Copy-on-Write is active (it also emits a FutureWarning on pandas 2.x).
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    return df

def drop_features(df):
    """Remove the 'PassengerId', 'Name' and 'Ticket' columns in place.

    Args:
        df: DataFrame that contains all three columns.

    Returns:
        The same DataFrame object, without those columns.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

def encode_features(df):
    """Label-encode the 'Cabin', 'Sex' and 'Embarked' columns in place.

    'Cabin' is first reduced to its first character; each of the three
    columns is then replaced by the integer codes produced by a fresh
    ``LabelEncoder``.

    Args:
        df: DataFrame containing the 'Cabin', 'Sex' and 'Embarked' columns.

    Returns:
        The same DataFrame object, with the three columns encoded.
    """
    # Only the leading character of the cabin value is kept before encoding.
    df['Cabin'] = df['Cabin'].str.slice(stop=1)
    for column in ('Cabin', 'Sex', 'Embarked'):
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

def transform_features(df):
    """Run the full preprocessing pipeline on ``df``.

    The steps are applied in order: ``fillna``, then ``drop_features``,
    then ``encode_features``.

    Args:
        df: Raw feature DataFrame.

    Returns:
        The DataFrame returned by the final step, ``encode_features``.
    """
    return encode_features(drop_features(fillna(df)))

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Titanic training CSV and split the 'Survived' label column off from the features.
titanic_df = pd.read_csv('./titanic_train.csv')
Y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis=1, inplace=False)
# Apply the preprocessing pipeline (transform_features) to the feature frame.
X_titanic_df = transform_features(X_titanic_df)
# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(X_titanic_df, Y_titanic_df, test_size=0.2, random_state=11)

# Fit and score the 'Sex'-rule dummy classifier on the held-out rows.
myclf = MyDummyClassifier()
myclf.fit(X_train, Y_train)
mypredictions = myclf.predict(X_test)
print(f"더미 클래스파이어의 정확도는 {accuracy_score(Y_test, mypredictions):.4f}")

더미 클래스파이어의 정확도는 0.8324


In [18]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

class MyFakeClassifier(BaseEstimator):
    """Baseline that ignores its input and predicts ``False`` for every sample."""

    def fit(self, X, Y=None):
        """Accept training data without learning anything.

        ``Y`` is optional because it is never used.

        Returns:
            The estimator itself, following the scikit-learn convention
            that ``fit`` returns ``self`` so calls can be chained.
        """
        return self

    def predict(self, X):
        """Return an all-``False`` prediction for each row of ``X``.

        Args:
            X: Any sized collection of samples; only ``len(X)`` is used.

        Returns:
            numpy.ndarray: boolean array of shape ``(len(X), 1)`` filled
            with ``False``.
        """
        return np.zeros((len(X), 1), dtype=bool)

# Load the digits dataset, then print its feature matrix and label vector
# together with their shapes.
digits = load_digits()
print(digits.data)
print(f"\n### digits.data.shape: {digits.data.shape}\n")
print(digits.target)
print(f"\n### digits.target.shape: {digits.target.shape}\n")

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]

### digits.data.shape: (1797, 64)

[0 1 2 ... 8 9 8]

### digits.target.shape: (1797,)



In [22]:
# Binary target: 1 where the digit label is 7, 0 otherwise.
y = (digits.target == 7).astype(int)
# No test_size is given, so train_test_split's default split is used; random_state fixes it.
X_train, X_test, Y_train, Y_test = train_test_split(digits.data, y, random_state=11)

In [26]:
# Check the distribution of the imbalanced label data.
print(f"레이블 테스트 세트 크기: {Y_test.shape}")
print(f"레이블 테스트 0과 1의 분포도\n")
print(pd.Series(Y_test).value_counts())

# Score a classifier that predicts the negative class for every sample.
fakeclf = MyFakeClassifier()
fakeclf.fit(X_train, Y_train)
fakepred = fakeclf.predict(X_test)
print(f"\n모든 예측을 0으로 하여도 정확도는: {accuracy_score(Y_test, fakepred)}")

레이블 테스트 세트 크기: (450,)
레이블 테스트 0과 1의 분포도

0    405
1     45
dtype: int64

모든 예측을 0으로 하여도 정확도는: 0.9


## 정밀도(Precision)와 재현율(Recall)

In [30]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [29]:
# Precision and recall of the all-negative predictions (fakepred) against the test labels.
print(f"정밀도: {precision_score(Y_test, fakepred)}")
print(f"재현율: {recall_score(Y_test, fakepred)}")

정밀도: 0.0
재현율: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


### 오차행렬, 정확도, 정밀도, 재현율을 한꺼번에 계산하는 함수 생성

In [31]:
def get_clf_eval(Y_test, pred):
    """Print the confusion matrix, accuracy, precision and recall.

    Args:
        Y_test: True labels.
        pred: Predicted labels to compare against ``Y_test``.

    Returns:
        None. The metrics are written to standard output.
    """
    confusion = confusion_matrix(Y_test, pred)
    # The three scalar metrics share one call signature, so evaluate them in
    # a single pass (accuracy, then precision, then recall).
    accuracy, precision, recall = (
        metric(Y_test, pred)
        for metric in (accuracy_score, precision_score, recall_score)
    )
    print("오차 행렬")
    print(confusion)
    print(f"정확도: {accuracy:.4f}, 정밀도: {precision:.4f} 재현율: {recall:.4f}")

In [None]:
titanic_df = pd.read_csv('./titanic_train.csv')
Y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis=1, inplace=False)
X_titanic_df = transform_features(X_titanic_df)
X_train, X_test, Y_train, Y_test = train_test_split(X_titanic_df, Y_titanic_df, test_size=0.2, random_state=11)

lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, Y_train)
pred = lr_clf.