In [1]:
import numpy as np
from sklearn.base import BaseEstimator

### 모든 여성에 대해 생존했다고 예측하는 클래스

In [2]:
class MyDummyClassifier(BaseEstimator):
    def fit(self, X, y):
        pass

    def predict(self, X):
        pred = np.zeros((X.shape[0], 1))
        for i in range(X.shape[0]):
            if X['sex'].iloc[i] == 0:
                pred[i, 0] == 1

        return pred

In [3]:
import pandas as pd
titanic = pd.read_csv('../00. data/titanic/train.csv')[['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
titanic['sex'] = np.nan
for i in titanic.index:
    titanic['sex'][i] = 1 if titanic['Sex'][i] == 'male' else 0
for i in titanic.index:
    titanic['Age'][i] = titanic['Age'][i] * 100 if titanic['Age'][i] < 1 else titanic['Age'][i]
titanic['Age'] = titanic['Age'].fillna(round(titanic['Age'].mean(), 0))
titanic['Embarked'] = titanic['Embarked'].fillna('C')
for i in titanic.index:
    if titanic.Embarked[i] == 'S':
        titanic.Embarked[i] = 0
    elif titanic.Embarked[i] == 'C':
        titanic.Embarked[i] = 1
    else:
        titanic.Embarked[i] = 2
del titanic['PassengerId']
del titanic['Sex']
t_df = pd.get_dummies(titanic)
t_df_label = titanic['Survived']
t_df_data = titanic.drop('Survived', axis=1)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(t_df_data, t_df_label, test_size=0.2, random_state=0)
my_clf = MyDummyClassifier()
my_clf.fit(X_train, y_train)
my_pred = my_clf.predict(X_test)

In [5]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, my_pred)

0.6145251396648045

### MNIST 손글씨

In [8]:
class MyFakeClassifier(BaseEstimator):
    def fit(self, X, y):
        pass
    
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)

In [6]:
from sklearn.datasets import load_digits

digits = load_digits()

y = (digits.target == 7).astype(int)
X_train, X_test, y_train, y_test = train_test_split(digits.data, y, random_state=11)

In [7]:
print('레이블 테스트 세트 크기:', y_test.shape)
print('테스트 세트 레이블 0과 1의 분포도')
print(pd.Series(y_test).value_counts())

레이블 테스트 세트 크기: (450,)
테스트 세트 레이블 0과 1의 분포도
0    405
1     45
dtype: int64


In [9]:
fakeclf = MyFakeClassifier()
fakeclf.fit(X_train, y_train)
fakepred = fakeclf.predict(X_test)
score = accuracy_score(y_test, fakepred)
print(f'모든 예측을 0으로 하여도 정확도는 {score:.3f}')

모든 예측을 0으로 하여도 정확도는 0.900


### 오차 행렬(Confusion Matrix)

In [10]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, fakepred)

array([[405,   0],
       [ 45,   0]], dtype=int64)

### 정밀도(precision)와 재현율(recall)

In [12]:
from sklearn.metrics import precision_score, recall_score

def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    print('오차 행렬')
    print(confusion)
    print(f'정확도: {accuracy:.4f}, 정밀도: {precision:.4f}, 재현율: {recall:.4f}, F1 스코어: {f1:.4f}')

In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

def drop_features(df):
    df.drop(['PassengerId','Name','Ticket'], axis=1, inplace=True)
    return df

def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        df[feature] = le.fit_transform(df[feature])
    return df

def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df

In [19]:
titanic_df = pd.read_csv('../00. data/titanic/train.csv') 
y_titanic_df = titanic_df['Survived'] 
X_titanic_df = titanic_df.drop('Survived', axis=1) 
X_titanic_df = transform_features(X_titanic_df) 
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11)

In [21]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)
get_clf_eval(y_test, pred)

오차 행렬
[[104  14]
 [ 13  48]]
정확도: 0.8492, 정밀도: 0.7742, 재현율: 0.7869


### Precision/Recall Trade-off

In [22]:
pred_proba = lr_clf.predict_proba(X_test)
pred_proba[:10, :]

array([[0.4616653 , 0.5383347 ],
       [0.87862763, 0.12137237],
       [0.87727002, 0.12272998],
       [0.88283621, 0.11716379],
       [0.85508952, 0.14491048],
       [0.88231157, 0.11768843],
       [0.88838988, 0.11161012],
       [0.20916926, 0.79083074],
       [0.78258628, 0.21741372],
       [0.36993909, 0.63006091]])

In [23]:
pred[:10]

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=int64)

### F1 스코어

In [24]:
from sklearn.metrics import f1_score

f1 = f1_score(y_test, pred)
print(f'F1 스코어: {f1:.4f}')

F1 스코어: 0.7805


### ROC AUC 스코어

In [25]:
from sklearn.metrics import roc_auc_score

roc_auc = roc_auc_score(y_test, pred)
print(f'ROC AUC 스코어: {roc_auc:.4f}')

TypeError: Singleton array array(0.7804878) cannot be considered a valid collection.