In [81]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression

In [50]:
class DummyClassifier(BaseEstimator) :

    def fit(self, x, y=None) :
        pass

    def predict(self, x) :
        pred = np.zeros( (x.shape[0], 1))
        for i in range(x.shape[0]) :
            if x['Sex'].iloc[i] == 1 :
                pred[i] = 0
            else :
                pred[i] = 1
        return pred

In [52]:
# Modify code for not defined values
def fill_na(df) :
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

# Delete code that is unnecessary code
def drop_features(df) :
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

# Execute label encoding
def format_features(df) :
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features :
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

# Call the function
def transform_features(df) :
    df = fill_na(df)
    df = drop_features(df)
    df = format_features(df)
    return df

In [53]:
# Data reload from raw data to meta data
titan_df = pd.read_csv('../1장/titanic/train.csv')
y_titan_df = titan_df['Survived']
x_titan_df = titan_df.drop('Survived', axis=1)
x_titan_df = transform_features(x_titan_df)
x_train, x_test, y_train, y_test = train_test_split(x_titan_df, y_titan_df, test_size=0.2)

clf = DummyClassifier()
clf.fit(x_train, y_train)

predictions = clf.predict(x_test)
print('Accuracy score : {0}'.format(accuracy_score(y_test, predictions)))

Accuracy score : 0.8156424581005587


In [54]:
class FakeClassifier(BaseEstimator) :

    def fit(self, x, y) :
        pass

    def predict(self, x) :
        return np.zeros( (len(x) ,1 ) , dtype=bool )

digits = load_digits()
print(digits.data)
print(digits.target)
print(digits.feature_names)

# Find image that is number 7
digits.target == 7

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]
[0 1 2 ... 8 9 8]
['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4', 'pixel_0_5', 'pixel_0_6', 'pixel_0_7', 'pixel_1_0', 'pixel_1_1', 'pixel_1_2', 'pixel_1_3', 'pixel_1_4', 'pixel_1_5', 'pixel_1_6', 'pixel_1_7', 'pixel_2_0', 'pixel_2_1', 'pixel_2_2', 'pixel_2_3', 'pixel_2_4', 'pixel_2_5', 'pixel_2_6', 'pixel_2_7', 'pixel_3_0', 'pixel_3_1', 'pixel_3_2', 'pixel_3_3', 'pixel_3_4', 'pixel_3_5', 'pixel_3_6', 'pixel_3_7', 'pixel_4_0', 'pixel_4_1', 'pixel_4_2', 'pixel_4_3', 'pixel_4_4', 'pixel_4_5', 'pixel_4_6', 'pixel_4_7', 'pixel_5_0', 'pixel_5_1', 'pixel_5_2', 'pixel_5_3', 'pixel_5_4', 'pixel_5_5', 'pixel_5_6', 'pixel_5_7', 'pixel_6_0', 'pixel_6_1', 'pixel_6_2', 'pixel_6_3', 'pixel_6_4', 'pixel_6_5', 'pixel_6_6', 'pixel_6_7', 'pixel_7_0', 'pixel_7_1', 'pixel_7_2', 'pixel_7_3', 'pixel_7_

array([False, False, False, ..., False, False, False])

In [55]:
# Change type from boolean to integer
y = (digits.target == 7).astype(int)
x_train, x_test, y_train, y_test = train_test_split(digits.data, y)

print(y_test.shape)
print(pd.Series(y_test).value_counts())

clf = FakeClassifier()
clf.fit(x_train, y_train)
pred = clf.predict(x_test)

print('Accuracy score : {0}'.format(accuracy_score(y_test, pred)))

(450,)
0    410
1     40
dtype: int64
Accuracy score : 0.9111111111111111


In [56]:
# Confusion matrix (FN, TN, FP, TP)
confusion_matrix(y_test, pred)

array([[410,   0],
       [ 40,   0]], dtype=int64)

In [57]:
# Precision, Recall
print('Precision : ', precision_score(y_test, pred))
print('Recall : ', recall_score(y_test, pred))

Precision :  0.0
Recall :  0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [58]:
def get_clf_eval(y_test, pred) :
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    print(confusion)
    print('Accuracy : {0}, Precision : {1} , Recall : {2}'.format(accuracy, precision, recall))

In [60]:
# Logistic regression

x_train, x_test, y_train, y_test = train_test_split(x_titan_df, y_titan_df, test_size=0.2)

clf = LogisticRegression()
clf.fit(x_train, y_train)
pred = clf.predict(x_test)
get_clf_eval(y_test, pred)

[[87 23]
 [16 53]]
Accuracy : 0.7821229050279329, Precision : 0.6973684210526315 , Recall : 0.7681159420289855


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [74]:
# Precision / Recall : Trade-off
pred_prob = clf.predict_proba(x_test)
pred = clf.predict(x_test)
print('pred proba shape : ', pred_prob.shape)
# [ 0 (negative ratio) 1 (positive ratio) ]
print(pred_prob[:3])
print('pred shape : ', pred.shape)

# Combine the two arrays into columns
pred_prob_result = np.concatenate([pred_prob, pred.reshape(-1,1)], axis=1)
print(pred_prob_result[:3])

pred proba shape :  (179, 2)
[[0.20732631 0.79267369]
 [0.89201768 0.10798232]
 [0.91374176 0.08625824]]
pred shape :  (179,)
[[0.20732631 0.79267369 1.        ]
 [0.89201768 0.10798232 0.        ]
 [0.91374176 0.08625824 0.        ]]


In [80]:
# Binarizer

x = [[1, -1, 2],
     [2, 0, 0],
     [0, 1.1, 1.2]]

# Threshold returns 0 if equal to or less than the reference value and 1 if greater
binarizer = Binarizer(threshold=1.1)
print(binarizer.fit_transform(x))

# Standard threshold is 0.5
custom_threshold = 0.5
pred_prob_nd = pred_prob[:, 1].reshape(-1, 1)
binarizer = Binarizer(threshold=custom_threshold).fit(pred_prob_nd)
custom_pred = binarizer.transform(pred_prob_nd)

get_clf_eval(y_test, custom_pred)

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]
[[87 23]
 [16 53]]
Accuracy : 0.7821229050279329, Precision : 0.6973684210526315 , Recall : 0.7681159420289855


In [86]:
thresholds = [0.4, 0.45, 0.5, 0.55, 0.6]

def get_eval_by_threshold(y_test, pred, thresholds) :
    for custom_threshold in thresholds :
        binarizer = Binarizer(threshold=custom_threshold).fit(pred)
        custom_pred = binarizer.transform(pred)
        print('Threshold : ', custom_threshold)
        get_clf_eval(y_test, custom_pred)

get_eval_by_threshold(y_test, pred_prob[:, 1].reshape(-1, 1), thresholds)


Threshold :  0.4
[[84 26]
 [15 54]]
Accuracy : 0.770949720670391, Precision : 0.675 , Recall : 0.782608695652174
Threshold :  0.45
[[84 26]
 [15 54]]
Accuracy : 0.770949720670391, Precision : 0.675 , Recall : 0.782608695652174
Threshold :  0.5
[[87 23]
 [16 53]]
Accuracy : 0.7821229050279329, Precision : 0.6973684210526315 , Recall : 0.7681159420289855
Threshold :  0.55
[[90 20]
 [16 53]]
Accuracy : 0.7988826815642458, Precision : 0.726027397260274 , Recall : 0.7681159420289855
Threshold :  0.6
[[94 16]
 [20 49]]
Accuracy : 0.7988826815642458, Precision : 0.7538461538461538 , Recall : 0.7101449275362319


In [97]:
pred_prob_class = clf.predict_proba(x_test)[:, 1]
# print(pred_prob_class.reshape(-1,1))

precision, recalls, thresholds = precision_recall_curve(y_test, pred_prob_class)
print(thresholds.shape)
print(precision.shape)
print(recalls.shape)

print(thresholds[:5])
print(precision[:5])
print(recalls[:5])


(162,)
(163,)
(163,)
[0.07476971 0.07479386 0.07566252 0.08131544 0.08625824]
[0.40350877 0.4        0.40236686 0.4047619  0.40718563]
[1.         0.98550725 0.98550725 0.98550725 0.98550725]
