In [56]:
# 2023 OCT 30

In [57]:
import sklearn
import numpy as np
import pandas as pd

# prepare data

## example #1: titanic dataset

* functions for pre-processing

In [58]:
from sklearn.preprocessing import LabelEncoder

# drop
def drop_features(df, features):
    """Remove the given columns from ``df`` in place.

    Args:
        df: DataFrame to modify; it is mutated and nothing is returned.
        features: Column label, or list of column labels, to remove.
    """
    df.drop(columns=features, inplace=True)
    
# encode
def encode_features(df, features):
    """Overwrite each listed column of ``df`` with its label-encoded values.

    The encoder is re-fitted on every column's own values before that column
    is replaced. ``df`` is modified in place and nothing is returned.

    Args:
        df: DataFrame to modify.
        features: Iterable of column names to encode.
    """
    label_encoder = LabelEncoder()
    for column in features:
        # fit_transform learns this column's classes and encodes it in one call.
        df[column] = label_encoder.fit_transform(df[column])
        
# null
def fill_nulls(df, instructions):
    """Fill missing values in selected columns of ``df`` in place.

    Args:
        df: DataFrame to modify; nothing is returned.
        instructions: Mapping of column name -> fill method. Supported
            methods are ``"N"`` (fill with the literal string ``"N"``) and
            ``"mean"`` (fill with the column's mean). Any other method
            prints a notice and leaves that column untouched.
    """
    for feature, method in instructions.items():
        if method == "N":
            fill_value = "N"
        elif method == "mean":
            fill_value = df[feature].mean()
        else:
            print("<!> unknown method")
            continue
        # Assign the filled column back instead of calling
        # ``df[feature].fillna(..., inplace=True)``: that form operates on a
        # temporary column selection, which pandas warns about and which
        # does not update ``df`` once Copy-on-Write is enabled.
        df[feature] = df[feature].fillna(fill_value)
            
# combined
def preprocess_features(df, features_to_drop, instructions_to_fill_null, features_to_encode):
    """Run the full pre-processing sequence on ``df`` in place.

    Steps, in order: drop columns, fill missing values, print how many
    nulls remain, then label-encode the requested columns.

    Args:
        df: DataFrame to modify; nothing is returned.
        features_to_drop: Columns passed to ``drop_features``.
        instructions_to_fill_null: Mapping passed to ``fill_nulls``.
        features_to_encode: Columns passed to ``encode_features``.
    """
    drop_features(df, features_to_drop)
    fill_nulls(df, instructions_to_fill_null)

    # Report the leftover null count before the encoding step runs.
    remaining_nulls = df.isnull().sum().sum()
    print("number of nulls in dataset:", remaining_nulls)

    encode_features(df, features_to_encode)

* load, extract, and pre-process data

In [59]:
# Load the Titanic training CSV and replace its header with short column names.
feature_names = [
    "PaxID", "Survived", "Class", "Name", "Sex", "Age",
    "SibSpo", "ParChi", "Ticket", "Fare", "Cabin", "PortEmba",
]
titanic_df = pd.read_csv("../data/titanic_train.csv")
titanic_df.columns = feature_names

# Split into feature matrix X (a new frame without the target) and target y.
X = titanic_df.drop(columns="Survived")
y = titanic_df["Survived"]

# Pre-processing plan: which columns to drop, how to fill nulls, what to encode.
features_to_drop = ["PaxID", "Name", "Ticket"]
instructions_to_fill_null = {"Age": "mean", "Cabin": "N", "PortEmba": "N"}
features_to_encode = ["Cabin", "Sex", "PortEmba"]

# X is modified in place by the helper.
preprocess_features(
    X,
    features_to_drop,
    instructions_to_fill_null,
    features_to_encode,
)

display(X)

number of nulls in dataset: 0


Unnamed: 0,Class,Sex,Age,SibSpo,ParChi,Fare,Cabin,PortEmba
0,3,1,22.000000,1,0,7.2500,146,3
1,1,0,38.000000,1,0,71.2833,81,0
2,3,0,26.000000,0,0,7.9250,146,3
3,1,0,35.000000,1,0,53.1000,55,3
4,3,1,35.000000,0,0,8.0500,146,3
...,...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0,13.0000,146,3
887,1,0,19.000000,0,0,30.0000,30,3
888,3,0,29.699118,1,2,23.4500,146,3
889,1,1,26.000000,0,0,30.0000,60,0


## example #2: handwritten digits dataset (scikit-learn `load_digits`, an MNIST-like set of 8x8 images)

* load, extract, and pre-process data

In [60]:
# read & load
# load_digits() returns a dict-like bundle; the printed output below shows
# keys such as 'data', 'target', 'frame' and 'feature_names'.
from sklearn.datasets import load_digits
digits = load_digits()

# Dump the whole bundle to inspect what it contains.
print(digits)

{'data': array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]]), 'target': array([0, 1, 2, ..., 8, 9, 8]), 'frame': None, 'feature_names': ['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4', 'pixel_0_5', 'pixel_0_6', 'pixel_0_7', 'pixel_1_0', 'pixel_1_1', 'pixel_1_2', 'pixel_1_3', 'pixel_1_4', 'pixel_1_5', 'pixel_1_6', 'pixel_1_7', 'pixel_2_0', 'pixel_2_1', 'pixel_2_2', 'pixel_2_3', 'pixel_2_4', 'pixel_2_5', 'pixel_2_6', 'pixel_2_7', 'pixel_3_0', 'pixel_3_1', 'pixel_3_2', 'pixel_3_3', 'pixel_3_4', 'pixel_3_5', 'pixel_3_6', 'pixel_3_7', 'pixel_4_0', 'pixel_4_1', 'pixel_4_2', 'pixel_4_3', 'pixel_4_4', 'pixel_4_5', 'pixel_4_6', 'pixel_4_7', 'pixel_5_0', 'pixel_5_1', 'pixel_5_2', 'pixel_5_3', 'pixel_5_4', 'pixel_5_5', 'pixel_5_6', 'pixel_5_7', 'pixel_6_0', '

# train, predict, evaluation

* load metrics

In [61]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score

## example #1: titanic dataset

* design a dummy classifier to simulate accuracy metric

In [62]:
from sklearn.base import BaseEstimator

class My_Dummy_Classifier(BaseEstimator):
    """Rule-based baseline that predicts survival from the ``Sex`` column only.

    Nothing is learned from the training data. The rule is:
    male (``Sex`` is 1) -> not survived (0); female (``Sex`` is 0) -> survived (1).
    """

    def fit(self, X, y=None):
        """Accept training data without learning anything.

        Returns:
            ``self``, following the scikit-learn estimator convention, so
            that calls such as ``clf.fit(X, y).predict(X)`` can be chained.
        """
        return self

    def predict(self, X):
        """Predict 0 where ``Sex`` equals 1 and 1 for every other row.

        Args:
            X: DataFrame containing a ``Sex`` column.

        Returns:
            Float array of shape ``(len(X), 1)`` holding 0.0 or 1.0 per row.
        """
        # Vectorised comparison replaces the per-row .iloc lookup loop.
        is_male = X["Sex"].to_numpy() == 1
        # Keep the (N, 1) column shape the original implementation returned.
        return np.where(is_male, 0.0, 1.0)[:, np.newaxis]

* divide dataset

In [63]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Print the shape of each piece as a quick sanity check.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(712, 8) (179, 8) (712,) (179,)


* train & predict

In [64]:
# Baseline model whose predictions depend on the "Sex" column only.
classifier = My_Dummy_Classifier()

# train
classifier.fit(X_train, y_train)  # the dummy learns nothing from the training data

# predict
predictions = classifier.predict(X_test)  # one prediction per row of X_test

* evaluation

In [65]:
# Accuracy of the dummy predictions on the held-out rows.
accuracy = accuracy_score(y_test, predictions)
print(f"accuracy: {accuracy:.4f}", "\n")

# Confusion matrix of true vs. predicted labels.
matrix = confusion_matrix(y_test, predictions)
print("confusion matrix:\n", matrix, "\n")

# Precision and recall, computed with the metric functions' default settings.
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
print(f"precision: {precision:.4f}")
print(f"recall   : {recall:.4f}")

accuracy: 0.7877 

confusion matrix:
 [[92 18]
 [20 49]] 

precision: 0.7313
recall   : 0.7101


## example #2: handwritten digits dataset (scikit-learn `load_digits`, an MNIST-like set of 8x8 images)

* design a dummy classifier to simulate accuracy metric

In [66]:
from sklearn.base import BaseEstimator

class My_Dummy_Denier(BaseEstimator):
    """Baseline that answers ``False`` for every sample, whatever the input."""

    def fit(self, X, y=None):
        """Accept training data without learning anything.

        Returns:
            ``self``, following the scikit-learn estimator convention, so
            that calls such as ``clf.fit(X, y).predict(X)`` can be chained.
        """
        return self

    def predict(self, X):
        """Deny every sample.

        Args:
            X: Any sized collection of samples; only ``len(X)`` is used.

        Returns:
            Boolean array of shape ``(len(X), 1)`` filled with ``False``.
        """
        sample_count = len(X)
        return np.zeros((sample_count, 1), dtype=bool)

* design a new binary target (1 if the digit is 7, otherwise 0)

In [67]:
# leave features as they are
X = digits.data

# suppose target is a boolean list,
# representing whether the detected number is 7 or not
y = (digits.target == 7).astype(int)

* divide dataset

In [68]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11
)
# Print the shape of each piece as a quick sanity check.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(1437, 64) (360, 64) (1437,) (360,)


In [69]:
# Count how many test samples fall in each class (1 = digit is 7, 0 = any
# other digit) to see how imbalanced the test target is.
pd.Series(y_test).value_counts()

0    324
1     36
dtype: int64

* train & predict

In [70]:
# Baseline model that answers False for every sample.
classifier = My_Dummy_Denier()

# train
classifier.fit(X_train, y_train)  # the dummy learns nothing from the training data

# predict
predictions = classifier.predict(X_test)  # one all-False prediction per test sample

* evaluation

In [71]:
# accuracy
print(f"accuracy: {accuracy_score(y_test, predictions):.4f}", "\n")

# confusion matrix
print("confusion matrix:\n", confusion_matrix(y_test, predictions), "\n")

# precision & recall
# The denier never predicts the positive class, so precision is 0/0.
# zero_division=0 states explicitly that this case scores 0, which keeps
# the printed value while avoiding the "ill-defined" metric warning.
print(f"precision: {precision_score(y_test, predictions, zero_division=0):.4f}")
print(f"recall   : {recall_score(y_test, predictions):.4f}")

accuracy: 0.9000 

confusion matrix:
 [[324   0]
 [ 36   0]] 

precision: 0.0000
recall   : 0.0000


  _warn_prf(average, modifier, msg_start, len(result))
