## 1. Imputer 1

In [1]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd

In [2]:
# Toy 3x3 matrices with gaps (np.nan) used to demonstrate imputation.
train_data = [
    [7, 6, 5],
    [4, np.nan, 5],
    [1, 20, 8],
]
test_data = [
    [np.nan, 1, 2],
    [7, np.nan, 9],
    [np.nan, 2, 4],
]

In [3]:
# Learn one fill value per column (the column mean) from the training rows.
imp_mean = SimpleImputer(strategy="mean").fit(train_data)
print(imp_mean.statistics_, "\n")
# Sanity check: the learned statistics equal NumPy's NaN-aware column means.
expected_means = np.nanmean(train_data, axis=0)
np.all(imp_mean.statistics_ == expected_means)

[ 4. 13.  6.] 



np.True_

In [4]:
# Fill the gaps in the training rows with the statistics learned by fit().
imp_mean.transform(train_data)


array([[ 7.,  6.,  5.],
       [ 4., 13.,  5.],
       [ 1., 20.,  8.]])

In [5]:
# The test rows are filled with the statistics learned from the training rows;
# only transform() is called here, so nothing is re-estimated on test_data.
imp_mean.transform(test_data)

array([[ 4.,  1.,  2.],
       [ 7., 13.,  9.],
       [ 4.,  2.,  4.]])

## 2. Scaler

In [6]:
# Small float matrices for the scaling demo: fit on X_train, apply to X_test.
X_train = np.array(
    [
        [1.0, -1.0, 2.0],
        [2.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
    ]
)
X_test = np.array(
    [
        [2.0, -1.0, 1.0],
        [3.0, 3.0, -1.0],
        [1.0, 1.0, 1.0],
    ]
)

In [7]:
# Learn the per-column scaling statistics from X_train, then apply them to it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_transform = scaler.transform(X_train)
X_train_transform

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [8]:
# Column means of the scaled training matrix (expected to be 0 after scaling).
np.mean(X_train_transform, axis=0)

array([0., 0., 0.])

In [9]:
# Column standard deviations of the scaled training matrix (expected to be 1).
np.std(X_train_transform, axis=0)

array([1., 1., 1.])

In [10]:
# Scale the test matrix with the already-fitted scaler (no refit on X_test),
# so its columns are not expected to come out with mean 0 / std 1.
scaler.transform(X_test)

array([[ 1.22474487, -1.22474487,  0.53452248],
       [ 2.44948974,  3.67423461, -1.06904497],
       [ 0.        ,  1.22474487,  0.53452248]])

## 3. One hot encoder

In [11]:
# One categorical column per row; "C" appears only in the test rows.
X_train = [[language] for language in ("Python", "Java", "Java", "C++")]
X_test = [[language] for language in ("Python", "Java", "C", "C++")]

In [12]:
# handle_unknown="ignore": categories not seen during fit do not raise later.
enc = OneHotEncoder(handle_unknown="ignore", dtype=int)
X_train_transform = enc.fit_transform(X_train)
# Label the indicator columns with the categories learned for the only feature.
pd.DataFrame(X_train_transform.toarray(), columns=enc.categories_[0])

Unnamed: 0,C++,Java,Python
0,0,0,1
1,0,1,0
2,0,1,0
3,1,0,0


In [13]:
# Encode the test rows with the encoder fitted on X_train.
X_test_transform = enc.transform(X_test)
# enc.categories_ is a list with one array per input feature; index [0] to get
# the flat label array for the single feature. Passing the whole list would
# make pandas build MultiIndex columns, unlike the training frame.
pd.DataFrame(data=X_test_transform.toarray(), columns=enc.categories_[0])

Unnamed: 0,C++,Java,Python
0,0,0,1
1,0,1,0
2,0,0,0
3,1,0,0


## 4. Ordinal Encoder 

In [14]:
# One ordered categorical column per row.
X_train = [[rating] for rating in ("good", "bad", "neutral")]
X_test = [[rating] for rating in ("good", "good", "bad")]

In [15]:
# The category order is given explicitly for the single feature.
rating_order = ["bad", "neutral", "good"]
enc = OrdinalEncoder(categories=[rating_order]).fit(X_train)
enc.transform(X_train)

array([[2.],
       [0.],
       [1.]])

In [16]:
# Categories held by the fitted encoder, one array per input feature.
enc.categories_

[array(['bad', 'neutral', 'good'], dtype=object)]

In [17]:
# Encode the test labels with the already-fitted encoder (no refit).
enc.transform(X_test)

array([[2.],
       [2.],
       [0.]])

## 5. Categorical variables

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.compose import make_column_transformer


In [25]:
# Column names are supplied via `names=` (no header= is given, so the first
# line of the file is read as data).
# "?" is declared as a missing-value marker so the isna()/dropna() step that
# follows can actually see the gaps; pandas does not treat "?" as NaN by default.
df = pd.read_csv(
    "breast-cancer.csv",
    names=["age", "menopause", "tumor_size",
           "inv-nodes", "node-caps", "deg-malig",
           "breast", "breast-quad", "irradiat", "Class"],
    na_values=["?"],
)
df

Unnamed: 0,age,menopause,tumor_size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,recurrence-events
1,50-59,ge40,15-19,0-2,no,1,right,central,no,no-recurrence-events
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,recurrence-events
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,no-recurrence-events
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,recurrence-events
...,...,...,...,...,...,...,...,...,...,...
281,50-59,ge40,30-34,6-8,yes,2,left,left_low,no,no-recurrence-events
282,50-59,premeno,25-29,3-5,yes,2,left,left_low,yes,no-recurrence-events
283,30-39,premeno,30-34,6-8,yes,2,right,right_up,no,no-recurrence-events
284,50-59,premeno,15-19,0-2,no,2,right,left_low,no,no-recurrence-events


In [26]:
# Missing values per column before cleaning.
print(df.isna().sum(), "\n")
# Rebind to a new frame instead of using inplace=True, so the object shown by
# the loading cell is not mutated behind the reader's back.
df = df.dropna()
# Confirm that no missing values remain.
df.isna().sum()

age            0
menopause      0
tumor_size     0
inv-nodes      0
node-caps      8
deg-malig      0
breast         0
breast-quad    1
irradiat       0
Class          0
dtype: int64 



age            0
menopause      0
tumor_size     0
inv-nodes      0
node-caps      0
deg-malig      0
breast         0
breast-quad    0
irradiat       0
Class          0
dtype: int64

In [28]:
# Separate the label column from the predictor columns.
target_col = "Class"
y = df[target_col]
X = df.drop(columns=[target_col])

In [29]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

In [38]:
# Number of distinct values per feature in the training split.
X_train.nunique()

age             6
menopause       3
tumor_size     11
inv-nodes       6
node-caps       2
deg-malig       3
breast          2
breast-quad     5
irradiat        2
dtype: int64

In [32]:
# One-hot encode the columns listed in ohe_cols. Categories are learned from
# the training split only; handle_unknown="ignore" makes a category that shows
# up only in the test split encode as all zeros for that feature instead of
# raising an error at transform time.
ohe = OneHotEncoder(handle_unknown="ignore")
ohe_cols = ["node-caps", "breast", "breast-quad", "irradiat"]
ohe.fit(X_train.loc[:, ohe_cols])
# Preview the encoding of the first 10 test rows as a dense array.
ohe.transform(X_test.loc[:, ohe_cols].head(10)).toarray()

array([[1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.],
       [0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1.],
       [1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0.],
       [1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.]])

In [39]:
# ohe.get_feature_names_out(ohe_cols)

In [40]:
# Level order for each ordinal feature, in the order the encoder should number
# them. Keeping every column next to its own level list means the column order
# and the category order cannot drift apart.
ordinal_levels = {
    "menopause": ["lt40", "premeno", "ge40"],
    "age": [
        "10-19", "20-29", "30-39", "40-49", "50-59",
        "60-69", "70-79", "80-89", "90-99",
    ],
    "tumor_size": [
        "0-4", "5-9", "10-14", "15-19", "20-24", "25-29",
        "30-34", "35-39", "40-44", "45-49", "50-54", "55-59",
    ],
    "inv-nodes": [
        "0-2", "3-5", "6-8", "9-11", "12-14", "15-17", "18-20",
        "21-23", "24-26", "27-29", "30-32", "33-35", "36-39",
    ],
    "deg-malig": [1, 2, 3],
}
oe_cols = list(ordinal_levels)
categories = list(ordinal_levels.values())

In [41]:
# Fit on the training split with the explicit level order, then preview the
# integer codes of the first 10 test rows.
oe = OrdinalEncoder(categories=categories).fit(X_train[oe_cols])
oe.transform(X_test[oe_cols])[:10]

array([[2., 5., 2., 0., 1.],
       [2., 5., 2., 0., 0.],
       [2., 5., 4., 5., 2.],
       [1., 4., 5., 1., 1.],
       [2., 5., 5., 0., 2.],
       [1., 2., 1., 0., 1.],
       [1., 2., 8., 0., 1.],
       [2., 5., 2., 0., 0.],
       [2., 5., 5., 0., 2.],
       [1., 2., 3., 0., 0.]])

In [44]:
# Apply both encoders in one step: one-hot columns first, ordinal codes after.
# remainder="passthrough" forwards any column not listed in ohe_cols/oe_cols.
encoder_specs = [(ohe, ohe_cols), (oe, oe_cols)]
column_transformer = make_column_transformer(
    *encoder_specs, remainder="passthrough"
).fit(X_train)
column_transformer.transform(X_test)[:2]

array([[1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 2., 5., 2., 0., 1.],
       [1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 2., 5., 2., 0., 0.]])

## 6. Pipeline

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
import numpy as np

In [None]:
# Load iris and wrap features and labels in DataFrames
# (y is a one-column frame named "target").
iris = load_iris()
X = pd.DataFrame(iris["data"], columns=iris.feature_names)
y = pd.DataFrame(iris["target"], columns=["target"])

In [None]:
# Shuffled 67/33 split; the seed keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=43,
    shuffle=True,
)

In [None]:
def train(X_train, y_train):
    """Fit one binary logistic regression per class label 0, 1 and 2.

    Each model is trained on the boolean target ``y_train == label``,
    i.e. "this label versus everything else".

    Returns:
        A 3-tuple ``(clf0, clf1, clf2)`` of fitted classifiers, in label order.
    """
    return tuple(
        LogisticRegression().fit(X_train, y_train == label)
        for label in (0, 1, 2)
    )

In [None]:
# y_train is a one-column DataFrame; pass its "target" column so that the
# `y_train == k` comparisons inside train() produce a 1-D boolean target.
clf0, clf1, clf2 = train(X_train, y_train["target"])

In [None]:
def predict_one_vs_all(X, clf0, clf1, clf2):
    """Pick, for every row of ``X``, the classifier with the highest score.

    Column 1 of each classifier's ``predict_proba`` output is used as its
    score. The result is a 1-D array holding, per row, the position (0, 1
    or 2) of the classifier whose score is largest; ties go to the lowest
    position.
    """
    positive_scores = [
        model.predict_proba(X)[:, 1] for model in (clf0, clf1, clf2)
    ]
    # Shape (n_rows, 3): one column of scores per classifier.
    score_matrix = np.column_stack(positive_scores)
    return score_matrix.argmax(axis=1)

In [None]:
# Manual one-vs-rest prediction on the held-out rows using the three models.
classes = predict_one_vs_all(X_test, clf0, clf1, clf2)
classes

In [None]:
# Same one-vs-rest scheme, this time via scikit-learn's wrapper.
lr_ovr = OneVsRestClassifier(LogisticRegression())
lr_ovr = lr_ovr.fit(X_train, y_train["target"])
y_preds = lr_ovr.predict(X_test)
y_preds

In [None]:
# Check that the manual one-vs-rest predictions match OneVsRestClassifier's.
np.all(classes == y_preds)