## 1. Imputer 1

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd

In [None]:
# Toy matrices (3 samples x 3 features) containing missing values (np.nan).
# The imputer is fitted on train_data and then applied to both matrices.
train_data = [
    [7, 6, 5],
    [4, np.nan, 5],
    [1, 20, 8],
]
test_data = [
    [np.nan, 1, 2],
    [7, np.nan, 9],
    [np.nan, 2, 4],
]

In [None]:
# Fit an imputer with SimpleImputer's default settings on the training data
# only; the learned per-column fill values are stored in `statistics_`.
imp_mean = SimpleImputer()
imp_mean.fit(train_data)
print(imp_mean.statistics_, "\n")

# Sanity check: the learned statistics match the NaN-ignoring column means.
# np.allclose is used instead of `==` because exact float equality can fail
# on harmless rounding differences between the two computations.
np.allclose(imp_mean.statistics_, np.nanmean(train_data, axis=0))

In [None]:
# Apply the fitted imputer: missing entries are replaced by the statistics
# learned from train_data, for the training and the test matrix alike.
print(imp_mean.transform(train_data), "\n")
imp_mean.transform(test_data)

## 2. Scaler

In [None]:
# Small numeric matrices (3 samples x 3 features) for the scaling demo.
X_train = np.array([
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
])
X_test = np.array([
    [2.0, -1.0, 1.0],
    [3.0, 3.0, -1.0],
    [1.0, 1.0, 1.0],
])

In [None]:
# Learn the scaling statistics from the training matrix, then standardise
# that same matrix with the fitted scaler.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_transform = scaler.transform(X_train)
X_train_transform

In [None]:
# Per-column mean of the scaled training data.
X_train_transform.mean(axis=0)

In [None]:
# Per-column standard deviation of the scaled training data.
X_train_transform.std(axis=0)

In [None]:
# Scale the test set with the scaler fitted on the training set
# (transform only: the scaler is not re-fitted here).
scaler.transform(X_test)

## 3. One hot encoder

In [None]:
# One categorical feature (a programming language) per sample.
# 'C' appears only in the test set, so it is unseen when the encoder is fitted.
X_train = [
    ['Python'],
    ['Java'],
    ['Java'],
    ['C++'],
]
X_test = [
    ['Python'],
    ['Java'],
    ['C'],
    ['C++'],
]

In [None]:
# Fit the one-hot encoder on the training categories and encode them.
# handle_unknown="ignore" is set so that categories unseen during fit do not
# raise at transform time.
enc = OneHotEncoder(handle_unknown="ignore")
X_train_transform = enc.fit(X_train).transform(X_train)
# Densify the encoded matrix and label its columns with the learned categories
# of the single input feature.
pd.DataFrame(X_train_transform.toarray(), columns=enc.categories_[0])

In [None]:
# Encode the test set with the encoder that was fitted on the training data.
X_test_transform = enc.transform(X_test)
# enc.categories_ is a list holding one array per input feature. Index [0] to
# get flat column labels, as the training-set cell does; passing the list
# itself would build a one-level MultiIndex instead.
pd.DataFrame(data=X_test_transform.toarray(), columns=enc.categories_[0])

## 4. Ordinal Encoder 

In [None]:
# One ordered categorical feature (a rating) per sample.
X_train = [
    ['good'],
    ['bad'],
    ['neutral'],
]
X_test = [
    ['good'],
    ['good'],
    ['bad'],
]

In [None]:
# The category order is given explicitly (bad, neutral, good) so the encoder
# does not fall back to its automatic ordering.
enc = OrdinalEncoder(categories=[["bad", "neutral", "good"]])
# Fit on the training data and encode it in one step.
enc.fit_transform(X_train)

In [None]:
# Inspect the categories held by the fitted encoder (one array per feature).
enc.categories_

In [None]:
# Encode the test set with the encoder fitted on the training data.
enc.transform(X_test)

## 5. Categorical variables

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
# Breast-cancer data set; column names are supplied explicitly via `names`.
# "Class" is the target column:
#   2 = benign
#   4 = malignant
# Empty fields and "?" are parsed as missing values.
df = pd.read_csv(
    "breast-cancer-wisconsin.data",
    names=[
        "Sample code number",
        "Clump Thickness",
        "Uniformity of Cell Size",
        "Uniformity of Cell Shape",
        "Marginal Adhesion",
        "Single Epithelial Cell Size",
        "Bare Nuclei",
        "Bland Chromatin",
        "Normal Nucleoli",
        "Mitoses",
        "Class",
    ],
    index_col="Sample code number",
    na_values=["", "?"],
)
df

Unnamed: 0_level_0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
Sample code number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1.0,3,1,1,2
1002945,5,4,4,5,7,10.0,3,2,1,2
1015425,3,1,1,1,2,2.0,3,1,1,2
1016277,6,8,8,1,3,4.0,3,7,1,2
1017023,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
776715,3,1,1,1,3,2.0,1,1,1,2
841769,2,1,1,1,2,1.0,1,1,1,2
888820,5,10,10,3,7,3.0,8,10,2,4
897471,4,8,6,4,3,4.0,10,6,1,4


In [3]:
# Count missing values per column.
df.isna().sum()

Clump Thickness                 0
Uniformity of Cell Size         0
Uniformity of Cell Shape        0
Marginal Adhesion               0
Single Epithelial Cell Size     0
Bare Nuclei                    16
Bland Chromatin                 0
Normal Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64

In [4]:
# Fill the missing "Bare Nuclei" values with the column median, then confirm
# that no NaN is left.
# NOTE(review): the median is computed over the whole data set, i.e. before the
# train/test split performed in a later cell, so test rows contribute to the
# imputed value. Consider fitting the imputation on the training split only.
df.loc[:, "Bare Nuclei"] = df.loc[:, "Bare Nuclei"].fillna(df.loc[:, "Bare Nuclei"].median())
df.isna().sum()

Clump Thickness                0
Uniformity of Cell Size        0
Uniformity of Cell Shape       0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

In [5]:
# Class balance expressed as proportions. `normalize` is passed by keyword:
# a bare positional 2 only works because any truthy value enables it.
df.loc[:, "Class"].value_counts(normalize=True)

Class
2    0.655222
4    0.344778
Name: proportion, dtype: float64

In [12]:
# Features are every column except the target.
X = df.drop(columns=["Class"])

# Stratified split (20% test) on the "Class" column; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, df.loc[:, "Class"],
    test_size=0.2, random_state=43, stratify=df.loc[:, "Class"])
# Class proportions in the training split (`normalize` passed by keyword).
print("unique labels", y_train.value_counts(normalize=True), "\n")

unique labels Class
2    0.654741
4    0.345259
Name: proportion, dtype: float64 



In [7]:
# Fit a logistic-regression classifier with default settings on the training
# split.
log = LogisticRegression()
log.fit(X_train, y_train)

# Hard class predictions and class probabilities: training split first, then
# test split.
y_preds_train, y_probas_train = log.predict(X_train), log.predict_proba(X_train)
y_preds_test, y_probas_test = log.predict(X_test), log.predict_proba(X_test)

# Show the first ten predictions next to column 1 of predict_proba.
print(y_preds_train[:10], "\n")
print(y_probas_train[:10, 1], "\n")

print(y_preds_test[:10], "\n")
y_probas_test[:10, 1]

[4 2 4 2 2 2 2 4 2 2] 

[0.99600525 0.00908951 0.99992741 0.00528932 0.02098032 0.00582947
 0.03566504 0.99515453 0.00788556 0.01065907] 

[2 2 2 4 2 4 2 2 2 4] 



array([0.01747894, 0.22504709, 0.00699085, 0.54019127, 0.00152929,
       0.9986223 , 0.33611399, 0.01228123, 0.00438318, 0.99972361])

In [8]:
# Score of the fitted classifier on the training split, then on the test split.
print(log.score(X_train, y_train))
print(log.score(X_test, y_test))

0.9695885509838998
0.9642857142857143


In [9]:
# Confusion matrix on the training split (true labels first, predictions second).
confusion_matrix(y_train, y_preds_train)

array([[357,   9],
       [  8, 185]])

In [10]:
# Confusion matrix on the test split (true labels first, predictions second).
confusion_matrix(y_test, y_preds_test)

array([[90,  2],
       [ 3, 45]])

## 6. Pipeline

In [22]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
import numpy as np

In [23]:
# Load the iris data set and wrap features and target in DataFrames.
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.DataFrame(iris.target, columns=["target"])

In [24]:
# Shuffled split holding out 33% of the samples for testing; the fixed
# random_state makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=43,
    shuffle=True,
)

In [25]:
def train(X_train, y_train, classes=(0, 1, 2)):
    """Fit one binary logistic regression per class (one-vs-rest).

    For every label in ``classes`` a fresh ``LogisticRegression`` is fitted
    on the binary target ``y_train == label``.

    Parameters
    ----------
    X_train : training features, passed unchanged to ``fit``.
    y_train : training labels; must support elementwise ``==`` against a
        label (e.g. a pandas Series or a NumPy array).
    classes : iterable of labels to build a classifier for. Defaults to
        ``(0, 1, 2)``, which gives the three classifiers existing callers
        unpack.

    Returns
    -------
    tuple
        The fitted classifiers, in the same order as ``classes``.
    """
    classifiers = []
    for label in classes:
        clf = LogisticRegression()
        # Binary target: True for samples of the current class, False otherwise.
        clf.fit(X_train, y_train == label)
        classifiers.append(clf)
    return tuple(classifiers)

In [26]:
# Fit the three one-vs-rest classifiers; the "target" column is passed as a
# Series so that `y_train == k` inside train() yields a 1-D boolean target.
clf0, clf1, clf2 = train(X_train, y_train["target"])

In [27]:
def predict_one_vs_all(X, clf0, clf1, clf2):
    """Predict class indices by combining three one-vs-rest classifiers.

    Column 1 of each classifier's ``predict_proba`` output is collected, and
    for every sample the position (0, 1 or 2) of the classifier with the
    highest value is returned. Ties go to the earliest classifier.
    """
    positive_scores = [
        clf.predict_proba(X)[:, 1] for clf in (clf0, clf1, clf2)
    ]
    # Stack to one row per sample and one column per classifier.
    score_matrix = np.vstack(positive_scores).T
    return score_matrix.argmax(axis=1)

In [28]:
# Manual one-vs-all prediction on the test split.
classes = predict_one_vs_all(X_test, clf0, clf1, clf2)
classes

array([0, 0, 2, 1, 2, 0, 2, 1, 1, 1, 0, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0,
       0, 2, 2, 2, 0, 1, 0, 0, 1, 0, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 2,
       0, 1, 1, 1, 1, 2])

In [29]:
# scikit-learn's one-vs-rest wrapper around the same base estimator, used as a
# reference for the manual implementation.
lr_ovr = OneVsRestClassifier(LogisticRegression()).fit(X_train, y_train["target"])
y_preds = lr_ovr.predict(X_test)
y_preds

array([0, 0, 2, 1, 2, 0, 2, 1, 1, 1, 0, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0,
       0, 2, 2, 2, 0, 1, 0, 0, 1, 0, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 2,
       0, 1, 1, 1, 1, 2])

In [30]:
# Check that the manual one-vs-all predictions equal those of
# OneVsRestClassifier. np.array_equal also requires the shapes to agree,
# whereas an elementwise `==` could silently broadcast mismatched shapes.
np.array_equal(classes, y_preds)

np.True_