# 1. PIPELINE

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
german = pd.read_csv("german_credit_data.csv")

In [3]:
german.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [4]:
import numpy as np
german['Risk'] = np.where(german['Risk']=='bad', 1, 0)
german.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,0
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,1
2,2,49,male,1,own,little,,2096,12,education,0
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,0
4,4,53,male,2,free,little,little,4870,24,car,1


In [5]:
german = german[['Age', 'Credit amount', 'Duration', 'Risk']].copy()

In [6]:
german.head()

Unnamed: 0,Age,Credit amount,Duration,Risk
0,67,1169,6,0
1,22,5951,48,1
2,49,2096,12,0
3,45,7882,42,0
4,53,4870,24,1


In [7]:
features = german[['Age', 'Credit amount', 'Duration']]
labels = german['Risk']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=0
)

Sem o pipeline, vc faria o passo a passo, algo do tipo:

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier

# scaler = StandardScaler()
# scaler.fit_transform(X_train)


pipe = Pipeline([('scaler', StandardScaler()), 
                 ('imputer', SimpleImputer(strategy='median')),
                 ('clf', DecisionTreeClassifier())])

pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.6166666666666667

In [10]:
pipe.predict(X_test)

array([1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0])

# 2. MAKE_PIPELINE

In [11]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

make_pipe = make_pipeline(MinMaxScaler(), 
                     SimpleImputer(strategy='mean'), 
                     LogisticRegression())

make_pipe.fit(X_train, y_train)
make_pipe.score(X_test, y_test)

0.7166666666666667

# 3. COLUMN TRANSFORMER

In [12]:
german = pd.read_csv("german_credit_data.csv")

In [13]:
german['Risk'] = np.where(german['Risk']=='bad', 1, 0)

german = german[['Age', 'Credit amount', 'Duration', 'Purpose','Risk']].copy()

german.head()

Unnamed: 0,Age,Credit amount,Duration,Purpose,Risk
0,67,1169,6,radio/TV,0
1,22,5951,48,radio/TV,1
2,49,2096,12,education,0
3,45,7882,42,furniture/equipment,0
4,53,4870,24,car,1


In [14]:
features = german[['Age', 'Credit amount', 'Duration', 'Purpose']]
labels = german['Risk']

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=0
)

In [15]:
numericas_continuas = ['Age', 'Credit amount', 'Duration']
string_categoricas =['Purpose']

In [16]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

t = [('num_continuas', StandardScaler(), numericas_continuas), 
     ('str_categoricas', OneHotEncoder(), string_categoricas)]

preprocessor = ColumnTransformer(transformers=t)

In [17]:
pipe_transformer = Pipeline(steps=[('preprocessor', preprocessor), ('clf', DecisionTreeClassifier())])

In [18]:
pipe_transformer.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num_continuas',
                                                  StandardScaler(),
                                                  ['Age', 'Credit amount',
                                                   'Duration']),
                                                 ('str_categoricas',
                                                  OneHotEncoder(),
                                                  ['Purpose'])])),
                ('clf', DecisionTreeClassifier())])

In [19]:
pipe_transformer.score(X_test, y_test)

0.6

# 4. TESTANDO VÁRIOS MODELOS

In [20]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

In [21]:
classifiers_list = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    SVC(),
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
]


In [22]:
for classifier in classifiers_list:
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])
    pipe.fit(X_train, y_train)
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))

KNeighborsClassifier(n_neighbors=3)
model score: 0.690
SVC(C=0.025, probability=True)
model score: 0.713
SVC()
model score: 0.720
LogisticRegression()
model score: 0.697
DecisionTreeClassifier()
model score: 0.613
RandomForestClassifier()
model score: 0.687
AdaBoostClassifier()
model score: 0.720
GradientBoostingClassifier()
model score: 0.710
