In [39]:
import numpy as np
import pandas as pd

In [40]:
url = ("http://hbiostat.org/data/repo/titanic3.xls")

df = pd.read_excel(url)


### Classification Pipeline

In [6]:
from sklearn.base import(
    BaseEstimator,
    TransformerMixin)
import pandas as pd
from sklearn.experimental import (
    enable_iterative_imputer,
)
from sklearn import (
    ensemble,
    impute,
    model_selection,    
    preprocessing,
    tree,
)
from sklearn.ensemble import (
    RandomForestClassifier,
)
from sklearn.pipeline import Pipeline

In [13]:
def tweak_titanic(df):
    df = df.drop(columns=["name",
                      "ticket",
                      "home.dest",
                      "boat",
                      "body",
                      "cabin"]
        ).pipe(pd.get_dummies, drop_first=True)
    return df

class TitanicTransformer(
    BaseEstimator, TransformerMixin):
    def transform(self, X):
        # assumes X is output
        # from reading Excel file
        X = tweak_titanic(X)
        X = X.drop(columns="survived")
        return X
    def fit(self, X, y):
        return self

pipe = Pipeline(
    [
        ("titan", TitanicTransformer()),
        ("impute", impute.IterativeImputer()),
        ("rf", RandomForestClassifier())
    ])

In [14]:
from sklearn.model_selection import train_test_split

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    df,
    df.survived,
    test_size=0.3,
    random_state=42)
pipe.fit(X_train2, y_train2)
pipe.score(X_test2, y_test2)

0.7989821882951654

In [15]:
params = {
    "rf__max_features":[0.4, "auto"],
    "rf__n_estimators": [15,200]
}

grid = model_selection.GridSearchCV(
    pipe, cv=3, param_grid=params)

grid.fit(df, df.survived)

6 fits failed out of a total of 12.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\timry\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\timry\anaconda3\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\timry\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\timry\anaconda3\Lib\site-packages\sklearn\base.py", line

In [16]:
grid.best_params_

{'rf__max_features': 0.4, 'rf__n_estimators': 200}

In [17]:
pipe.set_params(**grid.best_params_)
pipe.fit(X_train2, y_train2)
pipe.score(X_test2, y_test2)

0.7989821882951654

In [20]:
from sklearn import metrics
metrics.roc_auc_score(y_test2, pipe.predict(X_test2))

0.7880653000845309

### Regression Pipeline

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import (
    model_selection,
    preprocessing,
)


import pandas as pd
import numpy as np
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)

# raw_df has the data in 2 rows for each real row of data
# the first of the 2 rows contains all data needed for 11 columns
# the second of the 2 rows only contains needed data for 3 columns
# hstack takes these two buckets of data and combines then horizontally into 1 long row of columns
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :3]])
feature_names = ["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT", "MEDV"]
df = pd.DataFrame(data, columns=feature_names)

bos_X = df.drop(["MEDV"], axis=1)
bos_y = df["MEDV"]

bos_X_train, bos_X_test, bos_y_train, bos_y_test = model_selection.train_test_split(
    bos_X,
    bos_y,
    test_size=0.3,
    random_state=42,
)
bos_sX = preprocessing.StandardScaler().fit_transform(
    bos_X
)
bos_sX_train, bos_sX_test, bos_sy_train, bos_sy_test = model_selection.train_test_split(
    bos_sX,
    bos_y,
    test_size=0.3,
    random_state=42,
)

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
reg_pipe = Pipeline(
    [
        (
            "std",
            preprocessing.StandardScaler(),
        ),
        ("lr", LinearRegression()),
    ]
)
reg_pipe.fit(bos_X_train, bos_y_train)
reg_pipe.score(bos_X_test, bos_y_test)

0.7112260057484934

In [26]:
reg_pipe.named_steps["lr"].intercept_

23.01581920903955

In [27]:
reg_pipe.named_steps["lr"].coef_

array([-1.10834602,  0.80843998,  0.34313466,  0.81386426, -1.79804295,
        2.913858  , -0.29893918, -2.94251148,  2.09419303, -1.44706731,
       -2.05232232,  1.02375187, -3.88579002])

In [28]:
from sklearn import metrics

metrics.mean_squared_error(
    bos_y_test, reg_pipe.predict(bos_X_test))

21.517444231177194

### PCA Pipeline

In [43]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pca_pipe = Pipeline(
    [
        ("titan", TitanicTransformer()),
        ("impute", impute.IterativeImputer()),
        ("std", StandardScaler()),
        ("pca",PCA())
    ])

X_pca = pca_pipe.fit_transform(df, df.survived)

In [45]:
pca_pipe.named_steps["pca"].explained_variance_ratio_

array([0.23843437, 0.21766138, 0.19207432, 0.10460781, 0.08254178,
       0.07218454, 0.05099774, 0.04149805])

In [46]:
pca_pipe.named_steps["pca"].components_[0]

array([ 0.63591201, -0.39601222,  0.00210876, -0.10899407, -0.58278256,
        0.19349714,  0.19275661,  0.11258023])