In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline # the pipeline library
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix

In [2]:
# Column labels supplied by hand via `names=`; the first row of the file is data
# (see the head() output), so the file carries no header row of its own.
colnames = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]

# Read the raw CSV into a DataFrame, attaching the labels above.
data_path = 'pima-indians-diabetes.data'
pima_df = pd.read_csv(data_path, names=colnames)

# Preview the first five rows as a sanity check on the load.
pima_df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Independent variables (features): every column except the 'class' label.
X = pima_df.drop(columns='class')

# Dependent variable (target): the 'class' column on its own.
y = pima_df['class']

In [4]:
# Train/test split: hold out 30% of the rows for evaluation. random_state is
# fixed so the same partition is produced on every run.
# The target is passed as a 1-D NumPy array via to_numpy(); the previous
# y.ravel() returned the same array but Series.ravel is deprecated in
# pandas >= 2.2 and emits a FutureWarning.
X_train, X_test, y_train, y_test = train_test_split(
    X, y.to_numpy(), test_size=0.30, random_state=7
)

In [5]:
# A Pipeline is built from an ordered list of (name, estimator) tuples.
# Every entry before the last is a transformer; the last entry is the model
# itself (here a logistic-regression classifier fed by the scaler).
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

In [6]:
# The pipeline exposes the same fit() interface as a plain classifier:
# a single call fits each step in order on the training data.
pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('clf', LogisticRegression())])

In [7]:
# Predict labels for the held-out rows; the pipeline applies its
# transformer step before handing the data to the classifier.
y_predict = pipeline.predict(X_test)

# score() on the pipeline delegates to the final estimator's score
# (mean accuracy for a classifier) on the test split.
model_score = pipeline.score(X_test, y_test)

# Confusion matrix of true vs. predicted labels on the test split.
conf_mat = confusion_matrix(y_test, y_predict)

# Emit the score and the matrix, one after the other.
print(model_score, conf_mat, sep='\n')

0.7748917748917749
[[130  17]
 [ 35  49]]


In [8]:
pipeline.steps # display the ordered (name, estimator) pairs that make up the pipeline

[('scaler', StandardScaler()), ('clf', LogisticRegression())]