<a href="https://colab.research.google.com/github/PaletteofDesign/pipeline/blob/main/pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Attach Google Drive to the Colab runtime so files under /content/drive are reachable.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

In [3]:
# The data file has no header row: header=None keeps the first record as data
# and leaves the columns labelled with integers until they are renamed.
f = '/content/abalone.data'
df = pd.read_csv(f, header=None)

# Preview the first five rows.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
# Replace the integer column labels with descriptive names.
# Note: 'Rings' is the ring count; Rings + 1.5 gives the age in years.
column_names = {
    0: 'Sex',
    1: 'Length',
    2: 'Diameter',
    3: 'Height',
    4: 'Whole_Weight',
    5: 'Shucked_Weight',
    6: 'Viscera_Weight',
    7: 'Shell_Weight',
    8: 'Rings',
}

# Reassign instead of using inplace=True: the cell builds a new frame from its
# input rather than mutating an object an earlier cell already displayed.
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_Weight,Shucked_Weight,Viscera_Weight,Shell_Weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [5]:
# Class balance check: number of samples per 'Sex' category.
df.loc[:, 'Sex'].value_counts()

M    1528
I    1342
F    1307
Name: Sex, dtype: int64

In [6]:
# Regression features: the seven numeric measurements.
# 'Sex' and the 'Rings' target are deliberately left out.
X = df[[
    'Length',
    'Diameter',
    'Height',
    'Whole_Weight',
    'Shucked_Weight',
    'Viscera_Weight',
    'Shell_Weight',
]]
X.head()

Unnamed: 0,Length,Diameter,Height,Whole_Weight,Shucked_Weight,Viscera_Weight,Shell_Weight
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [7]:
# Regression target: the ring count.
y = df.loc[:, 'Rings']
y.head()

0    15
1     7
2     9
3    10
4     7
Name: Rings, dtype: int64

In [8]:
# Hold out a test set using train_test_split's default proportions;
# the fixed random_state keeps the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [9]:
# Two-step regression pipeline: standardize the features, then apply a
# k-nearest-neighbours regressor (both steps use their default settings).
reg_pipe = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(),
)

In [10]:
# Fit scaler and regressor on the training split only; the test split stays unseen.
reg_pipe.fit(X_train, y=y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('kneighborsregressor',
                 KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                     metric='minkowski', metric_params=None,
                                     n_jobs=None, n_neighbors=5, p=2,
                                     weights='uniform'))],
         verbose=False)

In [11]:
# Compare R^2 on the data the pipeline was fitted on against the held-out data.
train_r2 = reg_pipe.score(X_train, y_train)
test_r2 = reg_pipe.score(X_test, y_test)

print("Training R2:", train_r2)
print("Testing R2:", test_r2)

Training R2: 0.6557365549173233
Testing R2: 0.472919481762452


In [12]:
# Classification task: predict 'Sex' from the seven measurements plus 'Rings'.
# NOTE: this rebinds X and y, replacing the regression features/target above.
X = df.loc[:, [
    'Length',
    'Diameter',
    'Height',
    'Whole_Weight',
    'Shucked_Weight',
    'Viscera_Weight',
    'Shell_Weight',
    'Rings',
]]
y = df.loc[:, 'Sex']

In [13]:
# Re-split for the classification X/y with the same seed.
# NOTE: this rebinds the train/test names used by the regression cells above.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [14]:
# Two-step classification pipeline: standardize the features, then apply a
# k-nearest-neighbours classifier (both steps use their default settings).
class_pipe = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(),
)

# Fit on the training split only.
class_pipe.fit(X_train, y=y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('kneighborsclassifier',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=5, p=2,
                                      weights='uniform'))],
         verbose=False)

In [15]:
# Compare accuracy on the data the pipeline was fitted on against the held-out data.
train_accuracy = class_pipe.score(X_train, y_train)
test_accuracy = class_pipe.score(X_test, y_test)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

Training Accuracy: 0.679757343550447
Testing Accuracy: 0.5473684210526316


1. KNN is distance-based, so it depends on the features being scaled, and a pipeline incorporates that scaling step; the two therefore benefit each other.

2. Other models that rely on scaling would also benefit from using a pipeline. A pipeline makes a model easier to reproduce and helps streamline the modeling workflow.