<a href="https://colab.research.google.com/github/SuccessPear/MLPractices/blob/main/Pipeline/Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Authenticate to Kaggle


In [1]:
!mkdir ~/.kaggle

In [6]:
!cp kaggle.json ~/.kaggle/kaggle.json

In [7]:
!chmod 600 ~/.kaggle/kaggle.json

In [8]:
!kaggle competitions download -c house-prices-advanced-regression-techniques

Downloading house-prices-advanced-regression-techniques.zip to /content
  0% 0.00/199k [00:00<?, ?B/s]
100% 199k/199k [00:00<00:00, 42.2MB/s]


In [9]:
!unzip house-prices-advanced-regression-techniques.zip

Archive:  house-prices-advanced-regression-techniques.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# Pipeline practice

In [37]:
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

In [11]:
df = pd.read_csv("train.csv")

In [12]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [17]:
# Only certain columns so I dont need to deal with NaNs for now
selected_df = df[['MSSubClass',	'MSZoning',	'LotFrontage', 'LotArea',	'Street', 'LotShape',	'LandContour',
        'Utilities', 'MiscVal',	'MoSold',	'YrSold',	'SaleType', 'SalePrice']].dropna()

In [18]:
X = pd.get_dummies(selected_df.drop('SalePrice', axis=1))

In [24]:
y = selected_df.SalePrice

In [25]:
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,MiscVal,MoSold,YrSold,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,...,Utilities_AllPub,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD
0,60,65.0,8450,0,2,2008,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
1,20,80.0,9600,0,5,2007,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
2,60,68.0,11250,0,9,2008,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
3,70,60.0,9550,0,2,2006,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,60,84.0,14260,0,12,2008,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1


In [26]:
pipeline = make_pipeline(StandardScaler(), RandomForestRegressor())

In [27]:
pipeline.fit(X, y)

In [28]:
pipeline.predict(X)

array([202992.  , 165150.5 , 220835.5 , ..., 216847.25, 145165.  ,
       150049.  ])

# Save the Pipeline

In [29]:
import pickle

In [30]:
with open('pipelinemodel.pkl', 'wb') as f:
  pickle.dump(pipeline, f)

In [31]:
with open('pipelinemodel.pkl', 'rb') as f:
  reloaded_model = pickle.load(f)

In [32]:
reloaded_model

In [33]:
reloaded_model.predict(X)

array([202992.  , 165150.5 , 220835.5 , ..., 216847.25, 145165.  ,
       150049.  ])

In [36]:
reloaded_model.named_steps['randomforestregressor']

# Using the Pipeline class

In [38]:
custom_pipeline = Pipeline([('scaling', StandardScaler()),
                            ('rfmodel', RandomForestRegressor())])

# Column Transformer

In [40]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [75]:
# Categorical features
categorical_features = selected_df.select_dtypes('object').columns
categorical_pipeline = Pipeline([('scaler', OneHotEncoder())])

In [76]:
# Numeric features
numeric_features = selected_df.drop('SalePrice', axis=1).select_dtypes(exclude='object').columns
numeric_pipeline = Pipeline([('onehot', StandardScaler())])

In [77]:
ct = ColumnTransformer([('num', numeric_pipeline, numeric_features),
                        ('cat', categorical_pipeline, categorical_features)])

In [78]:
print(ct)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('onehot', StandardScaler())]),
                                 Index(['MSSubClass', 'LotFrontage', 'LotArea', 'MiscVal', 'MoSold', 'YrSold'], dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('scaler', OneHotEncoder())]),
                                 Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType'],
      dtype='object'))])


In [79]:
ml_pipeline = Pipeline([('all_column_preprocessing', ct),
                        ('randomforestclassifier', RandomForestRegressor())])

In [80]:
print(ml_pipeline)

Pipeline(steps=[('all_column_preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('onehot',
                                                                   StandardScaler())]),
                                                  Index(['MSSubClass', 'LotFrontage', 'LotArea', 'MiscVal', 'MoSold', 'YrSold'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('scaler',
                                                                   OneHotEncoder())]),
                                                  Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType'],
      dtype='object'))])),
                ('randomforestclassifier', RandomForestRegressor())])


In [81]:
ml_pipeline

In [82]:
X = selected_df.drop('SalePrice', axis=1)
y = selected_df.SalePrice

In [83]:
ml_pipeline.fit(X, y)

In [84]:
ml_pipeline.predict(X)

array([202460.        , 160785.66666667, 214109.        , ...,
       228640.75      , 140979.5       , 152032.5       ])

In [85]:
with open('columntransformermodel.pkl', 'wb') as f:
  pickle.dump(ml_pipeline, f)