<a href="https://colab.research.google.com/github/TheHouseOfVermeulens/wernervermeulen.github.io/blob/master/Creating_Model_with_Pipeline_and_Custom_Transformer_Practise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating Model with Pipeline and Custom Transformer

---



## Load Dataset

In [0]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

In [0]:
# Fetch the California housing dataset and keep the returned container around,
# since the target vector is read from it in a later cell.
data = fetch_california_housing()

# Wrap the raw feature matrix in a DataFrame so every column carries its name.
X = pd.DataFrame(data=data.data, columns=data.feature_names)
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [0]:
# Target values as a named Series, so the label shows up in displays.
y = pd.Series(data=data.target, name='MedPrice')
y.head()

0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: MedPrice, dtype: float64

## Train-Test Split

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

## Create Custom Transformer

All transformers in scikit-learn follow this template:

```python
class TransName(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        pass
        
    def transform(self, X):
        pass
```

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin
import math
import numpy as np

In [0]:
class CityDistance(BaseEstimator, TransformerMixin):
    """Replace (latitude, longitude) pairs by distances to two reference points.

    For every input row the transformer emits two features: the great-circle
    (haversine) distance, in meters, to ``sf`` and to ``la``.

    Parameters
    ----------
    sf : tuple of float, default=(37.77, -122.41)
        ``(latitude, longitude)`` in degrees of the first reference point
        (San Francisco by default).
    la : tuple of float, default=(34.05, -118.24)
        ``(latitude, longitude)`` in degrees of the second reference point
        (Los Angeles by default).
    """

    def __init__(self, sf=(37.77, -122.41), la=(34.05, -118.24)):
        # Stored unchanged under the same names as the keyword arguments so
        # that get_params / set_params / clone can see and round-trip them.
        self.sf = sf
        self.la = la

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` for chaining."""
        return self

    def haversine(self, coord1, coord2):
        """Great-circle distance in meters between two points.

        Parameters
        ----------
        coord1, coord2 : pair ``(latitude, longitude)`` in degrees
            Each component may be a scalar or a NumPy array; arrays are
            broadcast element-wise against each other.

        Returns
        -------
        float or numpy.ndarray
            Distance(s) in meters, with the broadcast shape of the inputs.
        """
        R = 6372800  # Earth radius in meters
        lat1, lon1 = coord1
        lat2, lon2 = coord2

        phi1, phi2 = np.radians(lat1), np.radians(lat2)
        dphi = np.radians(lat2 - lat1)
        dlambda = np.radians(lon2 - lon1)

        a = np.sin(dphi / 2) ** 2 + \
            np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2) ** 2
        # Rounding can push `a` marginally outside [0, 1] (e.g. for nearly
        # antipodal points), which would make sqrt(1 - a) invalid.
        a = np.clip(a, 0.0, 1.0)

        return 2 * R * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    def transform(self, X):
        """Compute the distance from every row of ``X`` to both reference points.

        Parameters
        ----------
        X : array-like of shape (n_samples, 2)
            Latitude in the first column and longitude in the second, both in
            degrees. A pandas DataFrame or a NumPy array are both accepted.

        Returns
        -------
        numpy.ndarray of shape (n_samples, 2)
            Column 0 is the distance to ``sf``, column 1 the distance to
            ``la``, in meters.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional with exactly two columns.
        """
        # np.asarray accepts DataFrames and arrays alike, so the transformer
        # also works when an earlier pipeline step has already produced an array.
        coords = np.asarray(X, dtype=float)
        if coords.ndim != 2 or coords.shape[1] != 2:
            raise ValueError(
                "CityDistance expects exactly two columns "
                "(latitude, longitude); got an array of shape "
                f"{coords.shape}."
            )

        points = (coords[:, 0], coords[:, 1])
        # One vectorised haversine call per reference point instead of a
        # Python-level loop over rows; an empty input yields shape (0, 2).
        return np.column_stack(
            (self.haversine(self.sf, points), self.haversine(self.la, points))
        )

In [0]:
# Try the transformer on its own, feeding it only the coordinate columns.
transformer = CityDistance()
X_trans = transformer.fit_transform(
    X.loc[:, ['Latitude', 'Longitude']]
)

In [0]:
# Input side of the transformation: the raw coordinates of the first rows.
X.loc[:, ['Latitude', 'Longitude']].head()

Unnamed: 0,Latitude,Longitude
0,37.88,-122.23
1,37.86,-122.22
2,37.85,-122.24
3,37.85,-122.25
4,37.85,-122.25


In [0]:
# Output side: label the two distance columns so they can be read next to
# the coordinates shown above.
pd.DataFrame(data=X_trans, columns=['SF', 'LA']).head()

Unnamed: 0,SF,LA
0,19994.480562,557093.13881
1,19466.096209,554843.351641
2,17387.872199,555174.70528
3,16638.989284,555758.54358
4,16638.989284,555758.54358


## Putting Everything in a Pipeline

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [0]:
# Columns 0-5 are standardised; columns 6-7 (latitude, longitude) are turned
# into city distances, which are passed on unscaled in this first version.
ct = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), slice(0, 6)),
        ('city', CityDistance(), slice(6, 8)),
    ]
)

# Preprocessing followed by an ordinary least-squares fit.
model = Pipeline(
    steps=[
        ('transfomer', ct),
        ('predictor', LinearRegression()),
    ]
)

model.fit(X_train, y_train);

In [0]:
# Score on the training split first, then on the held-out split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(model.score(features, target))

0.5550973930338493
0.5394764492039289


But wait: the features that we created aren't scaled! How can we fix this?

## Improving our model

In [0]:
from sklearn.preprocessing import PolynomialFeatures

In [0]:
# Nest a small pipeline so the city distances are standardised as well.
cds = Pipeline(
    steps=[
        ('city', CityDistance()),
        ('scaler', StandardScaler()),
    ]
)

# Same column layout as before, with the nested pipeline handling columns 6-7.
ct = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), slice(0, 6)),
        ('city', cds, slice(6, 8)),
    ]
)

# Degree-2 polynomial expansion of the preprocessed features, then a linear fit.
model2 = Pipeline(
    steps=[
        ('transfomer', ct),
        ('poly', PolynomialFeatures(degree=2)),
        ('predictor', LinearRegression()),
    ]
)

model2.fit(X_train, y_train);

In [0]:
# Score of the polynomial model on the training split.
# NOTE(review): only the training score is reported here; scoring
# (X_test, y_test) as well would show whether the gain generalises.
train_score = model2.score(X_train, y_train)
print(train_score)

0.6469174985363584
