# Refactor Taxi Fare Prediction Problem with a Pipeline

Refactor the model you built for the Taxi Fare Prediction Problem using:
- Custom encoders you have to write for distance and time features
- OneHot Encoder to encoder hour and day of week features
- SimpleImputer to fill missing values
- A simple linear regression
- A pipeline to put all together


Then: 
- train this pipeline
- apply the pipeline on test data
- generate predictions and submit these new predictions to Kaggle

## First pipeline

In [1]:
# import the dataset from s3 bucket 
import pandas as pd
url = "s3://wagon-public-datasets/taxi-fare-train.csv"

# Select only 10 000 rows while creating the DataFrame
df = pd.read_csv(url, nrows=10_000)

In [2]:
df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [3]:
def clean_data(df, test=False):
    df = df.dropna(how='any', axis='rows')
    df = df[(df.dropoff_latitude != 0) | (df.dropoff_longitude != 0)]
    df = df[(df.pickup_latitude != 0) | (df.pickup_longitude != 0)]
    df = df[df.fare_amount.between(0, 4000)]
    df = df[df.passenger_count < 8]
    df = df[df.passenger_count >= 0]
    df = df[df["pickup_latitude"].between(40, 42)]
    df = df[df["pickup_longitude"].between(-74.3, -72.9 )]
    df = df[df["dropoff_latitude"].between(40, 42)]
    df = df[df["dropoff_longitude"].between(-74, -72.9)]
    return df

df_cleaned = clean_data(df)
"% data removed", (1 - len(df_cleaned) / len(df)) * 100

('% data removed', 14.670000000000005)

In [4]:
df_cleaned.shape

(8533, 8)

In [5]:
# prepare X and y
X = df.drop(columns=['fare_amount'])
y = df.fare_amount

In [6]:
# Hold out 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

In [7]:
X_train

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2744,2014-12-12 18:19:00.00000045,2014-12-12 18:19:00 UTC,-73.998272,40.740547,-74.005137,40.729625,6
6667,2009-10-01 12:27:26.0000003,2009-10-01 12:27:26 UTC,-73.981468,40.759025,-73.978246,40.748316,3
1338,2013-08-19 12:37:00.00000069,2013-08-19 12:37:00 UTC,-73.963882,40.768085,-73.954878,40.777527,1
2243,2013-12-23 21:51:01.0000002,2013-12-23 21:51:01 UTC,-73.789774,40.643895,-73.928265,40.763598,4
2031,2011-02-06 02:55:48.0000004,2011-02-06 02:55:48 UTC,-73.962517,40.804990,-73.997475,40.743902,1
...,...,...,...,...,...,...,...
774,2012-04-19 19:11:35.0000005,2012-04-19 19:11:35 UTC,-73.863763,40.769938,-74.000320,40.716240,1
9913,2014-06-11 18:08:00.00000061,2014-06-11 18:08:00 UTC,-73.982270,40.768310,-73.983500,40.781980,2
4699,2010-05-24 09:48:00.000000188,2010-05-24 09:48:00 UTC,-73.992110,40.742553,-73.977010,40.738837,1
9936,2013-05-18 16:16:15.0000001,2013-05-18 16:16:15 UTC,-73.957611,40.776553,-73.975976,40.752850,1


### Custom transformers

With the Taxi Fare Prediction Challenge data, using `BaseEstimator` and `TransformerMixin`, implement:

- a transformer that computes haversine distance between pickup and dropoff location
- a custom encoder that extract time features from `pickup_datetime`

In [8]:
import numpy as np

def haversine_vectorized(df, 
         start_lat="pickup_latitude",
         start_lon="pickup_longitude",
         end_lat="dropoff_latitude",
         end_lon="dropoff_longitude"):

    """ 
        Calculate the great circle distance between two points 
        on the earth (specified in decimal degrees).
        Vectorized version of the haversine distance for pandas df
        Computes distance in kms
    """

    lat_1_rad, lon_1_rad = np.radians(df[start_lat].astype(float)), np.radians(df[start_lon].astype(float))
    lat_2_rad, lon_2_rad = np.radians(df[end_lat].astype(float)), np.radians(df[end_lon].astype(float))
    dlon = lon_2_rad - lon_1_rad
    dlat = lat_2_rad - lat_1_rad

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat_1_rad) * np.cos(lat_2_rad) * np.sin(dlon / 2.0) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    return 6371 * c

In [9]:
# Implement the `tarnsform`  method of the DistanceTransformer
from sklearn.base import BaseEstimator,  TransformerMixin 

class DistanceTransformer(BaseEstimator, TransformerMixin):
    """Compute the haversine distance between two GPS points."""

    def __init__(self, 
                 start_lat="pickup_latitude",
                 start_lon="pickup_longitude", 
                 end_lat="dropoff_latitude", 
                 end_lon="dropoff_longitude"):
        self.start_lat = start_lat
        self.start_lon = start_lon
        self.end_lat = end_lat
        self.end_lon = end_lon

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        """Returns a copy of the DataFrame X with only one column: 'distance'"""
        X_r = X.copy()
        X_r["distance"] = haversine_vectorized(X_r)
        
        return X_r[["distance"]]

In [10]:
# test the DistanceTransformer
dist_trans = DistanceTransformer()
distance = dist_trans.fit_transform(X_train, y_train)
distance.head()

Unnamed: 0,distance
2744,1.34518
6667,1.221323
1338,1.295061
2243,17.704613
2031,7.40309


In [11]:
# Implement the `transform` method of the TimeFeaturesEncoder
class TimeFeaturesEncoder(BaseEstimator, TransformerMixin):
    """Extract the day of week (dow), the hour, the month and the year from a time column."""

    def __init__(self, time_column, time_zone_name='America/New_York'):
        self.time_column = time_column
        self.time_zone_name = time_zone_name
        
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        """Returns a copy of the DataFrame X with only four columns: 'dow', 'hour', 'month', 'year'"""
        df = X.copy()
        timezone_name = 'America/New_York'
        time_column = "pickup_datetime"
        df.index = pd.to_datetime(df[time_column])
        df.index = df.index.tz_convert(timezone_name)
        df["dow"] = df.index.weekday
        df["hour"] = df.index.hour
        df["month"] = df.index.month
        df["year"] = df.index.year
        return df[['dow','hour', 'month', 'year' ]].reset_index(drop=True)

In [12]:
# test the TimeFeaturesEncoder
time_enc = TimeFeaturesEncoder('pickup_datetime')
time_features = time_enc.fit_transform(X_train, y_train)
time_features.head()

Unnamed: 0,dow,hour,month,year
0,4,13,12,2014
1,3,8,10,2009
2,0,8,8,2013
3,0,16,12,2013
4,5,21,2,2011


###  Prepocessing pipeline

In [13]:
# visualizing pipelines in HTML
from sklearn import set_config; set_config(display='diagram')

#### Distance pipeline

Create a pipeline for distances:
- convert pickup and dropoff coordinates into distances with the DistanceTransformer
- standardize these distances

In [14]:
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

In [15]:
# create distance pipeline
pipe_distance = make_pipeline(DistanceTransformer(), RobustScaler())

# display distance pipeline
pipe_distance

#### Time features pipeline

Create a pipeline for time features
- extract time features from pickup datetime with the TimeFeaturesEncoder
- encode these categorical time features with the OneHotEncoder

In [16]:
# create time pipeline
pipe_time = make_pipeline(TimeFeaturesEncoder(time_column='pickup_datetime'), OneHotEncoder(sparse=False, handle_unknown='ignore'))

# display time pipeline

pipe_time

#### Preprocessing pipeline

Wrap up the distance pipeline and the time pipeline into a preprocesssing pipeline.

In [17]:
# create preprocessing pipeline
dist_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
time_cols = ['pickup_datetime']


preproc = ColumnTransformer([('time', pipe_time, time_cols),
                                  ('distance', pipe_distance, dist_cols)])

# display preprocessing pipeline
preproc

### Model pipeline

Create a pipeline containing the preprocessing and the regression model of your choice.

In [18]:
# Add the model of your choice to the pipeline
pipe = Pipeline([('preproc', preproc),
                ('regressor', RandomForestRegressor())])
# display the pipeline with model
pipe

<details>
    <summary>
       💡 Hint
    </summary>
The pipeline should look like
<img src='img/pipeline.png'>
</details>

### Training and performance

Train the pipelined model and compute prediction on the test set:

In [19]:
# Train the pipelined model
pipe.fit(X_train, y_train)

# compute y_pred on the test set
y_pred = pipe.predict(X_test)

Use the RMSE to evaluate the model's performance:

In [20]:
def compute_rmse(y_pred, y_true):
    return np.sqrt(((y_pred - y_true)**2).mean())

In [21]:
# call compute_rmse
compute_rmse(y_pred, y_test)

5.518177945875526

## Complete workflow with a pipeline

Here we will implement the whole workflow for our Taxifare kaggle challenge.  

For that we will refactor code in functions for more clarity.  

Implement following functions:  
- `get_data()` to fetch data from local path
- `clean_data()` to clean data
- `get_pipeline()` to get the pipeline defined earlier
- `train()` to train our model
- `evaluate()` to evaluate our model on test data

In [22]:
# implement get_data() function
def get_data(nrows=10000):
    '''returns a DataFrame with nrows from s3 bucket'''
    url = "s3://wagon-public-datasets/taxi-fare-train.csv"
    df = pd.read_csv(url, nrows=10000)
    
    return df

In [23]:
#implement clean_data() function
def clean_data(df, test=False):
    '''returns a DataFrame without outliers and missing values'''
    df = df.dropna(how='any', axis='rows')
    df = df[(df.dropoff_latitude != 0) | (df.dropoff_longitude != 0)]
    df = df[(df.pickup_latitude != 0) | (df.pickup_longitude != 0)]
    df = df[df.fare_amount.between(0, 4000)]
    df = df[df.passenger_count < 8]
    df = df[df.passenger_count >= 0]
    df = df[df["pickup_latitude"].between(40, 42)]
    df = df[df["pickup_longitude"].between(-74.3, -72.9 )]
    df = df[df["dropoff_latitude"].between(40, 42)]
    df = df[df["dropoff_longitude"].between(-74, -72.9)]
    return df

In [24]:
# implement set_pipeline() function
def set_pipeline():
    '''returns a pipelined model'''
    
    # create pipeline
    pipe_distance = make_pipeline(DistanceTransformer(), RobustScaler())
    pipe_time = make_pipeline(TimeFeaturesEncoder(time_column='pickup_datetime'), OneHotEncoder(sparse=False, handle_unknown='ignore'))

    
    dist_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
    time_cols = ['pickup_datetime']

    preproc = ColumnTransformer([('time', pipe_time, time_cols),
                                      ('distance', pipe_distance, dist_cols)])

    pipe = Pipeline([('preproc', preproc),
                ('regressor', RandomForestRegressor())])
    
    return pipe

In [25]:
#implement train() function
def train(X_train, y_train, pipeline):
    '''returns a trained pipelined model'''
    
    return pipe.fit(X_train, y_train)

In [29]:
#implement evaluate() function
def evaluate(X_test, y_test, pipeline):
    '''prints and returns the value of the RMSE'''
    
    y_pred = pipe.predict(X_test)
    return np.sqrt(((y_pred - y_test)**2).mean())

### Test the complete worflow

Use the above functions to test the complete workflow.

In [30]:
# store the data in a DataFrame
df = get_data(nrows=10_000)

# set X and y
df = clean_data(df)
X = df.drop(columns=['fare_amount'])
y = df['fare_amount']

# hold out
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# build pipeline
pipe = set_pipeline()

# train the pipeline
pipe_train = train(X_train, y_train, pipe)

# evaluate the pipeline
score = evaluate(X_test, y_test, pipe_train)

In [31]:
score

4.4798186497635735

### Congrats!

Now we are ready to convert this complete workflow into a packaged code 🚀