# Imports

In [1]:
import pandas as pd
import numpy as np
import yaml
from feature_engine.encoding import OneHotEncoder, OrdinalEncoder
import pickle

# Never truncate columns when a frame is displayed; the frames in this
# notebook are wide and every column needs to be visible in the previews.
pd.options.display.max_columns = None


In [2]:
# Read the processed Parquet file into a DataFrame; every later cell
# (feature typing, encoder fitting, previews) works on this frame.
df = pd.read_parquet('../data/processed/full_valid.parquet.gzip')

In [3]:
# Preview the first rows; all columns are shown because
# display.max_columns was set to None above.
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,year_month_day,daily_preciptation_normal_inches,max_temperature_normal_f,min_temperature_normal_f,avg_temperature_normal_f,BIKE_LANE,BOROCODE,ST_WIDTH,TRAFDIR,RW_TYPE,SNOW_PRI,PRE_DIRECT,PRE_TYPE,POST_TYPE,BIKE_TRAFD,SEGMENT_TY,day_of_week,hour_of_day,period_of_day,pickup_cluster,dropoff_cluster
0,id1718206,1,2016-01-05 10:07:14,1,-73.990273,40.766647,-73.992729,40.748363,N,20160105,0.12,40,29,35,NULL_VALUE,1,44.0,FT,2,C,NULL_VALUE,NULL_VALUE,AVE,NULL_VALUE,R,1,10,morning,0,0
1,id1748564,2,2016-01-07 14:38:51,1,-73.976311,40.759819,-73.957603,40.779598,N,20160107,0.12,40,29,34,1,1,60.0,FT,1,C,NULL_VALUE,AVE,NULL_VALUE,FT,U,3,14,afternoon,0,3
2,id3792886,1,2016-05-29 22:11:39,2,-74.00676,40.749096,-73.972206,40.757252,N,20160529,0.14,75,59,67,NULL_VALUE,1,54.0,FT,2,C,NULL_VALUE,NULL_VALUE,AVE,NULL_VALUE,R,6,22,evening,0,0
3,id2541269,2,2016-03-01 12:33:26,5,-73.982689,40.782124,-73.95649,40.781673,N,20160301,0.13,45,32,39,NULL_VALUE,1,32.0,FT,2,C,NULL_VALUE,NULL_VALUE,PKWY,NULL_VALUE,R,1,12,afternoon,0,3
4,id3152266,2,2016-04-14 18:52:23,1,-73.802071,40.663708,-73.982391,40.766552,N,20160414,0.13,61,45,53,NULL_VALUE,4,36.0,FT,2,C,NULL_VALUE,NULL_VALUE,EXPY,NULL_VALUE,U,3,18,evening,1,0


# Features

In [4]:
# Column names listed under the 'hard_remove' key of the feature config are
# excluded from feature typing and dropped from the ordinal-encoded preview.
# The context manager closes the file handle as soon as parsing is done
# instead of leaving it open until garbage collection.
with open('../src/data/config/features.yml', 'r') as config_file:
    features_to_remove = yaml.safe_load(config_file)['hard_remove']

In [18]:
def get_features_type(X, n_categories, features_to_remove):
    """Split the columns of ``X`` into categorical and numerical feature lists.

    A column is classified as categorical when its dtype is object, a pandas
    string dtype or a pandas categorical dtype AND it has at most
    ``n_categories`` distinct values (``Series.nunique`` ignores missing
    values by default). Every other retained column is placed in
    ``'numerical'``; note that this includes high-cardinality text columns,
    not only columns with a true numeric dtype.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are to be classified.
    n_categories : int
        Largest number of distinct values a column may have and still be
        treated as categorical.
    features_to_remove : iterable of str or None
        Column names to leave out of both lists. ``None`` means "remove
        nothing".

    Returns
    -------
    dict
        ``{'categorical': [...], 'numerical': [...]}`` with column names in
        the same order as ``X.columns``.
    """
    excluded = set(features_to_remove) if features_to_remove is not None else set()
    features = {'categorical': [], 'numerical': []}

    # Walk X.columns instead of a set difference: iteration order of a set of
    # strings can change between interpreter runs, which would make the
    # encoder variable order (and one-hot column order) non-reproducible.
    for column in X.columns:
        if column in excluded:
            continue

        dtype = X[column].dtype
        # Explicit dtype checks instead of comparing against dtype-name
        # strings: the pandas categorical dtype is named 'category', so the
        # former 'categorical' string never matched it.
        is_categorical_dtype = (
            pd.api.types.is_object_dtype(dtype)
            or isinstance(dtype, (pd.StringDtype, pd.CategoricalDtype))
        )

        # nunique() is only evaluated for columns that pass the dtype check.
        if is_categorical_dtype and X[column].nunique() <= n_categories:
            features['categorical'].append(column)
        else:
            features['numerical'].append(column)

    return features

In [19]:
# Cardinality cut-off passed to get_features_type: a column that passes its
# dtype check is classified as categorical only if it has at most this many
# distinct values.
MAX_CATEGORIES = 15

features = get_features_type(df, MAX_CATEGORIES, features_to_remove)

# Ordinal Encoder

In [20]:
# Ordinal encoder restricted to the columns classified as categorical.
# NOTE(review): per the feature_engine docs, 'arbitrary' assigns integer codes
# without using a target and ignore_format=True lets the encoder accept
# variables regardless of dtype -- confirm against the installed version.
ord_encoder = OrdinalEncoder(
    encoding_method='arbitrary',
    missing_values='ignore',
    ignore_format=True,
    variables=features['categorical'],
)

In [21]:
# Fit on the whole frame (not just the preview rows) so the encoder sees
# every category present in df before anything is transformed.
ord_encoder.fit(df)

In [28]:
# Preview the ordinal encoding on the first rows, with the hard-removed
# columns dropped from the displayed result.
(
    ord_encoder
    .transform(df.head())
    .drop(columns=features_to_remove)
)

Unnamed: 0,vendor_id,passenger_count,store_and_fwd_flag,daily_preciptation_normal_inches,max_temperature_normal_f,min_temperature_normal_f,avg_temperature_normal_f,BIKE_LANE,BOROCODE,ST_WIDTH,TRAFDIR,RW_TYPE,SNOW_PRI,PRE_DIRECT,PRE_TYPE,POST_TYPE,BIKE_TRAFD,SEGMENT_TY,day_of_week,hour_of_day,period_of_day,pickup_cluster,dropoff_cluster
0,1,1,0,0.12,40,29,35,0,0,44.0,0,0,0,0,0,0,0,0,1,10,0,0,0
1,2,1,0,0.12,40,29,34,1,0,60.0,0,1,0,0,1,1,1,1,3,14,1,0,3
2,1,2,0,0.14,75,59,67,0,0,54.0,0,0,0,0,0,0,0,0,6,22,2,0,0
3,2,5,0,0.13,45,32,39,0,0,32.0,0,0,0,0,0,2,0,0,1,12,1,0,3
4,2,1,0,0.13,61,45,53,0,1,36.0,0,0,0,0,0,3,0,1,3,18,2,1,0


# One Hot Encoder

In [23]:
# One-hot encoder restricted to the columns classified as categorical.
# NOTE(review): per the feature_engine docs, drop_last=True emits k-1 dummy
# columns per variable and drop_last_binary=True emits a single dummy for
# two-category variables -- confirm against the installed version.
one_encoder = OneHotEncoder(
    drop_last=True,
    drop_last_binary=True,
    ignore_format=True,
    variables=features['categorical'],
)

In [24]:
# Fit on the whole frame (not just the preview rows) so the encoder sees
# every category present in df before anything is transformed.
one_encoder.fit(df)

In [27]:
# Preview the one-hot encoding on the first rows.
# NOTE(review): unlike the ordinal preview, the columns in features_to_remove
# are not dropped here, so id, pickup_datetime, coordinates, etc. still appear
# in the output -- confirm whether that difference is intentional.
one_encoder.transform(df.head())

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,year_month_day,daily_preciptation_normal_inches,max_temperature_normal_f,min_temperature_normal_f,avg_temperature_normal_f,ST_WIDTH,day_of_week,hour_of_day,pickup_cluster,dropoff_cluster,store_and_fwd_flag_N,PRE_TYPE_NULL_VALUE,PRE_DIRECT_NULL_VALUE,PRE_DIRECT_W,PRE_DIRECT_N,PRE_DIRECT_E,POST_TYPE_AVE,POST_TYPE_NULL_VALUE,POST_TYPE_PKWY,POST_TYPE_EXPY,POST_TYPE_ST,POST_TYPE_DR,POST_TYPE_PL,POST_TYPE_TER,POST_TYPE_RD,POST_TYPE_BLVD,POST_TYPE_CONC,BIKE_TRAFD_NULL_VALUE,BIKE_TRAFD_FT,BIKE_TRAFD_TF,period_of_day_morning,period_of_day_afternoon,period_of_day_evening,TRAFDIR_FT,TRAFDIR_TF,TRAFDIR_TW,TRAFDIR_NULL_VALUE,SNOW_PRI_C,SNOW_PRI_S,SNOW_PRI_H,SNOW_PRI_V,SEGMENT_TY_R,SEGMENT_TY_U,SEGMENT_TY_T,SEGMENT_TY_E,RW_TYPE_2,RW_TYPE_1,RW_TYPE_3,RW_TYPE_9,RW_TYPE_NULL_VALUE,BOROCODE_1,BOROCODE_4,BOROCODE_3,BOROCODE_2,BIKE_LANE_NULL_VALUE,BIKE_LANE_1,BIKE_LANE_3,BIKE_LANE_2
0,id1718206,1,2016-01-05 10:07:14,1,-73.990273,40.766647,-73.992729,40.748363,20160105,0.12,40,29,35,44.0,1,10,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
1,id1748564,2,2016-01-07 14:38:51,1,-73.976311,40.759819,-73.957603,40.779598,20160107,0.12,40,29,34,60.0,3,14,0,3,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0
2,id3792886,1,2016-05-29 22:11:39,2,-74.00676,40.749096,-73.972206,40.757252,20160529,0.14,75,59,67,54.0,6,22,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
3,id2541269,2,2016-03-01 12:33:26,5,-73.982689,40.782124,-73.95649,40.781673,20160301,0.13,45,32,39,32.0,1,12,0,3,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
4,id3152266,2,2016-04-14 18:52:23,1,-73.802071,40.663708,-73.982391,40.766552,20160414,0.13,61,45,53,36.0,3,18,1,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0
