In [9]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from sklearn.model_selection import train_test_split
from feature_engine.imputation import AddMissingIndicator, MeanMedianImputer, CategoricalImputer
from feature_engine.transformation import LogTransformer
from feature_engine.discretisation import ArbitraryDiscretiser
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder
from feature_engine.datetime import DatetimeFeatures
from utils import ScalerDf
from sklearn.pipeline import Pipeline
import joblib
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Columns kept for modelling: the target ('price') plus its predictors.
final_vars = ['price', 'antique', 'vehicle_make', 'vehicle_line', 'kilometraje', 'location_city', 'location_state']

# Read the raw listings (the first CSV column becomes the index) and derive:
#   year_created - first four characters of '_created' cast to int
#                  (presumably the listing year - confirm the '_created' format)
#   antique      - 'year_created' minus 'years'
# Only the columns listed in final_vars are retained.
data = (
    pd.read_csv('../data/raw/car_raw.csv', index_col=0)
    .assign(year_created=lambda frame: frame['_created'].str[:4].astype(int))
    .assign(antique=lambda frame: frame['year_created'] - frame['years'])
    .loc[:, final_vars]
)

In [13]:
# Hold out 10% of the rows as a test set. 'price' is the target; every other
# column of `data` is a predictor. The fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['price']),
    data['price'],
    random_state=0,
    test_size=0.1,
)

# Display the (rows, columns) shape of each feature partition.
X_train.shape, X_test.shape

((2927, 6), (326, 6))

## Transformation of numerical vars

In [16]:
# Log-transform the numeric predictors listed in log_vars.
# The transformer is fitted and applied on the training features only, in line
# with the other transformers in this notebook (which all fit on X_train),
# so neither the test rows nor the target column take part in this step.
# NOTE(review): this step is exploratory - the pipeline assembled at the end of
# the notebook does not include a log transformation, and the categorical
# section rebuilds transform_data from X_train.
log_vars = ['antique', 'kilometraje']
logtranformer = LogTransformer(variables=log_vars)
logtranformer.fit(X_train)
transform_data = logtranformer.transform(X_train)

### Discretización

In [8]:
# NOTE(review): 'e' and 'f' are not among the columns kept in `data`
# (see final_vars), and the saved output of this cell shows a different
# dataset. This cell looks like a leftover and will presumably fail on a
# fresh Restart & Run All - confirm whether it should be removed or adapted.
skewed_vars = ['e', 'f']

# Same bin edges for every variable: -inf, 0, +inf.
discretizer = ArbitraryDiscretiser(
    binning_dict={name: [-np.inf, 0, np.inf] for name in skewed_vars}
)
discretizer.fit(transform_data)
transform_data = discretizer.transform(transform_data)

# Preview the first rows of the discretised frame.
transform_data.head()

Unnamed: 0,a,b,c,d,e,f,g,h,j,k,...,monto,score,b_na,c_na,d_na,f_na,g_na,l_na,m_na,o_na
135569,4,0.5217,9.791941,1.0,1,1,BR,36,cat_4744ece,0.63661,...,3.214466,93,0,0,0,0,0,0,0,1
78656,2,0.7554,10.686472,1.0,0,1,AR,8,cat_3203c7c,0.633266,...,3.364188,6,1,1,0,0,0,0,0,1
87437,4,0.5437,11.717906,1.0,1,1,AR,46,cat_5b785c6,0.735749,...,3.106826,55,0,0,0,0,0,0,0,1
131674,4,0.7418,9.755215,50.0,1,1,BR,9,cat_a8c10a4,0.529367,...,2.867899,7,0,0,0,0,0,0,0,1
45535,4,0.6463,10.851127,4.0,1,1,AR,22,cat_edae169,0.049212,...,3.383712,32,0,0,0,0,0,0,0,0


# Transformación de variables categóricas

In [21]:
# Categorical variables: every object-dtype column of `data` whose name
# does not contain 'fecha'.
cat_vars = [
    column
    for column, dtype in data.dtypes.items()
    if dtype == 'O' and 'fecha' not in column
]


In [26]:
## Rare-label encoding
# Learn the frequent labels of each categorical variable from the training
# features and return the training frame with the encoder applied.
# (See the feature-engine docs for the exact meaning of tol / n_categories.)
rarelabel = RareLabelEncoder(
    tol=0.001,
    n_categories=1,
    variables=cat_vars,
)
transform_data = rarelabel.fit_transform(X_train)


In [27]:
## Ordinal encoding
# Replace each category with an integer. The target is passed to fit;
# presumably the encoder's default method orders categories using the target -
# confirm against the installed feature-engine version.
ordinal_encoder = OrdinalEncoder(
    variables=cat_vars,
)
transform_data = ordinal_encoder.fit_transform(transform_data, y_train)

## Scale the data

In [29]:
# Fit the project's ScalerDf helper (method='minmax') on the encoded training
# frame, then replace transform_data with its scaled version.
# NOTE(review): ScalerDf is imported from the local `utils` module; presumably
# a min-max scaler that returns a DataFrame - confirm against utils.
scaler = ScalerDf(method='minmax')
scaler.fit(transform_data)
transform_data = scaler.transform(transform_data)

In [30]:
# Inspect which columns are present after the scaling step.
transform_data.columns

Index(['antique', 'vehicle_make', 'vehicle_line', 'kilometraje',
       'location_city', 'location_state'],
      dtype='object')

# Pongamos todo junto

In [31]:
# Chain the categorical steps and the scaler into a single estimator:
# rare-label grouping -> ordinal encoding -> min-max scaling.
# Fresh (unfitted) transformer instances are created here with the same
# settings as in the step-by-step cells.
# NOTE(review): the variable is called fraud_pipeline although the data loaded
# in this notebook is a car-listing dataset - consider renaming everywhere.
pipeline_steps = [
    (
        'rare_label_encoder',
        RareLabelEncoder(tol=0.001, n_categories=1, variables=cat_vars),
    ),
    (
        'ordinal_encoder',
        OrdinalEncoder(variables=cat_vars),
    ),
    (
        'scaler',
        ScalerDf(method='minmax'),
    ),
]
fraud_pipeline = Pipeline(steps=pipeline_steps)

# Fit all steps on the training partition; the target is forwarded to the steps.
fraud_pipeline.fit(X_train, y_train)

In [35]:
# Apply the fitted pipeline to the training features and display the result.
fraud_pipeline.transform(X_train)

Unnamed: 0,antique,vehicle_make,vehicle_line,kilometraje,location_city,location_state
305,0.142857,0.829268,0.883721,0.077349,0.84375,0.25
981,0.028571,0.292683,0.401163,0.008766,0.68750,0.25
1084,1.000000,0.292683,0.087209,0.002678,0.43750,0.25
3100,0.128571,0.707317,0.691860,0.108336,0.68750,0.25
3095,0.142857,0.560976,0.831395,0.212637,0.15625,0.25
...,...,...,...,...,...,...
763,0.071429,0.463415,0.191860,0.067564,0.68750,0.25
835,0.028571,0.926829,0.982558,0.025777,0.90625,0.75
1653,0.157143,0.317073,0.284884,0.122315,0.43750,0.25
2762,0.128571,0.121951,0.063953,0.079213,0.68750,0.25


In [38]:
# Persist the fitted pipeline to disk. joblib.dump returns the list of file
# paths it wrote, so `model` holds those paths rather than the pipeline itself.
model = joblib.dump(
    value=fraud_pipeline,
    filename='../models/feature_engineering_pipeline.joblib',
)

In [39]:
# Show the value returned by joblib.dump (the written file path(s)).
model

['../models/feature_engineering_pipeline.joblib']