In [1]:
%load_ext autoreload
%autoreload 2
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from feature_engine.datetime import DatetimeFeatures
from feature_engine.discretisation import ArbitraryDiscretiser
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder
from feature_engine.imputation import AddMissingIndicator, MeanMedianImputer, CategoricalImputer
from feature_engine.transformation import LogTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from utils import ScalerDf

In [2]:
# Load the raw challenge dataset; the path is relative to the notebook's folder.
data = pd.read_csv(
    filepath_or_buffer='../data/MercadoLibre Data Scientist Technical Challenge - Dataset.csv'
)
# Report (rows, columns) on stdout, then preview the first five rows as the cell output.
print(data.shape)
data.head(n=5)

(150000, 19)


Unnamed: 0,a,b,c,d,e,f,g,h,j,k,l,m,n,o,p,fecha,monto,score,fraude
0,4,0.6812,50084.12,50.0,0.0,20.0,AR,1,cat_d26ab52,0.365475,2479.0,952.0,1,,Y,2020-03-20 09:28:19,57.63,100,0
1,4,0.6694,66005.49,0.0,0.0,2.0,AR,1,cat_ea962fb,0.612728,2603.0,105.0,1,Y,Y,2020-03-09 13:58:28,40.19,25,0
2,4,0.4718,7059.05,4.0,0.463488,92.0,BR,25,cat_4c2544e,0.651835,2153.0,249.0,1,Y,Y,2020-04-08 12:25:55,5.77,23,0
3,4,0.726,10043.1,24.0,0.046845,43.0,BR,43,cat_1b59ee3,0.692728,4845.0,141.0,1,N,Y,2020-03-14 11:46:13,40.89,23,0
4,4,0.7758,16584.42,2.0,0.154616,54.0,BR,0,cat_9bacaa5,0.201354,2856.0,18.0,1,Y,N,2020-03-23 14:17:13,18.98,71,0


In [3]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['fraude']),  # predictors: every column except the target
    data['fraude'],                 # target
    test_size=0.1,
    random_state=0,
)

# Show the shapes of both predictor frames as the cell output.
(X_train.shape, X_test.shape)

((135000, 18), (15000, 18))

## Missing indicator

In [4]:
# Columns of the full dataset that hold at least one missing value.
vars_with_na = data.columns[data.isnull().any()].tolist()

# Fit on the training predictors and append one "<column>_na" flag per listed column.
indicator = AddMissingIndicator(variables=vars_with_na)
transform_data = indicator.fit_transform(X_train)

## Imputation on numerical vars

In [5]:
# Numerical predictors: non-object columns, skipping any whose name contains 'fraude'.
num_vars = [
    var
    for var in data.columns
    if not (data[var].dtypes == 'O' or 'fraude' in var)
]
# Of those, the ones that also have missing values.
num_vars_na = [
    var
    for var in num_vars
    if var in vars_with_na
]

# Replace missing numerical values with the median learned from the current training frame.
imputer = MeanMedianImputer(
    imputation_method='median',
    variables=num_vars_na,
)
transform_data = imputer.fit_transform(transform_data)

## Transformation of numerical vars

In [6]:
# Numerical columns that receive a logarithmic transformation.
log_vars = [
    'c',
    'monto',
]

In [7]:
# Log-transform the columns listed in log_vars.
# transform_data is rebound from itself, so re-running only this cell applies the log a second time.
logtranformer = LogTransformer(variables=log_vars)
transform_data = logtranformer.fit_transform(transform_data)

### Discretización

In [8]:
# Variables to split into two intervals around zero.
skewed_vars = ['e', 'f']

# Every listed variable gets the same bin edges: -inf, 0, +inf.
discretizer = ArbitraryDiscretiser(
    binning_dict={var: [-np.inf, 0, np.inf] for var in skewed_vars},
)
transform_data = discretizer.fit_transform(transform_data)

# Preview the frame after discretisation.
transform_data.head()

Unnamed: 0,a,b,c,d,e,f,g,h,j,k,...,monto,score,b_na,c_na,d_na,f_na,g_na,l_na,m_na,o_na
135569,4,0.5217,9.791941,1.0,1,1,BR,36,cat_4744ece,0.63661,...,3.214466,93,0,0,0,0,0,0,0,1
78656,2,0.7554,10.686472,1.0,0,1,AR,8,cat_3203c7c,0.633266,...,3.364188,6,1,1,0,0,0,0,0,1
87437,4,0.5437,11.717906,1.0,1,1,AR,46,cat_5b785c6,0.735749,...,3.106826,55,0,0,0,0,0,0,0,1
131674,4,0.7418,9.755215,50.0,1,1,BR,9,cat_a8c10a4,0.529367,...,2.867899,7,0,0,0,0,0,0,0,1
45535,4,0.6463,10.851127,4.0,1,1,AR,22,cat_edae169,0.049212,...,3.383712,32,0,0,0,0,0,0,0,0


# Transformación de variables categóricas

In [9]:
# Categorical predictors: object-typed columns, leaving out any whose name contains 'fecha'.
cat_vars = [
    var
    for var in data.columns
    if 'fecha' not in var and data[var].dtypes == 'O'
]
# Of those, the ones that also have missing values.
cat_vars_na = [
    var
    for var in cat_vars
    if var in vars_with_na
]

# Fill missing categorical values with the explicit label 'missing'.
categorical_imputer = CategoricalImputer(
    variables=cat_vars_na,
    imputation_method='missing',
    fill_value='missing',
)
transform_data = categorical_imputer.fit_transform(transform_data)

In [10]:
# Group infrequent labels of every categorical variable, using a 0.001 frequency tolerance.
rarelabel = RareLabelEncoder(
    variables=cat_vars,
    tol=0.001,
    n_categories=1,
)
transform_data = rarelabel.fit_transform(transform_data)


In [11]:
# Map each categorical label to an integer; the training target is supplied when fitting.
ordinal_encoder = OrdinalEncoder(variables=cat_vars)
transform_data = ordinal_encoder.fit_transform(transform_data, y_train)

## Datetime Features

In [12]:
# Expand the 'fecha' timestamp into every date/time feature the transformer offers
# (month, quarter, day of week, hour, ...).
dt_features = DatetimeFeatures(
    variables='fecha',
    features_to_extract='all',
)
transform_data = dt_features.fit_transform(transform_data)

## Scale data

In [13]:
# ScalerDf is a project helper imported from utils, configured here with method='minmax'.
# NOTE(review): presumably it applies min-max scaling and returns a DataFrame; confirm in utils.
scaler = ScalerDf(method='minmax')
# Fit on the fully engineered training frame, then replace that frame with its scaled version.
scaler.fit(transform_data)
transform_data = scaler.transform(transform_data)

In [14]:
# List the final column set produced by the step-by-step transformations.
transform_data.columns

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o',
       'p', 'monto', 'score', 'b_na', 'c_na', 'd_na', 'f_na', 'g_na', 'l_na',
       'm_na', 'o_na', 'fecha_month', 'fecha_quarter', 'fecha_semester',
       'fecha_year', 'fecha_week', 'fecha_day_of_week', 'fecha_day_of_month',
       'fecha_day_of_year', 'fecha_weekend', 'fecha_month_start',
       'fecha_month_end', 'fecha_quarter_start', 'fecha_quarter_end',
       'fecha_year_start', 'fecha_year_end', 'fecha_leap_year',
       'fecha_days_in_month', 'fecha_hour', 'fecha_minute', 'fecha_second'],
      dtype='object')

# Pongamos todo junto

In [15]:
# The transformers from the step-by-step cells, declared as (name, transformer) pairs.
# Here the categorical imputer runs before the log transform, unlike in the exploratory cells.
pipeline_steps = [
    (
        'missing_indicator',
        AddMissingIndicator(variables=vars_with_na),
    ),
    (
        'numerical_imputer',
        MeanMedianImputer(imputation_method='median', variables=num_vars_na),
    ),
    (
        'categorical_imputer',
        CategoricalImputer(
            variables=cat_vars_na,
            imputation_method='missing',
            fill_value='missing',
        ),
    ),
    (
        'numerical_transformation',
        LogTransformer(variables=log_vars),
    ),
    (
        'binarizer',
        ArbitraryDiscretiser(
            binning_dict={
                'e': [-np.inf, 0, np.inf],
                'f': [-np.inf, 0, np.inf],
            },
        ),
    ),
    (
        'rare_label_encoder',
        RareLabelEncoder(variables=cat_vars, tol=0.001, n_categories=1),
    ),
    (
        'ordinal_encoder',
        OrdinalEncoder(variables=cat_vars),
    ),
    (
        'datetime_features',
        DatetimeFeatures(variables='fecha', features_to_extract='all'),
    ),
    (
        'scaler',
        ScalerDf(method='minmax'),
    ),
]

In [16]:
# Chain the declared steps into a single estimator.
fraud_pipeline = Pipeline(steps=pipeline_steps)

In [17]:
# Display the (still unfitted) pipeline to check its steps.
fraud_pipeline

In [18]:
# Fit every step on the training split; the target is passed along for the steps that use it.
fraud_pipeline.fit(X_train, y=y_train)

In [19]:
# Run the fitted pipeline on the training predictors and show the result as the cell output.
fraud_pipeline.transform(X_train)

Unnamed: 0,a,b,c,d,e,f,g,h,j,k,...,fecha_month_end,fecha_quarter_start,fecha_quarter_end,fecha_year_start,fecha_year_end,fecha_leap_year,fecha_days_in_month,fecha_hour,fecha_minute,fecha_second
135569,1.000000,0.5217,0.635969,0.02,1.0,1.0,0.714286,0.620690,0.458599,0.636612,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.391304,0.525424,0.881356
78656,0.333333,0.7554,0.684908,0.02,0.0,1.0,0.428571,0.137931,0.133758,0.633268,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.347826,0.254237,0.288136
87437,1.000000,0.5437,0.741337,0.02,1.0,1.0,0.428571,0.793103,0.458599,0.735751,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.391304,0.050847,0.338983
131674,1.000000,0.7418,0.633959,1.00,1.0,1.0,0.714286,0.155172,0.458599,0.529368,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.782609,0.915254,0.101695
45535,1.000000,0.6463,0.693916,0.08,1.0,1.0,0.428571,0.379310,0.458599,0.049208,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.913043,0.406780,0.508475
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41993,1.000000,0.8063,0.831573,0.06,1.0,0.0,0.714286,0.155172,0.312102,0.164571,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.826087,0.067797,0.762712
97639,1.000000,0.5046,0.618473,0.04,0.0,1.0,0.428571,0.155172,0.458599,0.288001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.826087,0.169492,0.186441
95939,1.000000,0.7233,0.686591,0.02,0.0,0.0,0.714286,0.034483,0.866242,0.585850,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.869565,0.372881,0.847458
117952,1.000000,0.7824,0.710351,0.96,1.0,1.0,0.714286,0.086207,0.458599,0.007728,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.406780,0.779661


In [20]:
# Persist the fitted feature-engineering pipeline so it can be reloaded without refitting.
pipeline_path = '../models/feature_engineering_pipeline.joblib'
# Create the target folder first: without an existing '../models' directory
# (e.g. on a fresh checkout) the dump would fail with FileNotFoundError.
Path(pipeline_path).parent.mkdir(parents=True, exist_ok=True)
# The original string path is passed through so the returned file list is unchanged.
joblib.dump(fraud_pipeline, pipeline_path)

['../models/feature_engineering_pipeline.joblib']