In [1]:
import joblib
import pandas as pd
from feature_engine.selection import ProbeFeatureSelection
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the raw challenge dataset from disk, report its dimensions, and preview
# the first rows.
dataset_csv = '../data/MercadoLibre Data Scientist Technical Challenge - Dataset.csv'
data = pd.read_csv(dataset_csv)
print(data.shape)
data.head()

(150000, 19)


Unnamed: 0,a,b,c,d,e,f,g,h,j,k,l,m,n,o,p,fecha,monto,score,fraude
0,4,0.6812,50084.12,50.0,0.0,20.0,AR,1,cat_d26ab52,0.365475,2479.0,952.0,1,,Y,2020-03-20 09:28:19,57.63,100,0
1,4,0.6694,66005.49,0.0,0.0,2.0,AR,1,cat_ea962fb,0.612728,2603.0,105.0,1,Y,Y,2020-03-09 13:58:28,40.19,25,0
2,4,0.4718,7059.05,4.0,0.463488,92.0,BR,25,cat_4c2544e,0.651835,2153.0,249.0,1,Y,Y,2020-04-08 12:25:55,5.77,23,0
3,4,0.726,10043.1,24.0,0.046845,43.0,BR,43,cat_1b59ee3,0.692728,4845.0,141.0,1,N,Y,2020-03-14 11:46:13,40.89,23,0
4,4,0.7758,16584.42,2.0,0.154616,54.0,BR,0,cat_9bacaa5,0.201354,2856.0,18.0,1,Y,N,2020-03-23 14:17:13,18.98,71,0


In [3]:
# Separate the predictors from the `fraude` target, then hold out 10% of the
# rows as a test set. The fixed seed keeps the split repeatable across runs.
predictors = data.drop(columns=['fraude'])
target = data['fraude']

X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.1,
    random_state=0,
)

X_train.shape, X_test.shape

((135000, 18), (15000, 18))

In [4]:
# Load the persisted feature-engineering pipeline artifact from the models folder.
pipeline_artifact = '../models/feature_engineering_pipeline.joblib'
fraud_pipeline = joblib.load(pipeline_artifact)

In [5]:
# Display the loaded pipeline object so its contents can be inspected.
fraud_pipeline

In [6]:
# Apply the loaded pipeline to the training predictors. Only transform() is
# called here, so the pipeline is not (re)fitted in this notebook.
# NOTE(review): presumably the pipeline was fitted on the same training split
# elsewhere -- confirm against the notebook that produced the artifact.
X_train_transformed = fraud_pipeline.transform(X_train)

In [7]:
# Display the transformed training frame (the notebook truncates rows/columns).
X_train_transformed

Unnamed: 0,a,b,c,d,e,f,g,h,j,k,...,fecha_month_end,fecha_quarter_start,fecha_quarter_end,fecha_year_start,fecha_year_end,fecha_leap_year,fecha_days_in_month,fecha_hour,fecha_minute,fecha_second
135569,1.000000,0.5217,0.635969,0.02,1.0,1.0,0.714286,0.620690,0.458599,0.636612,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.391304,0.525424,0.881356
78656,0.333333,0.7554,0.684908,0.02,0.0,1.0,0.428571,0.137931,0.133758,0.633268,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.347826,0.254237,0.288136
87437,1.000000,0.5437,0.741337,0.02,1.0,1.0,0.428571,0.793103,0.458599,0.735751,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.391304,0.050847,0.338983
131674,1.000000,0.7418,0.633959,1.00,1.0,1.0,0.714286,0.155172,0.458599,0.529368,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.782609,0.915254,0.101695
45535,1.000000,0.6463,0.693916,0.08,1.0,1.0,0.428571,0.379310,0.458599,0.049208,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.913043,0.406780,0.508475
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41993,1.000000,0.8063,0.831573,0.06,1.0,0.0,0.714286,0.155172,0.312102,0.164571,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.826087,0.067797,0.762712
97639,1.000000,0.5046,0.618473,0.04,0.0,1.0,0.428571,0.155172,0.458599,0.288001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.826087,0.169492,0.186441
95939,1.000000,0.7233,0.686591,0.02,0.0,0.0,0.714286,0.034483,0.866242,0.585850,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.869565,0.372881,0.847458
117952,1.000000,0.7824,0.710351,0.96,1.0,1.0,0.714286,0.086207,0.458599,0.007728,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.406780,0.779661


In [8]:
# Configure the probe-based feature selector.
#
# The selector's own `random_state` only makes the generated probe features
# repeatable. The random forest used to score importances is stochastic too,
# so it is seeded with the same value; otherwise the set of selected features
# (which is later persisted to disk) could change from one run to the next.
SELECTION_SEED = 150

sel = ProbeFeatureSelection(
    estimator=RandomForestClassifier(random_state=SELECTION_SEED),
    scoring="roc_auc",
    n_probes=3,
    distribution="all",
    cv=3,
    random_state=SELECTION_SEED,
)

In [9]:
# Fit the selector on the transformed training data and its labels, then
# reduce the training frame to the columns the selector retains.
sel.fit(X_train_transformed, y_train)
X_tr = sel.transform(X_train_transformed)

In [10]:
# Report the frame dimensions before and after feature selection.
shape_before, shape_after = X_train_transformed.shape, X_tr.shape
print(shape_before, shape_after)


(135000, 45) (135000, 13)


In [11]:
# Column labels of the reduced frame, i.e. the features kept by the selector.
selected_features = X_tr.columns

In [12]:
# Persist the names of the selected features as a single-column CSV
# (no index column) for later use.
selected_features_csv = '../data/processed/selected_features.csv'
selected_features_series = pd.Series(selected_features)
selected_features_series.to_csv(selected_features_csv, index=False)