In [101]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.impute import SimpleImputer
from feature_engine.outliers import Winsorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from data import final_features, winsorize_variables
# Let pandas display up to 50 columns before truncating wide DataFrames
pd.set_option('display.max_columns',50)

### Preprocess
In this section, the created features are preprocessed in order to remove NAs, treat outliers, and select the most important features.

In [102]:
def get_preprocessor(
    X: pd.DataFrame,
    y: pd.Series,
    winsorize_variables,
    final_features,
    n_estimators: int=100,
    max_features: int=None,
    random_state: int=102
):
    """Fit the cleaning steps and a feature selector, and bundle them in one pipeline.

    The cleaning stage imputes NaN with 0 on ``final_features`` and then
    winsorizes the right tail of ``winsorize_variables`` (IQR method, fold 3).
    A random-forest based ``SelectFromModel`` is then fitted on the cleaned
    data to choose which columns to keep.

    Parameters
    ----------
    X : pd.DataFrame
        Features used to fit the cleaning steps and the selector.
    y : pd.Series
        Target used to fit the random forest behind the selector.
    winsorize_variables
        Variables handed to the ``Winsorizer`` step.
    final_features
        Variables handed to the zero-imputation step.
    n_estimators : int
        Number of trees in the selector's random forest.
    max_features : int or None
        Passed to ``SelectFromModel(max_features=...)``.
    random_state : int
        Seed for the selector's random forest.

    Returns
    -------
    tuple
        ``(preprocessor, column_names)``: a ``Pipeline`` assembled from the
        already-fitted cleaning pipeline and selector (it is not refitted
        here), and the list of feature names reported by the selector.
    """
    # Cleaning stage: zero imputation followed by right-tail winsorization
    zero_imputer = SklearnTransformerWrapper(
        SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0),
        variables=final_features,
    )
    max_winsorizer = Winsorizer(
        variables=winsorize_variables,
        capping_method='iqr',
        tail='right',
        fold=3,
    )
    initial_pipeline = Pipeline(
        [('zero_imputer', zero_imputer), ('max_winsorizer', max_winsorizer)],
        verbose=True,
    )

    # Fit on X, then transform the same X to obtain the selector's training data
    X_preprocessed = initial_pipeline.fit(X).transform(X)

    # Feature selector driven by random-forest importances
    forest = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )
    feature_selector = SelectFromModel(
        forest, max_features=max_features, threshold=0
    )
    feature_selector.fit(X_preprocessed, y)
    column_names = feature_selector.get_feature_names_out().tolist()

    # Final pipeline: fitted cleaning stage followed by the fitted selector
    preprocessor = Pipeline(
        [('preprocessor', initial_pipeline), ('selector', feature_selector)],
        verbose=True,
    )

    return preprocessor, column_names


###### Define some parameters to preprocess data

In [103]:
# Load the input table from CSV
data = pd.read_csv('data/features_preprocessed.csv')
# True -> chronological (out-of-time) split; False -> random split via train_test_split
time_split = True
# Fraction of rows held out for the test set
test_size = 0.2
# Seed shared by the random split and the feature-selection random forest
random_state = 102
# Name of the label column
target = 'target'

###### I will implement an out-of-time train/test split, which is more suitable for this type of problem. In this approach, the model is trained on data from an earlier time period and then evaluated on more recent data it has not seen. An out-of-time partitioning strategy is therefore a more effective way to handle time-dependent scenarios.

In [104]:
if time_split:
    # Out-of-time split: order rows chronologically, then cut once so the
    # earliest rows form the train set and the latest rows form the test set.
    data = data.assign(
        APPLICATION_DATETIME=pd.to_datetime(data['APPLICATION_DATETIME'])
    ).sort_values(by='APPLICATION_DATETIME', ascending=True)
    split_at = int(data.shape[0] * (1 - test_size))
    train = data.iloc[:split_at, :]
    test = data.iloc[split_at:, :]
else:
    # Random split, reproducible through random_state
    train, test = train_test_split(data, test_size=test_size, random_state=random_state)

# Separate model inputs from the label for each partition
X_train, y_train = train[final_features], train[target]
X_test, y_test = test[final_features], test[target]


###### Create the preprocessor with the function I defined above

In [105]:
# Fit the preprocessing pipeline on the training partition only,
# passing max_features=75 to the feature selector
preprocessor, columns_names = get_preprocessor(
    X=X_train,
    y=y_train,
    winsorize_variables=winsorize_variables,
    final_features=final_features,
    max_features=75,
    random_state=random_state,
)

[Pipeline] ...... (step 1 of 2) Processing zero_imputer, total=   0.1s
[Pipeline] .... (step 2 of 2) Processing max_winsorizer, total=   0.0s


###### Save the preprocessor

In [106]:
# Create the output directory if it is missing so the dump does not fail with
# FileNotFoundError on a fresh checkout
os.makedirs('models', exist_ok=True)
# Persist the fitted preprocessor
joblib.dump(preprocessor, 'models/preprocessor.joblib.dat')

['models/preprocessor.joblib.dat']

###### Preprocess the train/test data and save it

In [107]:
# Transform the train features, append the target column and write to CSV
train_preprocessed = pd.DataFrame(
    preprocessor.transform(X_train), columns=columns_names
).assign(**{target: y_train.values})
train_preprocessed.to_csv('data/train_data.csv', index=False)

In [108]:
# Transform the test features with the already-fitted preprocessor,
# append the target column and write to CSV
test_preprocessed = pd.DataFrame(
    preprocessor.transform(X_test), columns=columns_names
).assign(**{target: y_test.values})
test_preprocessed.to_csv('data/test_data.csv', index=False)