### Imports necesarios para el notebook

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (LabelEncoder, MaxAbsScaler, MinMaxScaler,
                                   Normalizer, RobustScaler, StandardScaler)
from sklearn.svm import SVC

### Carga de datos

In [None]:
# Load the semicolon-separated CSV at bank/bank-full.csv into a DataFrame.
df = pd.read_csv('bank/bank-full.csv', sep=';')

### Análisis exploratorio de los datos

In [None]:
def count_unique_values_if_categorical(df):
    """Print the number of distinct values of each object-dtype column.

    One line is printed per matching column, in column order, using the
    format ``<column> :  <count>``. Columns with any other dtype are skipped.

    Args:
        df: DataFrame whose columns are inspected.
    """
    object_columns = [name for name in df.columns if df[name].dtype == 'object']
    for name in object_columns:
        distinct = df[name].nunique()
        print(name, ': ', distinct)

def eda(df):
    """Print summary statistics and draw exploratory plots for ``df``.

    The report consists of, in order: ``df.info()``, ``df.describe()``, the
    distinct-value count of every object column, a histogram grid (50 bins),
    ``describe()`` per value of column ``'y'``, and a bar chart of the
    per-``'y'`` mean of every numeric column.

    Args:
        df: DataFrame to inspect. It must contain a column named ``'y'``,
            which is used as the grouping key.
    """
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() only appended a stray "None" line.
    df.info()
    print(df.describe())
    count_unique_values_if_categorical(df)

    df.hist(bins=50)
    plt.show()

    by_target = df.groupby('y')
    print(by_target.describe())

    # A mean is only defined for numeric columns; aggregating every column
    # fails on text columns in recent pandas versions. The grouping key itself
    # is left out of the aggregated columns.
    numeric_cols = df.select_dtypes(include='number').columns.drop(
        ['y'], errors='ignore')
    if len(numeric_cols) > 0:
        # The Axes object returned by plot() is not printed; only the figure
        # is shown.
        by_target[list(numeric_cols)].agg(['mean']).unstack().plot(kind='bar')
        plt.show()

In [None]:
# Run the exploratory report on the loaded DataFrame.
eda(df)

### Preprocesamiento de los datos

In [None]:
# Common data preprocessing: label-encode every non-numeric column in place
# (this includes the target column 'y' when it is stored as text).
# The dtype is tested explicitly: ``type(object)`` is the built-in ``type``,
# not ``object``, so the previous comparison did not express the intended
# "is this a text column" check.
for column in df.columns:
    if not pd.api.types.is_numeric_dtype(df[column]):
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])

# Flag outliers with an Isolation Forest fitted on the input variables only.
# fit_predict marks outliers with -1; the seed is fixed so the flagged rows
# are the same on every run.
iso = IsolationForest(contamination=0.05, random_state=42)
X = df.drop('y', axis=1)
outlier = iso.fit_predict(X)
print(len(outlier[outlier == -1]))

# NOTE: the flag is stored as an extra column on df, so any later
# df.drop('y', axis=1) will include it unless it is dropped as well.
df['outlier'] = outlier

# Per-target summary statistics of the rows flagged as outliers.
print(df.loc[df['outlier'] == -1].groupby('y').describe())

### Pipeline de entrenamiento

In [None]:
def cross_evaluation(models, scalers, X, y, cv=3):
    """Cross-validate every (model, scaler) combination and print the scores.

    For each combination the mean test accuracy and mean test recall over the
    folds are printed. When a scaler is given it is chained with the model in
    a pipeline, so it is re-fitted on the training part of each fold only.
    Fitting the scaler on the whole of ``X`` before splitting would let
    statistics of the validation folds leak into training.

    Args:
        models: Iterable of estimators to evaluate.
        scalers: Iterable of transformers; a ``None`` entry means the model is
            evaluated on the unscaled data.
        X: Input variables.
        y: Target variable.
        cv: Cross-validation setting forwarded to ``cross_validate``
            (default 3, the value that was previously hard-coded).
    """
    for model in models:
        for scaler in scalers:
            print("Model: ", model, "\tScaler: ", scaler)
            if scaler is not None:
                estimator = make_pipeline(scaler, model)
            else:
                estimator = model
            scores = cross_validate(
                estimator, X, y, cv=cv, scoring=('accuracy', 'recall'),
                n_jobs=-1)
            print('Accuracy: ', scores['test_accuracy'].mean())
            print('Recall: ', scores['test_recall'].mean())

In [None]:
# Split the input variables from the target variable.
# 'outlier' is the IsolationForest flag stored on df during preprocessing; it
# is a diagnostic rather than an input variable, and the undersampling
# experiment (which re-reads the file) does not have it, so it is dropped
# here when present to keep both experiments on the same feature set.
X = df.drop(columns='y').drop(columns='outlier', errors='ignore')
y = df['y']

In [None]:
# Models and preprocessors to evaluate.

# Models: built directly in evaluation order, then bound to their short names.
models = [
    LogisticRegression(solver='liblinear'),
    KNeighborsClassifier(n_neighbors=5),
    RandomForestClassifier(n_estimators=200),
    SVC(),
]
lr, knn, rf, svm = models

# Preprocessors: the leading None stands for "no scaling".
scalers = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    MaxAbsScaler(),
    RobustScaler(),
    Normalizer(),
]
scaler, scaler2, scaler3, scaler4, scaler5 = scalers[1:]

In [None]:
# Run the cross-validation for every model/scaler combination.
cross_evaluation(models, scalers, X, y)

### Pipeline de entrenamiento con Random Undersampling

In [None]:
# Read and preprocess the data again so this experiment starts from the raw
# file rather than from the DataFrame modified above.
df = pd.read_csv('bank/bank-full.csv', sep=';')

# Label-encode every non-numeric column in place. The dtype is tested
# explicitly: ``type(object)`` is the built-in ``type``, not ``object``, so
# the previous comparison did not express the intended check.
for column in df.columns:
    if not pd.api.types.is_numeric_dtype(df[column]):
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])

X = df.drop('y', axis=1)
y = df['y']

# Random undersampling of the majority class. The seed is fixed so the same
# rows are kept on every run and results stay comparable.
rus = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X, y = rus.fit_resample(X, y)

# NOTE(review): the resampling is applied before cross-validation, so the
# validation folds are class-balanced as well; confirm this is intended when
# comparing these scores with the ones obtained without undersampling.
cross_evaluation(models, scalers, X, y)