# En este notebook se encuentran las funciones utilizadas a lo largo del proyecto
<span style="font-size:Large;">       
Se ha instalado el siguiente paquete para cargar las funciones entre notebooks:<br>    
<br> 
    
```python
! pip install nbimporter
```
<br>     
</span>

# Paquetes

In [1]:
# Básicos:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from joblib import dump, load

# Procesado de datos:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Model selection:
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV, cross_val_score, cross_val_predict

# Modelos de Clasificación:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Red Neuronal:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Activation, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint

# Métricas:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from tensorflow.keras.metrics import Precision
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve

# Tiempo:
import tqdm




# Funciones para Outliers

In [2]:
def outliers(variable):
    '''
    Return the lower and upper Tukey fences of a numeric series.

    The fences lie 1.5 interquartile ranges below the first quartile and
    1.5 interquartile ranges above the third quartile.

    Parameters
    ----------
    variable : object exposing ``quantile(q=...)`` (e.g. a pandas Series)
        Values whose quartiles are computed.

    Returns
    -------
    tuple
        ``(lim_inf, lim_sup)``: the lower and the upper fence.
    '''
    primer_cuartil, tercer_cuartil = (variable.quantile(q = p) for p in (0.25, 0.75))

    # Distance from each quartile to its fence: 1.5 times the interquartile range.
    margen = 1.5 * (tercer_cuartil - primer_cuartil)

    return primer_cuartil - margen, tercer_cuartil + margen

In [3]:
def metodo_tukey(df, columna, alfa):
    '''
    Keep only the rows whose value in ``columna`` lies inside the Tukey fences.

    The fences are ``Q1 - alfa * IQR`` and ``Q3 + alfa * IQR`` (both inclusive),
    computed from the quartiles of ``df[columna]``. Rows where the column is
    missing are kept as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    columna : str
        Name of the column used to detect outliers.
    alfa : float
        Multiplier applied to the interquartile range.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` rows that pass the filter.
    '''
    serie = df[columna]
    cuartil_bajo = serie.quantile(0.25)
    cuartil_alto = serie.quantile(0.75)
    amplitud = alfa * (cuartil_alto - cuartil_bajo)

    dentro_de_rango = serie.between(cuartil_bajo - amplitud, cuartil_alto + amplitud)

    # Missing values are preserved on purpose: they never satisfy `between`.
    return df[dentro_de_rango | serie.isna()]

# Función transformación logarítmica

In [4]:
def logartimo_base2(df, columna):
    '''
    Replace ``df[columna]`` with ``log2(x + 1)`` of its values.

    The transformation is computed with a single vectorized NumPy call rather
    than a per-element Python lambda.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to transform. The column is overwritten on this very object.
    columna : str
        Name of the column to transform; it is expected to be numeric.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the column replaced.
    '''
    df[columna] = np.log2(df[columna] + 1)

    return df

# Función procesamiento columnas

In [5]:
def procesamiento_columnas(df):
    '''
    Apply the fixed per-column preprocessing recipe of the project.

    Each step optionally applies ``logartimo_base2`` to a column and then
    optionally filters the rows with ``metodo_tukey`` on that same column.
    The steps run in the listed order; every Tukey filter works on the rows
    left by the previous steps, so the order must not be changed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing every column named in the recipe below.

    Returns
    -------
    pandas.DataFrame
        The transformed and filtered data.
    '''
    # (column, apply log2(x + 1) first?, Tukey multiplier or None for "no filtering")
    pasos = (
        ('MSinceOldestTradeOpen',              False, 3),
        ('MSinceMostRecentTradeOpen',          True,  2),
        ('AverageMInFile',                     False, 3),
        ('NumSatisfactoryTrades',              False, 3),
        ('NumTrades90Ever_DerogPubRec',        True,  None),
        ('PercentTradesNeverDelq',             True,  4),
        ('NumTotalTrades',                     False, 3),
        ('NumTradesOpeninLast12M',             False, 3),
        ('MSinceMostRecentInqexcl7days',       True,  None),
        ('NumInqLast6M',                       True,  2),
        ('NetFractionRevolvingBurden',         False, 3),
        ('NetFractionInstallBurden',           False, 3),
        ('NumRevolvingTradesWBalance',         False, 4),
        ('NumInstallTradesWBalance',           True,  2),
        ('NumBank_NatlTradesWHighUtilization', True,  1.5),
    )

    for columna, aplicar_log, alfa in pasos:
        if aplicar_log:
            df = logartimo_base2(df, columna)
        if alfa is not None:
            df = metodo_tukey(df, columna, alfa)

    return df

# Funciones de Cross Validation

## - Hold-Out

In [6]:
def hold_out(modelos, df):
    '''
    Score every model with 20 repeated stratified hold-out splits.

    For each model, the data is split 80/20 (stratified on the target) twenty
    times; the model is refitted on every training part and scored on the
    matching test part. The mean accuracy, macro precision and macro recall
    over the twenty runs are reported.

    Parameters
    ----------
    modelos : iterable
        Estimators exposing ``fit`` and ``predict``. They are fitted in place.
    df : pandas.DataFrame
        Data with the features and the target column ``"RiskPerformance"``.

    Returns
    -------
    pandas.DataFrame
        One row per model with its name, the estimator object and the mean
        metrics. The same table is also written to
        ``cross_holdout_results.csv`` in the working directory.
    '''
    y = df["RiskPerformance"]
    X = df.drop(["RiskPerformance"], axis= 1)

    filas = []

    for modelo in modelos:

        acc_runs, pre_runs, rec_runs = [], [], []

        # No random_state is passed, so every repetition draws a different split.
        for _ in range(20):
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)
            modelo.fit(X_train, y_train)
            y_pred = modelo.predict(X_test)

            acc_runs.append(accuracy_score(y_test, y_pred))
            pre_runs.append(precision_score(y_test, y_pred, average = "macro"))
            rec_runs.append(recall_score(y_test, y_pred, average = "macro"))

        # The model name is the text before the first "(" of the estimator repr.
        nombre = str(modelo).split("(")[0]
        filas.append([nombre, modelo, np.mean(acc_runs), np.mean(pre_runs), np.mean(rec_runs)])

    cabecera = ["nombre", "modelo" , "mean_accuracy", "mean_precision", "mean_recall"]
    df_cross_holdout = pd.DataFrame(filas, columns= cabecera)
    df_cross_holdout.to_csv("cross_holdout_results.csv", index= False, sep= ",")

    return df_cross_holdout

## - k-Fold

In [7]:
def k_fold(modelos, df, splits= 5):
    '''
    Score every model with (unshuffled) k-fold cross validation.

    Each model is fitted on the training part of every fold and predicts the
    held-out part. The out-of-fold predictions are concatenated in fold order
    and scored once against the full target.

    Parameters
    ----------
    modelos : iterable
        Estimators exposing ``fit`` and ``predict``. They are fitted in place.
    df : pandas.DataFrame
        Data with the features and the target column ``"RiskPerformance"``.
    splits : int, default 5
        Number of folds handed to ``KFold``.

    Returns
    -------
    pandas.DataFrame
        One row per model with its name, the estimator object, accuracy,
        macro precision and macro recall. The same table is also written to
        ``cross_kfold_results.csv`` in the working directory.
    '''
    y = df["RiskPerformance"]
    X = df.drop(["RiskPerformance"], axis= 1)

    # KFold is built without shuffling, so the partitions can be computed once
    # and reused for every model.
    particiones = list(KFold(n_splits = splits).split(X))

    filas = []

    for modelo in modelos:

        predicciones = []

        for idx_train, idx_test in particiones:
            modelo.fit(X.iloc[idx_train], y.iloc[idx_train])
            predicciones.extend(modelo.predict(X.iloc[idx_test]))

        filas.append([
            str(modelo).split("(")[0],
            modelo,
            accuracy_score(y, predicciones),
            precision_score(y, predicciones, average = "macro"),
            recall_score(y, predicciones, average = "macro"),
        ])

    cabecera = ["nombre", "modelo", "accuracy", "precision", "recall"]
    df_cross_kfold = pd.DataFrame(filas, columns= cabecera)
    df_cross_kfold.to_csv("cross_kfold_results.csv", index= False, sep= ",")

    return df_cross_kfold

## - Stratified k-Fold

In [8]:
def stratified_f_fold(modelos, df, splits= 5):
    '''
    Score every model with stratified k-fold cross validation.

    Each model is fitted on the training part of every fold and predicts the
    held-out part. Predictions and their true labels are accumulated together,
    fold by fold, and all three metrics are computed on those aligned lists.

    Parameters
    ----------
    modelos : iterable
        Estimators exposing ``fit`` and ``predict``. They are fitted in place.
    df : pandas.DataFrame
        Data with the features and the target column ``"RiskPerformance"``.
    splits : int, default 5
        Number of folds handed to ``StratifiedKFold``.

    Returns
    -------
    pandas.DataFrame
        One row per model with its name, the estimator object, accuracy,
        macro precision and macro recall. The same table is also written to
        ``cross_skfold_results.csv`` in the working directory.
    '''
    model_cross_skfold = []

    X = df.drop(["RiskPerformance"], axis= 1)
    y= df["RiskPerformance"]

    skfold = StratifiedKFold(n_splits = splits)
    for modelo in modelos:

        y_test_real, y_pred = [], []

        for train_index, test_index in skfold.split(X, y):

            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            modelo.fit(X_train, y_train)
            y_pred.extend(modelo.predict(X_test))
            y_test_real.extend(y_test)

        # Stratified test folds are not contiguous slices of the data, so the
        # accumulated predictions are NOT in the row order of `y`. Every metric
        # must therefore be scored against `y_test_real`, which was collected
        # in the same order as `y_pred`.
        model_cross_skfold.append([str(modelo).split("(")[0],
                                   modelo,
                                   accuracy_score(y_test_real, y_pred),
                                   precision_score(y_test_real, y_pred, average = "macro"),
                                   recall_score(y_test_real, y_pred, average = "macro")
                                    ])

    df_cross_skfold = pd.DataFrame(model_cross_skfold, columns= ["nombre", "modelo" , "accuracy", "precision", "recall"])
    df_cross_skfold.to_csv("cross_skfold_results.csv", index= False, sep= ",")

    return df_cross_skfold

# Función para elegir el mejor modelo:

In [9]:
def best_model_generator(modelos, df_procesado):
    '''
    Search the KNN-imputation neighbour count and model with the best accuracy.

    For every neighbour count from 1 to 20 the whole frame (target column
    included) is imputed with ``KNNImputer``; every model is then evaluated
    on the imputed data with 5-fold ``cross_val_predict``. The imputer of the
    most accurate (neighbours, model) combination is used to build the final
    imputed frame, which is saved as ``df_final_<neighbours>k.csv``.

    Parameters
    ----------
    modelos : iterable
        Estimators accepted by ``cross_val_predict``.
    df_procesado : pandas.DataFrame
        Preprocessed data including the target column ``"RiskPerformance"``.

    Returns
    -------
    tuple
        ``(df_final, df_resultados_finales)``: the frame imputed with the best
        imputer and the table with one row per (neighbours, model) pair.
    '''
    registros = []

    for neighbors in tqdm.tqdm(range(1, 21), desc="Progreso"):

        # A fresh imputer per neighbour count; the fitted object is stored in
        # the results so the winning one can be reused below.
        imputer = KNNImputer(n_neighbors=neighbors)
        df_imputed = pd.DataFrame(imputer.fit_transform(df_procesado), columns=df_procesado.columns)

        y = df_imputed["RiskPerformance"]
        X = df_imputed.drop(["RiskPerformance"], axis=1)

        for modelo in modelos:

            cv_predictions = cross_val_predict(modelo, X, y, cv=5)

            avg_precision = precision_score(y, cv_predictions)
            avg_accuracy  = accuracy_score(y, cv_predictions)
            avg_recall    = recall_score(y, cv_predictions)

            registros.append([
                str(modelo).split("(")[0],
                modelo,
                avg_accuracy,
                avg_precision,
                avg_recall,
                neighbors,
                imputer,
            ])

    cabecera = ["nombre", "modelo" , "avg_accuracy", "avg_precision", "avg_recall", "neighbors", "imputer"]
    df_resultados_finales = pd.DataFrame(registros, columns= cabecera)

    # Single-row frame holding the most accurate combination.
    mejor_fila = df_resultados_finales.sort_values(by= "avg_accuracy", ascending= False).head(1)
    best_imputer = mejor_fila["imputer"].values[0]
    num_vecinos = mejor_fila["neighbors"].values[0]

    df_final = pd.DataFrame(best_imputer.transform(df_procesado), columns=df_procesado.columns)
    df_final.to_csv(f'df_final_{num_vecinos}k.csv', index=False)
    print(f"Se ha guardado el dataframe final como: df_final_{num_vecinos}k.csv ")

    return df_final, df_resultados_finales

# Función de tuning:

In [10]:
def tunning(modelo, parametros, scorer, X_train, X_test, y_train, y_test,
            ruta_csv="resultados_tuning_GBC.csv", ruta_modelo='mejor_modelo_gbc.pkl'):
    '''
    Tune a model with a 5-fold grid search and evaluate the best estimator.

    The grid search is refitted on the ``"accuracy"`` scorer; the resulting
    best estimator is scored on the test split, the summary is written to a
    CSV file and the estimator is persisted with joblib.

    Parameters
    ----------
    modelo : estimator
        Base estimator handed to ``GridSearchCV``.
    parametros : dict or list of dict
        Parameter grid to explore.
    scorer : scoring specification
        Passed as ``scoring``. Because ``refit="accuracy"`` is used, this is
        presumably a mapping that contains an ``"accuracy"`` entry -- confirm
        against the caller.
    X_train, y_train : array-like
        Data used by the grid search.
    X_test, y_test : array-like
        Data used to score the best estimator.
    ruta_csv : str, default "resultados_tuning_GBC.csv"
        Path of the CSV file with the results.
    ruta_modelo : str, default 'mejor_modelo_gbc.pkl'
        Path where the best estimator is dumped.

    Returns
    -------
    pandas.DataFrame
        A single row with the model name, the best estimator, its parameters
        and its accuracy, precision and recall on the test split.
    '''
    grid_solver = GridSearchCV(estimator  = modelo,
                               param_grid = parametros,
                               scoring    = scorer,
                               cv         = 5,
                               refit      = "accuracy",
                               n_jobs     = 1,
                               verbose    = 0
                              )

    model_result = grid_solver.fit(X_train, y_train)

    # Best model:
    best_model = model_result.best_estimator_
    params_best_model = best_model.get_params()

    y_pred = best_model.predict(X_test)

    # Metrics:
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    resultados = [[str(modelo).split("(")[0], best_model, params_best_model, accuracy, precision, recall]]
    df_resultados = pd.DataFrame(resultados, columns= ["Nombre", "Modelo", "Parametros","Accuracy", "Precision", "Recall"])

    # `index` must be the boolean False: the string "False" is truthy and made
    # pandas write the index column, unlike the other result files.
    df_resultados.to_csv(ruta_csv, index= False, sep= ",")
    dump(best_model, ruta_modelo)

    return df_resultados

# Función para Evaluar el Feature Importance

In [11]:
def feature_importance_eval(modelo, X):
    '''
    Plot the feature importances of a fitted model, most important first.

    Parameters
    ----------
    modelo : fitted estimator
        Must expose ``feature_importances_``, with one value per column of
        ``X`` and in the same order as ``X.columns``.
    X : pandas.DataFrame
        Feature matrix whose column names label the bars.

    Returns
    -------
    list
        Column names of ``X`` sorted by decreasing importance (the same order
        as the bars of the plot).
    '''
    # Compute the feature importances.
    importances = modelo.feature_importances_

    # Positions of the features, from the most to the least important.
    indices = np.argsort(importances)[::-1]

    # Each label must be looked up through the sorted position; indexing the
    # columns with the plain loop counter would return them in their original
    # order and mislabel the bars.
    columns_plot = [X.columns[posicion] for posicion in indices]

    plt.figure(figsize = (12, 8))

    plt.title("Feature Importances")

    plt.bar(range(X.shape[1]), importances[indices], color = "r", align = "center")
    plt.xticks(range(X.shape[1]), columns_plot, rotation = 90)
    plt.grid()

    return columns_plot

# Función para Evaluar la Complejidad

In [12]:
def complexity_evaluation(columns_plot, X_train, X_test, y_train, y_test, modelo):
    '''
    Measure the test metrics while growing the number of features kept.

    For ``idx`` from 1 up to the number of training columns, every column
    listed in ``columns_plot[idx:]`` is dropped from both splits, the model is
    refitted on the reduced training data and scored on the reduced test data.

    Parameters
    ----------
    columns_plot : list
        Column names; the first ``idx`` entries are the ones kept at each
        step. Presumably ordered by decreasing importance -- confirm against
        the caller.
    X_train, X_test : pandas.DataFrame
        Feature splits containing the columns named in ``columns_plot``.
    y_train, y_test : array-like
        Target splits.
    modelo : estimator
        Exposes ``fit`` and ``predict``; it is refitted at every step.

    Returns
    -------
    pandas.DataFrame
        One row per step with ``idx_col`` (number of columns kept), accuracy,
        precision and recall.
    '''
    filas = []

    total_columnas = len(X_train.columns)

    for n_conservadas in range(1, total_columnas + 1):

        # Everything past the first `n_conservadas` names is removed.
        descartadas = columns_plot[n_conservadas :]

        X_train_reducido = X_train.drop(descartadas, axis = 1)
        X_test_reducido = X_test.drop(descartadas, axis = 1)

        modelo.fit(X_train_reducido, y_train)
        y_pred = modelo.predict(X_test_reducido)

        filas.append([
            n_conservadas,
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred),
            recall_score(y_test, y_pred),
        ])

    return pd.DataFrame(filas, columns= ["idx_col", "Accuracy", "Precision", "Recall"])

# Funciones Red Neuronal

In [13]:
def imputer_knn(data, n_neighbors_range):
    '''
    Impute ``data`` with ``KNNImputer`` once per requested neighbour count.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to impute; a new imputer is fitted on it for every count.
    n_neighbors_range : iterable of int
        Neighbour counts to try.

    Returns
    -------
    dict
        Maps each neighbour count to the imputed data as a DataFrame with the
        column names of ``data``.
    '''
    return {
        vecinos: pd.DataFrame(
            KNNImputer(n_neighbors=vecinos).fit_transform(data),
            columns = data.columns,
        )
        for vecinos in n_neighbors_range
    }

In [14]:
def calculate_metrics(y_pred, y_test, threshold = 0.5):
    '''
    Binarize predicted scores and derive accuracy, precision and recall.

    Scores strictly greater than ``threshold`` become class 1, the rest
    class 0. The metrics are computed by hand from the confusion matrix.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores.
    y_test : array-like
        True labels.
    threshold : float, default 0.5
        Decision threshold applied to ``y_pred``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, cm)`` where ``cm`` is the confusion
        matrix returned by ``confusion_matrix``.
    '''
    etiquetas = (np.asarray(y_pred) > threshold).astype(int)
    cm = confusion_matrix(y_test, etiquetas)

    # Rows are the true classes, columns the predicted ones.
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]

    total = tp + fn + fp + tn
    accuracy  = (tp + tn) / total
    precision = tp / (tp + fp)
    recall    = tp / (tp + fn)

    return accuracy, precision, recall, cm

In [15]:
# Probar bucle con diferentes threshold

In [16]:
def FCNN(train, test_size, epochs, early_stopping= True):
    '''
    Train a fully connected binary classifier on the project data.

    The features are standardized, split into stratified train/test parts and
    fed to a network with three hidden blocks (Dense -> BatchNormalization ->
    ReLU -> Dropout(0.2)) of 150, 75 and 25 units, followed by a single
    sigmoid output unit. The model is trained with Adam and binary
    cross-entropy.

    Parameters
    ----------
    train : pandas.DataFrame
        Data with the features and the target column ``'RiskPerformance'``.
    test_size : float
        Fraction held out as test set; the same value is also used as the
        ``validation_split`` of the training part.
    epochs : int
        Maximum number of training epochs.
    early_stopping : bool, default True
        When true, training stops after 10 epochs without an improvement of
        the training ``'Accuracy'`` metric.

    Returns
    -------
    tuple
        ``(y_pred, y_test, model, history)``: predicted scores for the test
        part, its true labels, the trained model and the Keras history.
    '''
    y = train['RiskPerformance']
    X = train.drop('RiskPerformance', axis=1)

    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test rows influence the scaling statistics -- confirm this is intended.
    sc = StandardScaler()
    X = sc.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = test_size, random_state=42, stratify= y)

    batch_size = 256
    np.random.seed(5)
    metrics = ["Accuracy", "Precision", "Recall"]

    model = Sequential()

    # Hidden blocks FC1, FC2 and FC3; only the first Dense layer declares the input shape.
    for posicion, unidades in enumerate((150, 75, 25)):
        if posicion == 0:
            model.add(Dense(input_shape= (X_train.shape[1],), units = unidades))
        else:
            model.add(Dense(units = unidades))
        model.add(BatchNormalization())
        model.add(Activation("relu"))
        model.add(Dropout(0.2))

    # Output layer
    model.add(Dense(units= 1, kernel_initializer='uniform', activation='sigmoid'))
    model.build()
    model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=metrics)
    model.summary()

    callbacks = []
    if early_stopping:
        # mode='max' is explicit because "auto" infers the direction from the
        # metric name and the capitalised 'Accuracy' is not matched by the
        # lowercase name heuristics, which may make Keras minimise accuracy.
        callbacks.append(EarlyStopping(patience=10, monitor='Accuracy', mode='max'))

    history = model.fit(X_train, y_train, validation_split=test_size, callbacks=callbacks, batch_size= batch_size, epochs= epochs, verbose=0)

    y_pred = model.predict(X_test)

    return y_pred, y_test, model, history

# Métricas y Visualizaciones Red Neuronal

In [17]:
def plot_metrics(y_test, y_pred):
    '''
    Draw the ROC curve and the precision-recall curve side by side.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_pred : array-like
        Predicted scores for the positive class.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    '''
    # The thresholds returned by both helpers are not needed for the plots.
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    precision, recall, _ = precision_recall_curve(y_test, y_pred)

    fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize = (12, 6))

    # Left panel: ROC curve with the diagonal of a random classifier.
    ax_roc.set_title('Receiver Operating Characteristic')
    ax_roc.plot(fpr, tpr)
    ax_roc.plot([0, 1], [0, 1], 'k--')
    ax_roc.set_xlim([-0.1, 1.1])
    ax_roc.set_ylim([-0.1, 1.1])
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_xlabel('False Positive Rate')

    # Right panel: precision as a function of recall.
    ax_pr.set_title('Precision_Recall')
    ax_pr.plot(recall, precision)
    ax_pr.set_xlim([0, 1])
    ax_pr.set_ylim([0, 1])
    ax_pr.set_ylabel('Precision')
    ax_pr.set_xlabel('Recall')

    plt.show()

In [18]:
def plot_history(history):
    '''
    Plot the training and validation accuracy and loss per epoch.

    Parameters
    ----------
    history : object with a ``history`` mapping
        Must contain the keys ``"Accuracy"``, ``"loss"``, ``"val_Accuracy"``
        and ``"val_loss"``, each holding one value per epoch.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    '''
    registro = history.history
    # Every series is read up front so a missing key fails before any figure is created.
    curvas = {clave: registro[clave] for clave in ("Accuracy", "loss", "val_Accuracy", "val_loss")}

    epochs = range(1, len(curvas["Accuracy"]) + 1)

    figure, axes = plt.subplots(1, 2, figsize = (12, 6))

    # (history key, y-axis label, word used in the legend and title) for each panel.
    paneles = (
        ("Accuracy", "Accuracy", "accuracy"),
        ("loss", "Loss", "loss"),
    )

    for ax, (clave, etiqueta_eje, nombre) in zip(axes, paneles):
        ax.plot(epochs, curvas[clave], "r--", label=f"Train {nombre}")
        ax.plot(epochs, curvas["val_" + clave], "b", label=f"Validation {nombre}")

        ax.set_title(f"Training and validation {nombre}")
        ax.set_ylabel(etiqueta_eje)
        ax.set_xlabel("Epochs")
        ax.legend()

    plt.show()