In [None]:
from src.eda.input import input_parser
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns
## for statistical tests
# import scipy
# import statsmodels.formula.api as smf
# import statsmodels.api as sm

from sklearn import model_selection, preprocessing, feature_selection, ensemble, linear_model, metrics, decomposition
## for explainer
# from lime import lime_tabular

In [None]:
# Generate the "_base" CSV version of both raw input files
for raw_path in (r'data/Modelar_UH2021.txt', r'data/Estimar_UH2021.txt'):
    input_parser(raw_path, 'base')

In [None]:
# Load the base "Modelar" CSV (pipe-separated) and preview its first rows
df = pd.read_csv(r'data/Modelar_UH2021_base.csv', sep='|')
df.head()

In [None]:
# Exploring the data type base
def recognize_type(dtf, col, max_cat=20):
    """Classify column ``col`` of ``dtf`` as categorical or numeric.

    Parameters
    ----------
    dtf : pandas.DataFrame
        Frame that contains the column.
    col : hashable
        Label of the column to inspect.
    max_cat : int, default 20
        A column with fewer than ``max_cat`` distinct values is treated as
        categorical regardless of its dtype.

    Returns
    -------
    str
        ``"cat"`` when the column has object dtype or fewer than ``max_cat``
        distinct values, ``"num"`` otherwise.
    """
    column = dtf[col]
    # Both criteria are evaluated up front, exactly as the combined test did
    has_object_dtype = column.dtype == "O"
    has_few_levels = column.nunique() < max_cat
    return "cat" if (has_object_dtype or has_few_levels) else "num"


# Build the matrix drawn by the overview heatmap, one level per cell:
#   0   -> value present in a categorical column
#   0.5 -> value present in a numeric column
#   1   -> missing value (NaN)
dic_cols = {col: recognize_type(df, col, max_cat=20) for col in df.columns}
heatmap = df.isnull()
for k, v in dic_cols.items():
    present_level = 0.5 if v == "num" else 0
    # Vectorised replacement for the per-element `x is False` identity test,
    # which only worked while pandas handed Python bool singletons to the lambda
    heatmap[k] = np.where(heatmap[k], 1, present_level)
# Pin the colour scale to [0, 1]. Without vmin/vmax seaborn rescales to the
# values actually present, so a frame without NaNs would draw numeric cells
# with the colour the legend below reserves for NaN.
sns.heatmap(heatmap, vmin=0, vmax=1, cbar=False).set_title('Dataset Overview')
plt.show()
# ANSI-coloured legend: background colours follow the 0 / 0.5 / 1 levels above
print("\033[1;37;40m Categorical ", "\033[1;30;41m Numeric ", "\033[1;30;47m NaN ")


In [None]:
# Exploring the correlation between the numeric columns.
# Non-numeric columns are excluded explicitly: since pandas 2.0
# DataFrame.corr() no longer drops them silently and raises on string columns.
# Boolean columns are kept, as the old implicit numeric selection did.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr_matrix, vmin=-1., vmax=1., annot=True, fmt='.2f', cmap="YlGnBu", cbar=True, linewidths=0.5)

In [None]:
# Preprocessing
# Auxiliary state shared by the price-completion helpers, keyed by str(id):
#   actual_price -> most recent known price seen for that id
#   first_price  -> first known price seen for that id
actual_price, first_price = {}, {}
# Complete missing prices with the latest known price of the same "id"
def __price_completer(row):
    """Return the price of ``row``, filling a missing one from earlier rows.

    Meant to be applied row by row (``DataFrame.apply(..., axis=1)``); the
    result depends on row order because it reads and updates the module-level
    ``actual_price`` and ``first_price`` dictionaries. Those dictionaries are
    not cleared here, so prices learned in one pass remain available to any
    later pass run in the same kernel.

    Parameters
    ----------
    row : mapping with keys ``'precio'`` and ``'id'``
        ``'precio'`` may be a float (possibly NaN) or a string that uses a
        decimal comma.

    Returns
    -------
    float
        The row's own price when present; otherwise the last price recorded
        for the same id, or the sentinel ``-1.0`` when none has been seen yet.
    """
    raw_price = row['precio']
    # Prices read as text use a decimal comma; normalise before converting
    precio = float(raw_price.replace(',', '.')) if isinstance(raw_price, str) else raw_price
    identificador = str(row['id'])
    if math.isnan(precio):
        # -1.0 marks "no earlier price"; resolved later by the proximity pass
        return actual_price.get(identificador, -1.0)
    actual_price[identificador] = precio
    # Keep only the first known price per id (replaces the `== None` check)
    first_price.setdefault(identificador, precio)
    return precio

# Resolve prices still marked with the -1.0 sentinel using the first known price of the same "id"
def __price_completer_proximity(row):
    """Return the price of ``row``, replacing the ``-1.0`` sentinel.

    Rows whose ``'precio'`` equals ``-1.0`` receive ``first_price[str(id)]``,
    or ``None`` when that id has no recorded price. Any other value is
    returned unchanged.
    """
    current = row['precio']
    if current != -1.0:
        return current
    return first_price.get(str(row['id']))

# Preprocess a raw input file and save the result as a CSV
def _one_hot(df, column):
    """Return ``df`` with ``column`` replaced by one-hot columns ``<column>_<value>``."""
    dummies = pd.get_dummies(df[column], prefix=column)
    return pd.concat([df, dummies], axis=1).drop([column], axis=1)


def input_parser(path, option):
    """Read a pipe-separated raw file, preprocess it and write ``<stem>_<option>.csv``.

    NOTE(review): this definition replaces the ``input_parser`` imported from
    ``src.eda.input`` at the top of the notebook for every cell run afterwards.

    Parameters
    ----------
    path : str
        Path of the raw file. A trailing ``.txt`` is stripped to build the
        output name; the rest of the path is left untouched.
    option : str
        ``'base'``  -> only duplicated rows are removed.
        ``'drop'``  -> full preprocessing, and ``categoria_dos`` is dropped.
        any other   -> full preprocessing, and missing ``categoria_dos``
                       values are replaced by 0.

    Returns
    -------
    None
        The processed frame is written next to the input as a
        pipe-separated CSV.

    Notes
    -----
    Price completion goes through ``__price_completer`` and
    ``__price_completer_proximity``, which depend on row order and on the
    module-level price dictionaries.
    """
    # There are duplicated samples
    df = pd.read_csv(filepath_or_buffer=path, sep='|').drop_duplicates()
    if option != 'base':
        # Drop the useless hour of the 'fecha' column (NaN entries stay NaN)
        df['fecha'] = df['fecha'].str.replace(' 0:00:00', '', regex=False)
        # Completion of the 'precio' column: forward pass, then sentinel pass
        df['precio'] = df.apply(__price_completer, axis=1)
        df['precio'] = df.apply(__price_completer_proximity, axis=1)
        # Split the 'fecha' column into day / month / year
        # NOTE(review): to_datetime uses its default parsing here; if the raw
        # dates are day-first an explicit format/dayfirst is needed - confirm.
        df['fecha'] = pd.to_datetime(df['fecha'])
        df['dia'] = df['fecha'].dt.day
        df['mes'] = df['fecha'].dt.month
        df['anyo'] = df['fecha'].dt.year
        df = df.drop('fecha', axis=1)

        # One-hot encoding of the categorical columns (a single get_dummies
        # call is enough; wrapping it in a second one changed nothing)
        for column in ('estado', 'categoria_uno', 'dia_atipico'):
            df = _one_hot(df, column)

        # Assign the result back: a chained `fillna(..., inplace=True)` does
        # not update the frame when pandas Copy-on-Write is active
        df['antiguedad'] = df['antiguedad'].fillna(0)

        if option == 'drop':
            # 'categoria_dos' drop
            df = df.drop('categoria_dos', axis=1)
        else:
            # 'categoria_dos' corrupted samples correction
            df['categoria_dos'] = df['categoria_dos'].fillna(0)

    # Strip only a trailing '.txt' so a '.txt' inside a directory name survives
    stem = path[:-len('.txt')] if path.endswith('.txt') else path
    df.to_csv(index=False, path_or_buf=stem + "_" + option + ".csv", sep='|')

In [None]:
# "Modelar" and "Estimar" dataframes processed
input_parser(r"data/Modelar_UH2021.txt", 'drop')
modelar = pd.read_csv(r'data/Modelar_UH2021_drop.csv', sep='|', low_memory=False)
# The "Estimar" dataframe has no samples with "estado" = "Rotura". Since
# "estado" is one-hot encoded, the resulting column must be dropped here.
modelar = modelar.drop(columns=['estado_Rotura'])

In [None]:
# Preview the first rows of the processed "Modelar" dataframe
modelar.head()

In [None]:
# Column types and missing values of the processed dataframe.
# Build the matrix drawn by the overview heatmap, one level per cell:
#   0   -> value present in a categorical column
#   0.5 -> value present in a numeric column
#   1   -> missing value (NaN)
dic_cols = {col: recognize_type(modelar, col, max_cat=20) for col in modelar.columns}
heatmap = modelar.isnull()
for k, v in dic_cols.items():
    present_level = 0.5 if v == "num" else 0
    # Vectorised replacement for the per-element `x is False` identity test,
    # which only worked while pandas handed Python bool singletons to the lambda
    heatmap[k] = np.where(heatmap[k], 1, present_level)
# Pin the colour scale to [0, 1]. Without vmin/vmax seaborn rescales to the
# values actually present, so a frame without NaNs would draw numeric cells
# with the colour reserved for NaN and the legend would no longer match.
sns.heatmap(heatmap, vmin=0, vmax=1, cbar=False).set_title('Dataset Overview')
plt.show()
# ANSI-coloured legend: with the scale pinned, the colours follow the
# 0 / 0.5 / 1 levels above (the Numeric and NaN colours were swapped before)
print("\033[1;37;40m Categorical ", "\033[1;30;41m Numeric ", "\033[1;30;47m NaN ")

In [None]:
# Exploring the correlation between the numeric columns of the processed data.
# Numeric/boolean columns are selected explicitly so the cell keeps working on
# pandas >= 2.0, where DataFrame.corr() raises if any string column remains.
corr_matrix = modelar.select_dtypes(include=['number', 'bool']).corr()
# No `annot` here, so `fmt` has no visible effect
sns.heatmap(corr_matrix, vmin=-1., vmax=1., fmt='.2f', cmap="YlGnBu", cbar=True, linewidths=0.5)

In [None]:
# Other analysis
from src.eda.pcr import pcr
from src.eda.permutation import permutation
from src.eda.importance_corr import importance_corr
from src.eda.hiperparameter import hiperparameter_tuning

In [None]:
# Principal Component Regression and PLS analysis.
# pcr() comes from src.eda.pcr and is called without arguments, so whatever
# data it uses is not passed in from this notebook.
pcr()

In [None]:
# Permutation importance analysis.
# permutation() comes from src.eda.permutation and is called without
# arguments, so whatever data it uses is not passed in from this notebook.
permutation()

In [None]:
# Correlation importance analysis.
# importance_corr() comes from src.eda.importance_corr and is called without
# arguments, so whatever data it uses is not passed in from this notebook.
importance_corr()

In [None]:
from src.train.trainIsotonic import isotonic
from src.train.trainLogistic import logistic
from src.train.trainMLP import mlp
from src.train.trainRidge import kernel_ridge
from src.train.trainXGB import xgb_reg
from src.train.trainTF import tf

In [None]:
# Isotonic regressor (isotonic() is imported from src.train.trainIsotonic)
isotonic()

In [None]:
# Logistic regressor (logistic() is imported from src.train.trainLogistic)
logistic()

In [None]:
# MLP regressor (mlp() is imported from src.train.trainMLP)
mlp()

In [None]:
# Kernel ridge regressor (kernel_ridge() is imported from src.train.trainRidge)
kernel_ridge()

In [None]:
# XGBoost regressor (xgb_reg() is imported from src.train.trainXGB)
xgb_reg()

In [None]:
# TensorFlow neural network.
# Here `tf` is the function imported from src.train.trainTF, not the usual
# alias for the tensorflow module.
tf()