# Constants

In [1]:
# Relative path of the Excel workbook read by the "Load Data" cell.
file_path = '../data/Donnet_Auctions.xlsx'
# Value passed as the `tl` argument of the `verbosity(...)` calls in this notebook.
# NOTE(review): presumably a trace/verbosity level — confirm against ../shared/utils.
tl_ini = 1

# Import libraries

## Generic libraries

In [2]:
import sys

sys.path.insert(1, '../shared')
from utils import verbosity



# Load Data

In [3]:
import pandas as pd

# Read the workbook into the notebook-wide `df`.
# A failed load is reported and then re-raised: the following cells all need
# `df`, so swallowing the error would only resurface later as a NameError that
# hides the real cause.
try:
  df = pd.read_excel(file_path)
except FileNotFoundError:
  print(f"Error: File not found at {file_path}")
  verbosity(f"Error: file not found at {file_path}", tl=tl_ini, level='error')
  raise
except Exception as e:
  verbosity(f"Error: {e}", tl= tl_ini, level='error')
  raise



In [4]:
# Remove the 'Unnamed: 0' column (the label pandas gives to a header-less
# first column). `errors='ignore'` keeps the cell re-runnable once the column
# is already gone instead of raising KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Data Analysis

## Import libraries

In [5]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns

## TODO: Just in Jupyter Notebooks
%matplotlib ipympl


## Correlation

In [None]:

# Correlación entre columnas numéricas
# ==============================================================================

def tidy_corr_matrix(corr_mat):
    '''
    Convert a pandas correlation matrix into tidy (long) format.

    Parameters
    ----------
    corr_mat : pandas.DataFrame
        Square correlation matrix, e.g. the result of ``DataFrame.corr()``.

    Returns
    -------
    pandas.DataFrame
        One row per ordered pair of distinct variables, with the columns
        ``variable_1``, ``variable_2``, ``r`` and ``abs_r`` (absolute value
        of ``r``), sorted by ``abs_r`` in descending order.
    '''
    tidy = corr_mat.stack().reset_index()
    tidy.columns = ['variable_1', 'variable_2', 'r']

    # Discard the diagonal (every variable paired with itself).
    off_diagonal = tidy['variable_1'] != tidy['variable_2']

    # `assign` builds a new frame rather than writing a column into the
    # filtered slice, which is what raises pandas' SettingWithCopyWarning.
    tidy = tidy.loc[off_diagonal, :].assign(abs_r=lambda pairs: pairs['r'].abs())

    return tidy.sort_values('abs_r', ascending=False)


# Pearson correlation between the float64 / integer columns of `df`;
# the ten strongest pairs (by absolute value) are displayed.
corr_matrix = (
    df
    .select_dtypes(include=['float64', 'int'])
    .corr(method='pearson')
)
tidy_corr_matrix(corr_matrix).head(10)

In [None]:
# Correlation-matrix heatmap
# ==============================================================================
fig, ax = plt.subplots(figsize=(4, 4), nrows=1, ncols=1)

# Annotated cells, no colour bar, diverging palette centred on 0 over [-1, 1].
heatmap_style = dict(
    annot=True,
    annot_kws={"size": 8},
    cbar=False,
    vmin=-1,
    vmax=1,
    center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
)
sns.heatmap(corr_matrix, ax=ax, **heatmap_style)

# Rotate the x tick labels by 45 degrees and right-align them.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

ax.tick_params(labelsize=10)

In [None]:
# Distribution plot for each numeric variable
# ==============================================================================
columnas_numeric = df.select_dtypes(include=['float64', 'int']).columns

# Size the grid from the number of numeric columns (two plots per row) instead
# of assuming exactly four, so additional columns cannot overflow the axes.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(columnas_numeric) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    nrows   = n_grid_rows,
    ncols   = n_grid_cols,
    figsize = (7, 2 * n_grid_rows),
    squeeze = False  # always a 2-D array, even with a single row
)
axes = axes.ravel()

# Colours of the active matplotlib cycle, reused cyclically when there are
# more columns than colours.
cycle_colors = [entry["color"] for entry in plt.rcParams['axes.prop_cycle']]

for i, colum in enumerate(columnas_numeric):
    sns.histplot(
        data    = df,
        x       = colum,
        stat    = "count",
        kde     = True,
        color   = cycle_colors[i % len(cycle_colors)],
        line_kws= {'linewidth': 2},
        alpha   = 0.3,
        ax      = axes[i]
    )
    axes[i].set_title(colum, fontsize = 8, fontweight = "bold")
    axes[i].tick_params(labelsize = 8)
    axes[i].set_xlabel("")
    axes[i].set_ylabel("")

# Remove the axes that were left without a plot
for empty_ax in axes[len(columnas_numeric):]:
    fig.delaxes(empty_ax)

fig.tight_layout()
fig.subplots_adjust(top = 0.9)
fig.suptitle('Distribución variables numéricas', fontsize = 10, fontweight = "bold");

# Models

## Generic Classes

In [9]:
from dataclasses import dataclass, field
from typing import Optional, Protocol
from pydantic import BaseModel


class Model(BaseModel):
    """Pydantic model that declares no fields of its own; it is the return type named by `CreateModel.create_model`."""
    ...


class CreateModel(Protocol):
    """Structural interface for objects that expose a `create_model()` method returning a `Model`."""
    # The body is `...` because the protocol only declares the method signature.
    def create_model(self) -> Model: ...



## Linear regression

Based on: *Regresión lineal con Python* by Joaquín Amat Rodrigo, available under an Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0 DEED) license at https://cienciadedatos.net/documentos/py10-regresion-lineal-python.html



### Import libraries

In [10]:
# Tratamiento de datos
# ==============================================================================
import pandas as pd
import numpy as np

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns
## TODO: Solo para Jupyter Notebooks
%matplotlib ipympl

# Preprocesado y modelado
# ==============================================================================
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
from scipy import stats

# Matplotlib configuration
# ==============================================================================
plt.style.use('ggplot')

# Warnings configuration
# ==============================================================================
# NOTE(review): 'ignore' with no category silences every warning raised after
# this cell runs, library diagnostics included — confirm this is intended.
import warnings
warnings.filterwarnings('ignore')

### Data treatment

In [11]:
# Train / test split for the linear regression
# ==============================================================================

# Integer codes for the categorical columns (code = position in the list).
variety_mapping = {
    name: code
    for code, name in enumerate(
        ['Caturra', 'Bourbon', 'Catuai', 'Other', 'Typica', 'Pacamara']
    )
}
rank_mapping = dict(first=0, second=1, third=2, other=3)
country_mapping = {
    name: code
    for code, name in enumerate(
        ['bolivia', 'brazil', 'colombia', 'el-salvador', 'honduras', 'nicaragua']
    )
}

# Target and predictors under the names used by the model formula.
# `Series.map` leaves NaN for any value that is not a key of its mapping.
df['high_bid'] = df['High_Bid_v2']
df['year'] = df['Year']
df['score'] = df['Score_v2']
df['size_30kg_boxes'] = df['Size_30Kg_boxes_v2']
df['variety'] = df['Variety_v2'].map(variety_mapping)
df['rank'] = df['Rank_v2'].map(rank_mapping)
df['country'] = df['Country_v2'].map(country_mapping)

usecols = ['year', 'score', 'size_30kg_boxes', 'variety', 'rank', 'country']
X = df[usecols]
y = df['high_bid']

# 80 % of the rows go to training; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=1234, shuffle=True
)

# Training predictors and target side by side: this is the frame handed to the
# formula-based model in the "Model Creation" cell.
df_train = pd.concat([X_train, y_train], axis=1)



### Model Creation

In [None]:
# Model fitted through the statsmodels formula API
# ==============================================================================
# The formula supplies the intercept itself and `C(...)` marks the
# integer-coded columns as categorical, so no explicit column of 1s is added
# to `X_train`: such a column never reached the model (which is built from
# `df_train`) and would leave `X_train` and `X_test` with different columns.
model = ols(
    'high_bid ~ year + score + size_30kg_boxes + C(variety) + C(rank) + C(country)',
    data=df_train,
).fit()
print(model.summary())




### Model evaluation

In [None]:
# 95 % confidence intervals of the fitted coefficients, with labelled bounds.
ci_intervals = model.conf_int(alpha=0.05).set_axis(['2.5%', '97.5%'], axis=1)
ci_intervals

In [None]:
# Error diagnostics: training residuals and test-set MAE
# ==============================================================================
# Residuals are computed as prediction minus observed value on the training rows.
train_prediction = model.predict(X_train)
waste = train_prediction - y_train

# Mean absolute error on the held-out rows.
prediction_test = model.predict(X_test)
mae = mean_absolute_error(y_test, prediction_test)

verbosity(f"The value of variable mae is: {mae}", tl=tl_ini, level='notif')



In [None]:
# Residual diagnostic plots
# ==============================================================================
fig, axes = plt.subplots(figsize=(9, 8), nrows=3, ncols=2)
(ax_fit, ax_index), (ax_hist, ax_qq), (ax_spread, ax_spare) = axes

# Predicted vs. observed values, with the identity line as reference
observed_range = [y_train.min(), y_train.max()]
ax_fit.scatter(y_train, train_prediction, edgecolors=(0, 0, 0), alpha=0.4)
ax_fit.plot(observed_range, observed_range, 'k--', lw=2)
ax_fit.set_title('Valor predicho vs valor real', fontsize=10)
ax_fit.set_xlabel('Real')
ax_fit.set_ylabel('Predicción')
ax_fit.tick_params(labelsize=7)

# Residuals in row order, around the zero line
ax_index.scatter(list(range(len(y_train))), waste, edgecolors=(0, 0, 0), alpha=0.4)
ax_index.axhline(y=0, linestyle='--', color='black', lw=2)
ax_index.set_title('Residuos del modelo', fontsize=10)
ax_index.set_xlabel('id')
ax_index.set_ylabel('Residuo')
ax_index.tick_params(labelsize=7)

# Density histogram of the residuals with a KDE overlay
sns.histplot(
    data=waste,
    stat="density",
    kde=True,
    line_kws={'linewidth': 1},
    color="firebrick",
    alpha=0.3,
    ax=ax_hist,
)
ax_hist.set_title('Distribución residuos del modelo', fontsize=10)
ax_hist.set_xlabel("Residuo")
ax_hist.tick_params(labelsize=7)

# Q-Q plot of the residuals
sm.qqplot(
    waste,
    fit=True,
    line='q',
    ax=ax_qq,
    color='firebrick',
    alpha=0.4,
    lw=2,
)
ax_qq.set_title('Q-Q residuos del modelo', fontsize=10)
ax_qq.tick_params(labelsize=7)

# Residuals against the predicted value
ax_spread.scatter(train_prediction, waste, edgecolors=(0, 0, 0), alpha=0.4)
ax_spread.axhline(y=0, linestyle='--', color='black', lw=2)
ax_spread.set_title('Residuos del modelo vs predicción', fontsize=10)
ax_spread.set_xlabel('Predicción')
ax_spread.set_ylabel('Residuo')
ax_spread.tick_params(labelsize=7)

# The sixth panel is not used: remove it from the figure
fig.delaxes(ax_spare)

fig.tight_layout()
plt.subplots_adjust(top=0.9)
fig.suptitle('Diagnóstico residuos', fontsize=12);

## Decision Tree

### Import libraries

In [16]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt

# Preprocesado y modelado
# ------------------------------------------------------------------------------

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.tree import export_graphviz
from sklearn.tree import export_text
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Warnings configuration
# ------------------------------------------------------------------------------
# 'once' is inserted ahead of any filter installed earlier in the session
# (such as the 'ignore' filter of the linear-regression import cell), so it
# takes precedence from this cell onwards.
import warnings
warnings.filterwarnings('once')

### Data treatment

In [None]:
SEED = 100
usecols = ['year', 'score', 'size_30kg_boxes', 'variety', 'rank', 'country']

# (Re)create the modelling columns from the raw ones. The categorical columns
# are copied with their original values: they are one-hot encoded further down.
column_sources = {
    'high_bid': 'High_Bid_v2',
    'year': 'Year',
    'score': 'Score_v2',
    'size_30kg_boxes': 'Size_30Kg_boxes_v2',
    'variety': 'Variety_v2',
    'rank': 'Rank_v2',
    'country': 'Country_v2',
}
for target_col, source_col in column_sources.items():
    df[target_col] = df[source_col]

df_train_dt = df[usecols]

# Train / test split (scikit-learn's default proportions, seeded with SEED)
# ------------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    df_train_dt, df['high_bid'], random_state=SEED
)

def get_dummies_and_concat(df:pd.DataFrame, cols_name_list:list)->pd.DataFrame:
    """
    Replace the given columns with their one-hot (dummy) encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols_name_list : list
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        New frame holding the untouched columns of ``df`` followed by the
        dummy columns (named ``<column>_<value>``) of every listed column.

    Notes
    -----
    Passing ``columns=`` makes pandas encode every listed column whatever its
    dtype. Encoding the sub-frame ``df[cols_name_list]`` instead leaves
    numeric columns as they are, and dropping the listed names afterwards then
    removes them from the result without any dummy taking their place.
    """
    return pd.get_dummies(df, columns=list(cols_name_list))

categorical_variables = ['variety', 'rank', 'country']

X_train = get_dummies_and_concat(X_train, categorical_variables)
X_test = get_dummies_and_concat(X_test, categorical_variables)
df_train_dt = get_dummies_and_concat(df_train_dt, categorical_variables)

# Train and test are encoded separately, so a category present in only one of
# the splits would leave them with different columns. Align the test matrix to
# the training columns (missing dummies filled with 0, extra ones dropped) so
# the fitted model always receives the features it was trained on.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

X_test.head()


### Model creation

In [None]:
# Regression tree limited to depth 3, seeded with SEED
# ------------------------------------------------------------------------------
modelo = DecisionTreeRegressor(max_depth=3, random_state=SEED)

# Fit on the encoded training split
modelo.fit(X_train, y_train)


In [None]:
# Structure of the fitted tree
# ------------------------------------------------------------------------------
fig, ax = plt.subplots(figsize=(12, 5))

print(f"Profundidad del árbol: {modelo.get_depth()}")
print(f"Número de nodos terminales: {modelo.get_n_leaves()}")

# Node labels use the feature names recorded by the estimator at fit time, so
# they always match the columns the tree was trained on (rather than whatever
# columns the test frame happens to have). `class_names` is not passed because
# it only applies to classifiers.
plot = plot_tree(
            decision_tree = modelo,
            feature_names = list(modelo.feature_names_in_),
            filled        = True,
            impurity      = False,
            fontsize      = 10,
            precision     = 2,
            ax            = ax
)


In [None]:
# Text rendering of the tree rules. Feature names are taken from the fitted
# estimator so they match the columns used for training.
texto_modelo = export_text(
                    decision_tree = modelo,
                    feature_names = list(modelo.feature_names_in_)
               )
print(texto_modelo)

### Measuring the importance of predictors


In [None]:
# One row per feature the tree was fitted on. The names come from the fitted
# estimator, so they are guaranteed to line up with `feature_importances_`.
importancia_predictores = pd.DataFrame(
                            {'predictor': modelo.feature_names_in_,
                             'importancia': modelo.feature_importances_}
                            )
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")
importancia_predictores.sort_values('importancia', ascending=False)

### Model evaluation

In [None]:
# Test error of the initial model
#-------------------------------------------------------------------------------
predicciones = modelo.predict(X = X_test)

# `mean_squared_error` returns the MSE; take its square root so the value
# stored in `rmse` and printed below really is the root mean squared error.
rmse = mean_squared_error(
        y_true  = y_test,
        y_pred  = predicciones,
       ) ** 0.5
print(f"El error (rmse) de test es: {rmse}")

mae = mean_absolute_error(
        y_true  = y_test,
        y_pred  = predicciones,
       )
print(f"El error (mae) de test es: {mae}")