# Look at the big picture.


## Libraries

In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from statistics import median

In [13]:
%load_ext pycodestyle_magic

The pycodestyle_magic extension is already loaded. To reload it, use:
  %reload_ext pycodestyle_magic


In [14]:
# Activamos las alertas de estilo
%pycodestyle_on

## Performance Metric

In [2]:
def performance(y_true: list, y_pred: list) -> float:
    """

    """

    CF = [1 for _ in y_true if y_pred > y_true] / len(y_pred) * 100
    RMSE = mean_squared_errors(y_true, y_pred)
    sEMSE = RMSE / median(y_true)
    metric = (0.7 * rRMSE) + (0.3 * (1 - CF))

    return metric

# Get the data

In [142]:
# Read the data
data = pd.read_csv('./data/Modelar_UH2021.txt', delimiter="|")

## Overview the data

In [143]:
data.head()

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas
0,1/6/2015 0:00:00,21972,0,C,75.0,No Rotura,,0,0,5241.0,0
1,1/6/2015 0:00:00,23910,5,C,170.0,No Rotura,607.0,0,0,5241.0,3
2,1/6/2015 0:00:00,24306,13,A,46.0,No Rotura,,0,0,,0
3,1/6/2015 0:00:00,24306,13,A,46.0,No Rotura,,0,0,,0
4,1/6/2015 0:00:00,27144,15,E,230.0,No Rotura,,0,0,4064.0,0


In [5]:
data.describe()

Unnamed: 0,id,visitas,categoria_dos,dia_atipico,campaña,antiguedad,unidades_vendidas
count,4045022.0,4045022.0,4039178.0,4045022.0,4045022.0,3170857.0,4045022.0
mean,274518.0,172.1371,201.6524,0.06365503,0.002801715,1011.114,4.693434
std,83367.19,688.9116,117.9997,0.3597475,0.05285704,716.2509,22.37403
min,21972.0,0.0,2.0,-1.0,0.0,126.0,0.0
25%,212294.0,7.0,82.0,0.0,0.0,524.0,0.0
50%,292672.0,35.0,236.0,0.0,0.0,795.0,0.0
75%,337900.0,130.0,307.0,0.0,0.0,1244.0,3.0
max,458660.0,120045.0,343.0,1.0,1.0,5310.0,4881.0


In [93]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4045022 entries, 0 to 4045021
Data columns (total 11 columns):
 #   Column             Dtype  
---  ------             -----  
 0   fecha              object 
 1   id                 int64  
 2   visitas            int64  
 3   categoria_uno      object 
 4   categoria_dos      float64
 5   estado             object 
 6   precio             object 
 7   dia_atipico        int64  
 8   campaña            int64  
 9   antiguedad         float64
 10  unidades_vendidas  int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 339.5+ MB


# Discover and visualize the data to gain insights

# Prepare the data for Machine Learning algorithms

In [147]:
from sklearn.base import BaseEstimator, TransformerMixin
from datetime import datetime

class time_format(BaseEstimator, TransformerMixin):
    
    def __init__(self, document='Modelar'):
        self.document = document
        
    def fit(self, X, y=None):
        return self  # nothing else to do    
        
    def transform(self, data):
        """
        Take the dataframe and trate the 'fecha' attribute depending of the document.
        If it is the 'Modelar' document, the variable will be equal to the content 
        before the first space. 
        If it it the 'Estimar' document, the variable will be transformed to the 
        original format 

        Original Format: 'DD/MM/AAAA' in (text format)


        """  
        temp = ''

        if self.document == 'Modelar':
            data['fecha'] = pd.to_datetime(data['fecha'], infer_datetime_format=True).apply(lambda x : x.strftime('%d/%m/%Y'))
        elif self.document == 'Estimar':
            data['fecha']    # agregar la modificación para el documento Estimar
        else:
            print('Unknown document!!!')

    
        return data

In [148]:
from sklearn.pipeline import Pipeline

data_prep_pipeline = Pipeline([
         ('time_format', time_format(document='Modelar'))
])

data_prep_pipeline.fit_transform(data)

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas
0,06/01/2015,21972,0,C,75.0,No Rotura,,0,0,5241.0,0
1,06/01/2015,23910,5,C,170.0,No Rotura,607,0,0,5241.0,3
2,06/01/2015,24306,13,A,46.0,No Rotura,,0,0,,0
3,06/01/2015,24306,13,A,46.0,No Rotura,,0,0,,0
4,06/01/2015,27144,15,E,230.0,No Rotura,,0,0,4064.0,0
...,...,...,...,...,...,...,...,...,...,...,...
4045017,30/09/2016,457416,1395,F,336.0,No Rotura,5038,0,0,130.0,9
4045018,30/09/2016,457422,1080,F,336.0,Rotura,,0,0,130.0,0
4045019,30/09/2016,458650,1385,K,340.0,No Rotura,,0,0,126.0,0
4045020,30/09/2016,458660,1915,K,340.0,No Rotura,6849,0,0,126.0,24


In [124]:
data.head()

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas
0,06/01/2015,21972,0,C,75.0,No Rotura,,0,0,5241.0,0
1,06/01/2015,23910,5,C,170.0,No Rotura,607.0,0,0,5241.0,3
2,06/01/2015,24306,13,A,46.0,No Rotura,,0,0,,0
3,06/01/2015,24306,13,A,46.0,No Rotura,,0,0,,0
4,06/01/2015,27144,15,E,230.0,No Rotura,,0,0,4064.0,0


# Select a model and train it

# Fine-tune your model

# Present your solution

## Carga de los datos

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas
0,1/6/2015 0:00:00,21972,0,C,75.0,No Rotura,,0,0,5241.0,0
1,1/6/2015 0:00:00,23910,5,C,170.0,No Rotura,607.0,0,0,5241.0,3
2,1/6/2015 0:00:00,24306,13,A,46.0,No Rotura,,0,0,,0
3,1/6/2015 0:00:00,24306,13,A,46.0,No Rotura,,0,0,,0
4,1/6/2015 0:00:00,27144,15,E,230.0,No Rotura,,0,0,4064.0,0


## Data Cleaning