# Look at the big picture.


## Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from datetime import datetime
from statistics import median
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
%load_ext pycodestyle_magic

In [3]:
# Activamos las alertas de estilo
%pycodestyle_on

## Performance Metric

In [31]:
def performance(y_true: list, y_pred: list) -> float:
    """

    """

    CF = [1 for _ in y_true if y_pred > y_true] / len(y_pred) * 100
    RMSE = mean_squared_errors(y_true, y_pred)
    sEMSE = RMSE / median(y_true)
    metric = (0.7 * rRMSE) + (0.3 * (1 - CF))

    return metric

# Get the data

In [32]:
dtypes = {
    "fecha": "string",
    "id": "category",
    "visitas": "Int64",
    "categoria_uno": "category",
    "categoria_dos": "category",
    "estado": "category",
    "precio": "Float64",
    "dia_atipico": "category",
    "campaña": "category",
    "antiguedad": "Int64",
    "unidades_vendidas": "Int64"    
}

# Read the data
data = pd.read_csv('./data/Modelar_UH2021.txt', delimiter="|", dtype=dtypes, decimal=',')

## Overview the data

In [33]:
data.head()

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas
0,1/6/2015 0:00:00,21972,0,C,75,No Rotura,,0,0,5241.0,0
1,1/6/2015 0:00:00,23910,5,C,170,No Rotura,6.07,0,0,5241.0,3
2,1/6/2015 0:00:00,24306,13,A,46,No Rotura,,0,0,,0
3,1/6/2015 0:00:00,24306,13,A,46,No Rotura,,0,0,,0
4,1/6/2015 0:00:00,27144,15,E,230,No Rotura,,0,0,4064.0,0


In [5]:
data.describe()

Unnamed: 0,visitas,precio,antiguedad,unidades_vendidas
count,4045022.0,1402111.0,3170857.0,4045022.0
mean,172.1371,34.24319,1011.114,4.693434
std,688.9116,23.30943,716.2509,22.37403
min,0.0,3.57,126.0,0.0
25%,7.0,16.52,524.0,0.0
50%,35.0,26.89,795.0,0.0
75%,130.0,45.35,1244.0,3.0
max,120045.0,175.78,5310.0,4881.0


In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4045022 entries, 0 to 4045021
Data columns (total 11 columns):
 #   Column             Dtype   
---  ------             -----   
 0   fecha              string  
 1   id                 category
 2   visitas            Int64   
 3   categoria_uno      category
 4   categoria_dos      category
 5   estado             category
 6   precio             float64 
 7   dia_atipico        category
 8   campaña            category
 9   antiguedad         Int64   
 10  unidades_vendidas  Int64   
dtypes: Int64(3), category(6), float64(1), string(1)
memory usage: 196.9 MB


# Discover and visualize the data to gain insights

# Prepare the data for Machine Learning algorithms


## Missing values

In [22]:
data_prepared.head()

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas
0,06/01/2015,21972,0,C,75,No Rotura,,0,0,5241.0,0
1,06/01/2015,23910,5,C,170,No Rotura,6.07,0,0,5241.0,3
2,06/01/2015,24306,13,A,46,No Rotura,,0,0,,0
4,06/01/2015,27144,15,E,230,No Rotura,,0,0,4064.0,0
5,06/01/2015,27504,7,C,157,No Rotura,,0,0,5261.0,0


In [14]:
data_prepared.dtypes

fecha                  object
id                   category
visitas                 Int64
categoria_uno        category
categoria_dos        category
estado               category
precio                float64
dia_atipico          category
campaña              category
antiguedad              Int64
unidades_vendidas       Int64
dtype: object

In [41]:
data_final = data_prepared.copy()
data_final['fecha'] = data_final['format_fecha'].apply( lambda i : datetime.strptime(i , "%d/%m/%Y"))  
data_final.sort_values(by=['fecha'], inplace=True, ascending=True)

In [42]:
data_final.head()

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas,format_fecha
0,2015-01-06,21972,0,C,75,No Rotura,,0,0,5241,0,06/01/2015
5409,2015-01-06,327312,12,A,236,Rotura,,0,0,580,0,06/01/2015
5415,2015-01-06,327330,7,A,236,Rotura,,0,0,580,0,06/01/2015
5421,2015-01-06,327348,16,A,236,Rotura,,0,0,580,0,06/01/2015
5427,2015-01-06,327380,2,L,131,Rotura,,0,0,580,0,06/01/2015


In [38]:
data_prepared.head()


Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas,format_fecha
0,1/6/2015 0:00:00,21972,0,C,75,No Rotura,,0,0,5241.0,0,06/01/2015
1,1/6/2015 0:00:00,23910,5,C,170,No Rotura,6.07,0,0,5241.0,3,06/01/2015
2,1/6/2015 0:00:00,24306,13,A,46,No Rotura,,0,0,,0,06/01/2015
4,1/6/2015 0:00:00,27144,15,E,230,No Rotura,,0,0,4064.0,0,06/01/2015
5,1/6/2015 0:00:00,27504,7,C,157,No Rotura,,0,0,5261.0,0,06/01/2015


In [28]:
data_prepared.head(50)

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas
0,06/01/2015,21972,0,C,75,No Rotura,,0,0,5241.0,0
1,06/01/2015,23910,5,C,170,No Rotura,6.07,0,0,5241.0,3
2,06/01/2015,24306,13,A,46,No Rotura,,0,0,,0
4,06/01/2015,27144,15,E,230,No Rotura,,0,0,4064.0,0
5,06/01/2015,27504,7,C,157,No Rotura,,0,0,5261.0,0
6,06/01/2015,30014,5,C,63,No Rotura,6.12,0,0,2954.0,3
7,06/01/2015,30522,0,C,76,No Rotura,,0,0,5241.0,0
8,06/01/2015,31180,11,B,224,No Rotura,8.05,0,0,5310.0,42
9,06/01/2015,34176,8,C,66,No Rotura,,0,0,3970.0,0
10,06/01/2015,35732,8,A,263,No Rotura,26.24,0,0,3069.0,9


def na_val(index):
    
    lista = data_final.precio[data_final['id'] == str(index)].to_list()
    pila = []
    
    for i in
     
    
    return lista
    
na_val(23910)

In [36]:
data_prepared.dtypes

fecha                  object
id                   category
visitas                 Int64
categoria_uno        category
categoria_dos        category
estado               category
precio                float64
dia_atipico          category
campaña              category
antiguedad              Int64
unidades_vendidas       Int64
dtype: object

In [35]:
import math

for index, value in zip(data.id, data.precio):
    if pd.isna(float(value)):
        # na_val(index)
        pass



In [41]:
data.head(50)

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas,precio_solved
1781636,1/1/2016 0:00:00,295520,17,A,82,No Rotura,,-1,0,715.0,0,295520
1780243,1/1/2016 0:00:00,251626,1,F,341,No Rotura,,-1,0,,0,251626
1780244,1/1/2016 0:00:00,251682,39,H,307,No Rotura,,-1,0,,0,251682
1780245,1/1/2016 0:00:00,251682,39,H,307,No Rotura,,-1,0,,0,251682
1780246,1/1/2016 0:00:00,251732,2,H,307,No Rotura,,-1,0,,0,251732
1780247,1/1/2016 0:00:00,251742,6,F,20,No Rotura,,-1,0,980.0,6,251742
1780248,1/1/2016 0:00:00,251754,2,F,20,No Rotura,,-1,0,980.0,3,251754
1780249,1/1/2016 0:00:00,251760,2,F,20,No Rotura,,-1,0,980.0,0,251760
1780250,1/1/2016 0:00:00,251760,2,F,20,No Rotura,,-1,0,980.0,0,251760
1780251,1/1/2016 0:00:00,251772,9,F,20,No Rotura,,-1,0,980.0,0,251772


In [32]:
precio_dict

{'295520': nan,
 '251626': nan,
 '251682': nan,
 '251732': nan,
 '251742': nan,
 '251754': nan,
 '251760': nan,
 '251772': nan,
 '251780': nan,
 '251786': nan,
 '251822': nan,
 '251892': nan,
 '251906': nan,
 '251926': nan,
 '251942': nan,
 '251946': nan,
 '252010': nan,
 '252026': nan,
 '252030': nan,
 '252060': nan,
 '252072': nan,
 '252090': nan,
 '252164': nan,
 '252170': nan,
 '251488': nan,
 '251306': nan,
 '251040': 18.75,
 '251050': 14.42,
 '251382': nan,
 '251076': nan,
 '251126': nan,
 '251156': 22.74,
 '251228': nan,
 '251266': nan,
 '252254': nan,
 '252262': nan,
 '253306': nan,
 '253346': nan,
 '253540': nan,
 '253614': nan,
 '253646': nan,
 '253708': nan,
 '253918': nan,
 '253956': nan,
 '253964': nan,
 '253282': nan,
 '253974': nan,
 '254040': 47.74,
 '254052': nan,
 '253192': nan,
 '253186': nan,
 '252318': 11.86,
 '252322': 11.86,
 '252540': nan,
 '252632': nan,
 '252672': nan,
 '252692': nan,
 '252700': nan,
 '252716': 21.07,
 '252868': nan,
 '252884': 8.29,
 '252888'

In [13]:
## Get the missing values per attribute
data.isnull().sum()

fecha                      0
id                         0
visitas                    0
categoria_uno              0
categoria_dos           4393
estado                     0
precio               1399886
dia_atipico                0
campaña                    0
antiguedad            480815
unidades_vendidas          0
dtype: int64

In [12]:
from sklearn.impute import SimpleImputer

imputer_median = SimpleImputer(strategy="median")

median = data['antiguedad'].median()

data['antiguedad'].fillna(median, inplace=True)



## Preprocessing functions 

### Time format

In [35]:
class time_format(BaseEstimator, TransformerMixin):

    def __init__(self, document='Modelar'):
        self.document = document

    def fit(self, X, y=None):
        # nothing else to do
        return self


    def transform(self, data):
        """
        Take the dataframe and trate the 'fecha' attribute depending of the document.
        If it is the 'Modelar' document, the variable will be equal to the content
        before the first space.
        If it it the 'Estimar' document, the variable will be transformed to the
        original format

        Original Format: 'DD/MM/AAAA' in (text format)


        """

        temp = ''

        if self.document == 'Modelar':
            data['fecha'] = pd.to_datetime(data['fecha'],infer_datetime_format=True).apply(lambda x : x.strftime('%d/%m/%Y'))
        elif self.document == 'Estimar':
            # agregar la modificación para el documento Estimar
            data['fecha']
        else:
            print('Unknown document!!!')


        return data

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4045022 entries, 0 to 4045021
Data columns (total 11 columns):
 #   Column             Dtype   
---  ------             -----   
 0   fecha              string  
 1   id                 category
 2   visitas            Int64   
 3   categoria_uno      category
 4   categoria_dos      category
 5   estado             category
 6   precio             float64 
 7   dia_atipico        category
 8   campaña            category
 9   antiguedad         Int64   
 10  unidades_vendidas  Int64   
dtypes: Int64(3), category(6), float64(1), string(1)
memory usage: 196.9 MB


### Drop duplicates

In [36]:
class drop_dup(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # nothing else to do
        return self


    def transform(self, data):
        """
        Take the dataframe and return the dataframe without duplicates.

        """

        return data.drop_duplicates()

### Data Preprocessing Pipeline

In [37]:
data_prep_pipeline = Pipeline([
         ('drop_dup', drop_dup()),
         ('time_format', time_format(document='Modelar'))
])


data_prepared = data_prep_pipeline.fit_transform(data)
data_prepared.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['format_fecha'] = pd.to_datetime(data['fecha'],infer_datetime_format=True).apply(lambda x : x.strftime('%d/%m/%Y'))


Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas,format_fecha
0,1/6/2015 0:00:00,21972,0,C,75,No Rotura,,0,0,5241.0,0,06/01/2015
1,1/6/2015 0:00:00,23910,5,C,170,No Rotura,6.07,0,0,5241.0,3,06/01/2015
2,1/6/2015 0:00:00,24306,13,A,46,No Rotura,,0,0,,0,06/01/2015
4,1/6/2015 0:00:00,27144,15,E,230,No Rotura,,0,0,4064.0,0,06/01/2015
5,1/6/2015 0:00:00,27504,7,C,157,No Rotura,,0,0,5261.0,0,06/01/2015


In [21]:
data_prepared.head()

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas
1781636,01/01/2016,295520,17,A,82,No Rotura,,-1,0,715.0,0
1780243,01/01/2016,251626,1,F,341,No Rotura,,-1,0,,0
1780244,01/01/2016,251682,39,H,307,No Rotura,,-1,0,,0
1780246,01/01/2016,251732,2,H,307,No Rotura,,-1,0,,0
1780247,01/01/2016,251742,6,F,20,No Rotura,11.33,-1,0,980.0,6


## Outliers

No debe haber outliers


# Select a model and train it

# Fine-tune your model

# Present your solution