# Look at the big picture.


## Libraries

In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from datetime import datetime
from statistics import median
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
%load_ext pycodestyle_magic

In [3]:
# Activamos las alertas de estilo
%pycodestyle_on

## Performance Metric

In [6]:
def performance(y_true: list, y_pred: list) -> float:
    """

    """

    CF = [1 for _ in y_true if y_pred > y_true] / len(y_pred) * 100
    RMSE = mean_squared_errors(y_true, y_pred)
    sEMSE = RMSE / median(y_true)
    metric = (0.7 * rRMSE) + (0.3 * (1 - CF))

    return metric

# Get the data

In [7]:
dtypes = {
    "fecha": "string",
    "id": "category",
    "visitas": "Int64",
    "categoria_uno": "category",
    "categoria_dos": "category",
    "estado": "category",
    "precio": "Float64",
    "dia_atipico": "category",
    "campaña": "category",
    "antiguedad": "Int64",
    "unidades_vendidas": "Int64"    
}

# Read the data
data = pd.read_csv('./data/Modelar_UH2021.txt', delimiter="|", dtype=dtypes, decimal=',')

## Overview the data

In [8]:
data.head()

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas
0,1/6/2015 0:00:00,21972,0,C,75,No Rotura,,0,0,5241.0,0
1,1/6/2015 0:00:00,23910,5,C,170,No Rotura,6.07,0,0,5241.0,3
2,1/6/2015 0:00:00,24306,13,A,46,No Rotura,,0,0,,0
3,1/6/2015 0:00:00,24306,13,A,46,No Rotura,,0,0,,0
4,1/6/2015 0:00:00,27144,15,E,230,No Rotura,,0,0,4064.0,0


In [6]:
data.describe()

Unnamed: 0,visitas,precio,antiguedad,unidades_vendidas
count,4045022.0,1402111.0,3170857.0,4045022.0
mean,172.1371,34.24319,1011.114,4.693434
std,688.9116,23.30943,716.2509,22.37403
min,0.0,3.57,126.0,0.0
25%,7.0,16.52,524.0,0.0
50%,35.0,26.89,795.0,0.0
75%,130.0,45.35,1244.0,3.0
max,120045.0,175.78,5310.0,4881.0


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4045022 entries, 0 to 4045021
Data columns (total 11 columns):
 #   Column             Dtype   
---  ------             -----   
 0   fecha              string  
 1   id                 category
 2   visitas            Int64   
 3   categoria_uno      category
 4   categoria_dos      category
 5   estado             category
 6   precio             float64 
 7   dia_atipico        category
 8   campaña            category
 9   antiguedad         Int64   
 10  unidades_vendidas  Int64   
dtypes: Int64(3), category(6), float64(1), string(1)
memory usage: 196.9 MB


# Discover and visualize the data to gain insights

# Prepare the data for Machine Learning algorithms

In [9]:
## Get the missing values per attribute
data.isnull().sum()

fecha                      0
id                         0
visitas                    0
categoria_uno              0
categoria_dos           5844
estado                     0
precio               2642911
dia_atipico                0
campaña                    0
antiguedad            874165
unidades_vendidas          0
dtype: int64

## Preprocessing functions 

### Time format

In [10]:
class time_format(BaseEstimator, TransformerMixin):

    def __init__(self, document='Modelar'):
        self.document = document

    def fit(self, X, y=None):
        # nothing else to do
        return self


    def transform(self, data):
        """
        Take the dataframe and trate the 'fecha' attribute depending of the document.
        If it is the 'Modelar' document, the variable will be equal to the content
        before the first space.
        If it it the 'Estimar' document, the variable will be transformed to the
        original format

        Original Format: 'DD/MM/AAAA' in (text format)


        """

        temp = ''

        if self.document == 'Modelar':
            data['fecha'] = pd.to_datetime(data['fecha'],infer_datetime_format=True).apply(lambda x : x.strftime('%d/%m/%Y'))
        elif self.document == 'Estimar':
            # agregar la modificación para el documento Estimar
            data['fecha']
        else:
            print('Unknown document!!!')


        return data

### Drop duplicates

In [11]:
class drop_dup(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # nothing else to do
        return self


    def transform(self, data):
        """
        Take the dataframe and return the dataframe without duplicates.

        """

        return data.drop_duplicates()

### Data Preprocessing Pipeline

In [55]:
data_prep_pipeline = Pipeline([
         ('drop_dup', drop_dup()),
         ('time_format', time_format(document='Modelar'))
])


data_prepared = data_prep_pipeline.fit_transform(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['fecha'] = pd.to_datetime(data['fecha'],infer_datetime_format=True).apply(lambda x : x.strftime('%d/%m/%Y'))


## Missing Values

In [74]:
data_final = data_prepared.copy()
data_final['fecha'] = data_final['fecha'].apply( lambda i : datetime.strptime(i , "%d/%m/%Y"))  
data_final.sort_values(by=['fecha'], inplace=True, ascending=True)
data_final.reset_index(drop=True)

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas
0,2015-01-06,21972,0,C,75,No Rotura,,0,0,5241,0
1,2015-01-06,327312,12,A,236,Rotura,,0,0,580,0
2,2015-01-06,327330,7,A,236,Rotura,,0,0,580,0
3,2015-01-06,327348,16,A,236,Rotura,,0,0,580,0
4,2015-01-06,327380,2,L,131,Rotura,,0,0,580,0
...,...,...,...,...,...,...,...,...,...,...,...
2040032,2016-12-09,327262,40,A,236,Rotura,,0,0,580,0
2040033,2016-12-09,327036,45,A,127,Rotura,,0,0,,0
2040034,2016-12-09,326868,10,C,72,No Rotura,8.25,0,0,581,3
2040035,2016-12-09,326838,0,K,7,Rotura,,0,0,,0


In [75]:
data_final[data_final['id'] == str(325846)]

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas
5367,2015-01-06,325846,19,F,336,Transito,72.74,0,0,,3
254547,2015-01-07,325846,13,F,336,Rotura,,0,0,,0
512033,2015-01-08,325846,22,F,336,No Rotura,,0,0,,0
769519,2015-01-09,325846,20,F,336,No Rotura,,0,0,,0
1018699,2015-01-10,325846,95,F,336,No Rotura,68.87,0,0,,12
...,...,...,...,...,...,...,...,...,...,...,...
2870937,2016-12-05,325846,0,F,336,No Rotura,,0,0,,0
3128423,2016-12-06,325846,0,F,336,No Rotura,,0,0,,0
3377603,2016-12-07,325846,0,F,336,Rotura,,0,0,,0
3635089,2016-12-08,325846,10,F,336,Rotura,,0,0,,0


In [None]:
last_values = dict([ (i, 0) for i in data_final.id.unique() ])

df_index = 0

for index, value in zip(data_final.id, data_final.precio):
    if pd.notna(float(value)):
        last_values[index] = value
    else:
        data_final.at[df_index, 'precio'] = last_values[index]
   
    df_index += 1

In [77]:
data_final[data_final['id'] == str(325846)]

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas
5367,2015-01-06,325846,19,F,336,Transito,72.74,0,0,,3
254547,2015-01-07,325846,13,F,336,Rotura,,0,0,,0
512033,2015-01-08,325846,22,F,336,No Rotura,,0,0,,0
769519,2015-01-09,325846,20,F,336,No Rotura,,0,0,,0
1018699,2015-01-10,325846,95,F,336,No Rotura,68.87,0,0,,12
...,...,...,...,...,...,...,...,...,...,...,...
2870937,2016-12-05,325846,0,F,336,No Rotura,,0,0,,0
3128423,2016-12-06,325846,0,F,336,No Rotura,,0,0,,0
3377603,2016-12-07,325846,0,F,336,Rotura,,0,0,,0
3635089,2016-12-08,325846,10,F,336,Rotura,,0,0,,0


Inputar la variable antiguedad por la media.

In [None]:
data_final['test'] = df['precio'].apply(lambda x : )

In [None]:


median = data['antiguedad'].median()

data['antiguedad'].fillna(median, inplace=True)



In [20]:
data_final[data_final['id'] == str(325798)]

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas
5365,2015-01-06,325798,10,E,323,No Rotura,,0,0,587,0
254545,2015-01-07,325798,24,E,323,No Rotura,,0,0,587,0
512031,2015-01-08,325798,35,E,323,No Rotura,33.45,0,0,587,9
769517,2015-01-09,325798,5,E,323,No Rotura,,0,0,587,0
1018697,2015-01-10,325798,1,E,323,No Rotura,,0,0,587,0
...,...,...,...,...,...,...,...,...,...,...,...
2870935,2016-12-05,325798,0,E,323,No Rotura,,0,0,587,0
3128421,2016-12-06,325798,85,E,323,No Rotura,29.51,0,0,587,6
3377601,2016-12-07,325798,75,E,323,Rotura,,0,0,587,0
3635087,2016-12-08,325798,15,E,323,Rotura,,0,0,587,0


## Outliers

No debe haber outliers


# Select a model and train it

# Fine-tune your model

# Present your solution