# Look at the big picture.


## Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer

from datetime import datetime
from statistics import median
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
%load_ext pycodestyle_magic

The pycodestyle_magic extension is already loaded. To reload it, use:
  %reload_ext pycodestyle_magic


In [6]:
# Activamos las alertas de estilo
%pycodestyle_off

ValueError: Function <bound method VarWatcher.auto_run_pycodestyle of <pycodestyle_magic.VarWatcher object at 0x0000026359596AF0>> is not registered as a post_run_cell callback

## Performance Metric

In [57]:
def performance(y_true, y_pred) -> float:
    """

    """

    CF = sum([1 for _ in y_true if y_pred > y_true]) / len(y_pred) * 100
    RMSE = mean_squared_error(y_true, y_pred)
    rRMSE = RMSE / median(y_true)
    # metric = (0.7 * rRMSE) + (0.3 * (1 - CF))

    return rRMSE

performance([1,2],[1,2])

TypeError: 'numpy.float64' object is not callable

In [98]:
data_final.describe()



Unnamed: 0,visitas,precio,antiguedad,unidades_vendidas
count,4045022.0,4045022.0,4045022.0,4045022.0
mean,172.1371,35.88408,964.4095,4.693434
std,688.9116,26.67653,640.3595,22.37403
min,0.0,0.0,126.0,0.0
25%,7.0,15.78,580.0,0.0
50%,35.0,28.92,795.0,0.0
75%,130.0,48.66,1114.0,3.0
max,120045.0,175.78,5310.0,4881.0


# Get the data

In [3]:
dtypes = {
    "fecha": "str",
    "id": "category",
    "visitas": "Int64",
    "categoria_uno": "category",
    "categoria_dos": "category",
    "estado": "category",
    "precio": "Float64",
    "dia_atipico": "category",
    "campaña": "category",
    "antiguedad": "Int64",
    "unidades_vendidas": "Int64"    
}

# Read the data
data = pd.read_csv('./data/Modelar_UH2021.txt', delimiter="|", dtype=dtypes, decimal=',')

## Overview the data

In [4]:
data.head()

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas
0,1/6/2015 0:00:00,21972,0,C,75,No Rotura,,0,0,5241.0,0
1,1/6/2015 0:00:00,23910,5,C,170,No Rotura,6.07,0,0,5241.0,3
2,1/6/2015 0:00:00,24306,13,A,46,No Rotura,,0,0,,0
3,1/6/2015 0:00:00,24306,13,A,46,No Rotura,,0,0,,0
4,1/6/2015 0:00:00,27144,15,E,230,No Rotura,,0,0,4064.0,0


In [6]:
data.describe()

Unnamed: 0,visitas,precio,antiguedad,unidades_vendidas
count,4045022.0,1402111.0,3170857.0,4045022.0
mean,172.1371,34.24319,1011.114,4.693434
std,688.9116,23.30943,716.2509,22.37403
min,0.0,3.57,126.0,0.0
25%,7.0,16.52,524.0,0.0
50%,35.0,26.89,795.0,0.0
75%,130.0,45.35,1244.0,3.0
max,120045.0,175.78,5310.0,4881.0


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4045022 entries, 0 to 4045021
Data columns (total 11 columns):
 #   Column             Dtype   
---  ------             -----   
 0   fecha              object  
 1   id                 category
 2   visitas            Int64   
 3   categoria_uno      category
 4   categoria_dos      category
 5   estado             category
 6   precio             float64 
 7   dia_atipico        category
 8   campaña            category
 9   antiguedad         Int64   
 10  unidades_vendidas  Int64   
dtypes: Int64(3), category(6), float64(1), object(1)
memory usage: 196.9+ MB


# Discover and visualize the data to gain insights

# Prepare the data for Machine Learning algorithms

## Preprocessing functions 

### Time format

In [8]:
class time_format(BaseEstimator, TransformerMixin):

    def __init__(self, document='Modelar'):
        self.document = document

    def fit(self, X, y=None):
        # nothing else to do
        return self


    def transform(self, data):
        """
        Take the dataframe and trate the 'fecha' attribute depending of the document.
        If it is the 'Modelar' document, the variable will be equal to the content
        before the first space.
        If it it the 'Estimar' document, the variable will be transformed to the
        original format

        Original Format: 'DD/MM/AAAA' in (text format)


        """

        temp = ''

        if self.document == 'Modelar':
            data['fecha'] = pd.to_datetime(data['fecha'],infer_datetime_format=True).apply(lambda x : x.strftime('%d/%m/%Y'))
        elif self.document == 'Estimar':
            # agregar la modificación para el documento Estimar
            data['fecha']
        else:
            print('Unknown document!!!')


        return data

### Scaler

In [9]:
class scaler(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # nothing else to do
        return self


    def transform(self, data):
        """
        Take the dataframe and return the dataframe without duplicates.

        """

        return data.drop_duplicates()

### Data Preprocessing Pipeline

In [13]:
data_prep_pipeline = Pipeline([
         ('time_format', time_format(document='Modelar'))
])


data_prepared = data_prep_pipeline.fit_transform(data)

## Missing Values

In [12]:
## Get the missing values per attribute
data.isnull().sum()

fecha                      0
id                         0
visitas                    0
categoria_uno              0
categoria_dos           5844
estado                     0
precio               2642911
dia_atipico                0
campaña                    0
antiguedad            874165
unidades_vendidas          0
dtype: int64

In [14]:
data_final = data_prepared.copy()
data_final['fecha'] = data_final['fecha'].apply( lambda i : datetime.strptime(i , "%d/%m/%Y"))  
data_final.sort_values(by=['fecha'], inplace=True, ascending=True)
data_final = data_final.reset_index(drop=True)

In [15]:
last_values = dict([ (i, 0) for i in data_final.id.unique() ])

df_index = 0

for index, value in zip(data_final.id, data_final.precio):
    if pd.notna(float(value)):
        last_values[index] = value
    else:
        data_final.at[df_index, 'precio'] = last_values[index]
   
    df_index += 1

Imputar la variable antiguedad por la media y la categoría dos por 0(de Momento).

In [16]:
# Get the median value
median = data_final['antiguedad'].median()

# Fill the NA Values with the median
data_final['antiguedad'].fillna(median, inplace=True)
#data_final['categoria_dos'].fillna(, inplace=True) # De momento dejadlo así, ya estoy buscando la manera de hacerlo con clustering


In [17]:
#last_values = dict([ (i, 0) for i in data_final.id.unique() ])
list_values = []
df_index = 0
null_count = 0

for index, value in zip(data_final.id, data_final.categoria_dos):
    if  pd.notna(float(value)):
        # last_values[index] = value
        list_values.append(value)
    else:
        null_count += 1
        list_values.append(list_values[len(list_values)-1])
   
    df_index += 1
    
print(len(list_values), null_count)
data_final['categoria_dos'] = np.array(list_values)
data_final['categoria_dos'] = data_final['categoria_dos'].astype('category')

4045022 5844


In [18]:
data_final['dia_atipico'] = data_final['dia_atipico'].replace(['0', '1', '-1'], ["Venta_Normal", "Venta_alta", "Venta_Baja"])

In [19]:
data_final.dia_atipico

0          Venta_Normal
1          Venta_Normal
2          Venta_Normal
3          Venta_Normal
4          Venta_Normal
               ...     
4045017    Venta_Normal
4045018    Venta_Normal
4045019    Venta_Normal
4045020    Venta_Normal
4045021    Venta_Normal
Name: dia_atipico, Length: 4045022, dtype: object

### One Hot Encoding

In [25]:
# concatenamos las dos columnas 
#data_final['categoria'] = data_final["categoria_uno"].str.cat(data_final.categoria_dos, sep ="") 
#data_final['categoria']
#data_final.drop(columns=['categoria_uno', 'categoria_dos'], inplace = True)
#data_final

In [80]:
from sklearn.compose import ColumnTransformer

y = data_final["unidades_vendidas"].copy()

df = data_final.loc[:, ~data_final.columns.isin(['fecha', 'id', 'unidades_vendidas'])]
num_attribs = ['visitas', 'precio', 'antiguedad']
cat_attribs = ['estado','dia_atipico','categoria_uno','categoria_dos']

full_pipeline = ColumnTransformer([
    ("num", Normalizer(), num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),])

df_prepared = full_pipeline.fit_transform(df)

In [83]:
df_prepared

<4045022x204 sparse matrix of type '<class 'numpy.float64'>'
	with 27455498 stored elements in Compressed Sparse Row format>

data_model = data_final.join(enc_df)

data_model = data_model.drop(columns=['estado','dia_atipico','categoria_uno','categoria_dos'])

data_model

In [84]:
y

0           0
1           0
2           0
3           0
4           0
           ..
4045017    18
4045018     0
4045019     0
4045020     0
4045021     0
Name: unidades_vendidas, Length: 4045022, dtype: Int64

In [25]:
X = df_prepared


0           0
1           0
2           0
3           0
4           0
           ..
4045017    18
4045018     0
4045019     0
4045020     0
4045021     0
Name: unidades_vendidas, Length: 4045022, dtype: int32

In [85]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [87]:
import xgboost as xgb

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=10, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=10, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [88]:
preds = xg_reg.predict(X_test)

In [89]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 20.096761


In [90]:
per = performance(y_test, preds)
print("RMSE: %f" % (per))

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [70]:
# model = linear_model.LogisticRegression()
# model.fit(X, y)

KeyboardInterrupt: 

In [None]:
some_data = data_final.iloc[:5]
some_labels = y.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", model.predict(some_data_prepared))

In [None]:
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# predictions = model.predict(test_X)
# print(predictions)[0:5]

## Outliers

No debe haber outliers


## Preparation of Test Dataset

In [None]:
dtypes = {
    "fecha": "str",
    "id": "category",
    "visitas": "Int64",
    "categoria_uno": "category",
    "categoria_dos": "category",
    "estado": "category",
    "precio": "Float64",
    "dia_atipico": "category",
    "campaña": "category",
    "antiguedad": "object" 
}

# Read the data
# data_test = pd.read_csv('./data/Estimar_UH2021.txt', delimiter="|", dtype=dtypes, decimal=',')

In [None]:
# data_test.dtypes

In [None]:
# data_test.head()

In [None]:
# data_test.describe()

In [None]:
# data_test.info()

### Data Preprocessing Pipeline

In [None]:
"""data_prep_pipeline = Pipeline([
         ('drop_dup', drop_dup()),
         ('time_format', time_format(document='Modelar'))
])


data_test_prepared = data_prep_pipeline.fit_transform(data_test)"""

In [None]:
# data_test_prepared.isnull().sum()

In [None]:
# data_test_prepared = data_prepared.copy()
# data_test_prepared['fecha'] = data_test_prepared['fecha'].apply( lambda i : datetime.strptime(i , "%d/%m/%Y"))  
# data_test_prepared.sort_values(by=['fecha'], inplace=True, ascending=True)
# data_test_prepared = data_test_prepared.reset_index(drop=True)

In [None]:
# data_test_prepared['antiguedad'] = data_test_prepared['antiguedad'].replace('-', np.nan)
# data_test_prepared['antiguedad'] = pd.to_numeric(data_test_prepared['antiguedad'], errors='coerce')

# Get the median value
# median = data_test_prepared['antiguedad'].median()

# Fill the NA Values with the median
# data_test_prepared['antiguedad'].fillna(median, inplace=True)
#data_final['categoria_dos'].fillna(, inplace=True) # De momento dejadlo así, ya estoy buscando la manera de hacerlo con clustering

In [None]:
# data_test_prepared.antiguedad

In [None]:
# data_test_prepared['dia_atipico'] = data_test_prepared['dia_atipico'].replace(['0', '1', '-1'], ["Venta_Normal", "Venta_alta", "Venta_Baja"])

In [None]:
# data_test_prepared.dia_atipico

In [None]:
# encoder_test = OneHotEncoder()

# enc_df_test = pd.DataFrame(encoder_test.fit_transform(data_test_prepared[['estado','dia_atipico','categoria_uno','categoria_dos']]).toarray())

# data_model_test = data_test_prepared.join(enc_df_test)

# data_model_test = data_model_test.drop(columns=['estado','dia_atipico','categoria_uno','categoria_dos'])1

# scaler = StandardScaler()
# data_model_test_scaler = scaler.fit_transform(data_model_test)

# data_model_test

# Select a model and train it

### Regresion Logistica 

In [None]:
#data_model['fecha'] = data_model['fecha'].astype('datetime64').astype(int).astype(float)
#data_model_test['fecha'] = data_model_test['fecha'].astype('datetime64').astype(int).astype(float)

In [None]:
# train_X = np.array(data_model.drop(['fecha','unidades_vendidas'], 1))
# train_y = np.array(data_model['unidades_vendidas'])
# train_X.shape

In [None]:
#from sklearn import utils
#lab_enc = preprocessing.LabelEncoder()
#training_y_encoded = lab_enc.fit_transform(train_y)
#print(training_y_encoded)
#print(utils.multiclass.type_of_target(train_y))
#print(utils.multiclass.type_of_target(train_y.astype('int')))
#print(utils.multiclass.type_of_target(training_y_encoded))

In [None]:
#train_X.shape

In [None]:
#training_y_encoded

In [None]:
#model = linear_model.LogisticRegression()
#model.fit(train_X, training_y_encoded)

In [None]:
####test_X = np.array(data_model_test.drop(['fecha'], 1))
###test_X.shape

##predictions = model.predict(test_X)
#print(predictions)[0:5]

# Fine-tune your model

# Present your solution