# Look at the big picture.


## Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import RandomForestRegressor


from datetime import datetime
from statistics import median, mean
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
%load_ext pycodestyle_magic

The pycodestyle_magic extension is already loaded. To reload it, use:
  %reload_ext pycodestyle_magic


In [6]:
# Activamos las alertas de estilo
%pycodestyle_off

ValueError: Function <bound method VarWatcher.auto_run_pycodestyle of <pycodestyle_magic.VarWatcher object at 0x0000026359596AF0>> is not registered as a post_run_cell callback

## Performance Metric

In [2]:
def performance(y_true, y_pred):
    """

    """
    
    CF = sum([1 if p >= t else 0 for t,p in zip(y_true, y_pred) ]) / len(y_pred)
    print('CF: {}'.format(CF))
    
    MSE = mean_squared_error(y_true, y_pred)
    print('MSE: {}'.format(MSE))
    
    RMSE = np.sqrt(MSE)
    print('RMSE: {}'.format(RMSE))
    
    rRMSE = RMSE / mean(y_true)
    print('rRMSE: {}'.format(rRMSE))
    
    metric = (0.7 * rRMSE) + (0.3 * (1 - CF))
    
    return metric



# Get the data

In [3]:
dtypes = {
    "fecha": "str",
    "id": "category",
    "visitas": "Int64",
    "categoria_uno": "category",
    "categoria_dos": "category",
    "estado": "category",
    "precio": "Float64",
    "dia_atipico": "category",
    "campaña": "category",
    "antiguedad": "Int64",
    "unidades_vendidas": "Int64"    
}

# Read the data
data = pd.read_csv('./data/Modelar_UH2021.txt', delimiter="|", dtype=dtypes, decimal=',')

## Overview the data

In [4]:
data.head()

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas
0,1/6/2015 0:00:00,21972,0,C,75,No Rotura,,0,0,5241.0,0
1,1/6/2015 0:00:00,23910,5,C,170,No Rotura,6.07,0,0,5241.0,3
2,1/6/2015 0:00:00,24306,13,A,46,No Rotura,,0,0,,0
3,1/6/2015 0:00:00,24306,13,A,46,No Rotura,,0,0,,0
4,1/6/2015 0:00:00,27144,15,E,230,No Rotura,,0,0,4064.0,0


In [5]:
data.describe()

Unnamed: 0,visitas,precio,antiguedad,unidades_vendidas
count,4045022.0,1402111.0,3170857.0,4045022.0
mean,172.1371,34.24319,1011.114,4.693434
std,688.9116,23.30943,716.2509,22.37403
min,0.0,3.57,126.0,0.0
25%,7.0,16.52,524.0,0.0
50%,35.0,26.89,795.0,0.0
75%,130.0,45.35,1244.0,3.0
max,120045.0,175.78,5310.0,4881.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4045022 entries, 0 to 4045021
Data columns (total 11 columns):
fecha                object
id                   category
visitas              Int64
categoria_uno        category
categoria_dos        category
estado               category
precio               float64
dia_atipico          category
campaña              category
antiguedad           Int64
unidades_vendidas    Int64
dtypes: Int64(3), category(6), float64(1), object(1)
memory usage: 196.9+ MB


# Discover and visualize the data to gain insights

# Prepare the data for Machine Learning algorithms

## Preprocessing functions 

### Time format

In [7]:
class time_format(BaseEstimator, TransformerMixin):

    def __init__(self, document='Modelar'):
        self.document = document

    def fit(self, X, y=None):
        # nothing else to do
        return self


    def transform(self, data):
        """
        Take the dataframe and trate the 'fecha' attribute depending of the document.
        If it is the 'Modelar' document, the variable will be equal to the content
        before the first space.
        If it it the 'Estimar' document, the variable will be transformed to the
        original format

        Original Format: 'DD/MM/AAAA' in (text format)


        """

        temp = ''

        if self.document == 'Modelar':
            data['fecha'] = pd.to_datetime(data['fecha'],infer_datetime_format=True).apply(lambda x : x.strftime('%d/%m/%Y'))
        elif self.document == 'Estimar':
            # agregar la modificación para el documento Estimar
            data['fecha']
        else:
            print('Unknown document!!!')


        return data

### Scaler

In [8]:
class drop_duplicates(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # nothing else to do
        return self


    def transform(self, data):
        """
        Take the dataframe and return the dataframe without duplicates.

        """

        return data.drop_duplicates()

### Data Preprocessing Pipeline

In [9]:
data_prep_pipeline = Pipeline([
         ('drop_dup' , drop_duplicates()),
         ('time_format', time_format(document='Modelar')),
         
])


data_prepared = data_prep_pipeline.fit_transform(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Missing Values

In [10]:
## Get the missing values per attribute
data.isnull().sum()

fecha                      0
id                         0
visitas                    0
categoria_uno              0
categoria_dos           5844
estado                     0
precio               2642911
dia_atipico                0
campaña                    0
antiguedad            874165
unidades_vendidas          0
dtype: int64

In [11]:
data_final = data_prepared.copy()
data_final['fecha'] = data_final['fecha'].apply( lambda i : datetime.strptime(i , "%d/%m/%Y"))  
data_final.sort_values(by=['fecha'], inplace=True, ascending=True)
data_final = data_final.reset_index(drop=True)

In [12]:
# "Precio" imputer
last_values = dict([ (i, 0) for i in data_final.id.unique() ])

df_index = 0

for index, value in zip(data_final.id, data_final.precio):
    if pd.notna(float(value)):
        last_values[index] = value
    else:
        data_final.at[df_index, 'precio'] = last_values[index]
   
    df_index += 1

Imputar la variable antiguedad por la media y la categoría dos por 0(de Momento).

In [13]:
# Get the median value
median = data_final['antiguedad'].median()

# Fill the NA Values with the median
data_final['antiguedad'].fillna(median, inplace=True)
#data_final['categoria_dos'].fillna(, inplace=True) # De momento dejadlo así, ya estoy buscando la manera de hacerlo con clustering


In [14]:
# "Antiguedad" imputer
#last_values = dict([ (i, 0) for i in data_final.id.unique() ])
list_values = []
df_index = 0
null_count = 0

for index, value in zip(data_final.id, data_final.categoria_dos):
    if  pd.notna(float(value)):
        # last_values[index] = value
        list_values.append(value)
    else:
        null_count += 1
        list_values.append(list_values[len(list_values)-1])
   
    df_index += 1
    
print(len(list_values), null_count)
data_final['categoria_dos'] = np.array(list_values)
data_final['categoria_dos'] = data_final['categoria_dos'].astype('category')

2040037 4393


In [15]:
data_final['dia_atipico'] = data_final['dia_atipico'].replace(['0', '1', '-1'], ["Venta_Normal", "Venta_alta", "Venta_Baja"])

In [16]:
data_final.dia_atipico

0          Venta_Normal
1          Venta_Normal
2          Venta_Normal
3          Venta_Normal
4          Venta_Normal
5          Venta_Normal
6          Venta_Normal
7          Venta_Normal
8          Venta_Normal
9          Venta_Normal
10         Venta_Normal
11         Venta_Normal
12         Venta_Normal
13         Venta_Normal
14         Venta_Normal
15         Venta_Normal
16         Venta_Normal
17         Venta_Normal
18         Venta_Normal
19         Venta_Normal
20         Venta_Normal
21         Venta_Normal
22         Venta_Normal
23         Venta_Normal
24         Venta_Normal
25         Venta_Normal
26         Venta_Normal
27         Venta_Normal
28         Venta_Normal
29         Venta_Normal
               ...     
2040007    Venta_Normal
2040008    Venta_Normal
2040009    Venta_Normal
2040010    Venta_Normal
2040011    Venta_Normal
2040012    Venta_Normal
2040013    Venta_Normal
2040014    Venta_Normal
2040015    Venta_Normal
2040016    Venta_Normal
2040017    Venta

### One Hot Encoding

In [17]:
# concatenamos las dos columnas 
data_final['categoria'] = data_final["categoria_uno"].str.cat(data_final.categoria_dos, sep ="") 
data_final['categoria']
data_final.drop(columns=['categoria_uno', 'categoria_dos'], inplace = True)
data_final

Unnamed: 0,fecha,id,visitas,estado,precio,dia_atipico,campaña,antiguedad,unidades_vendidas,categoria
0,2015-01-06,21972,0,No Rotura,0.00,Venta_Normal,0,5241,0,C75
1,2015-01-06,327312,12,Rotura,0.00,Venta_Normal,0,580,0,A236
2,2015-01-06,327330,7,Rotura,0.00,Venta_Normal,0,580,0,A236
3,2015-01-06,327348,16,Rotura,0.00,Venta_Normal,0,580,0,A236
4,2015-01-06,327380,2,Rotura,0.00,Venta_Normal,0,580,0,L131
5,2015-01-06,327386,3,Rotura,0.00,Venta_Normal,0,580,0,L131
6,2015-01-06,327468,27,Rotura,0.00,Venta_Normal,0,580,0,A127
7,2015-01-06,327474,20,Rotura,0.00,Venta_Normal,0,580,0,A127
8,2015-01-06,327480,19,Rotura,0.00,Venta_Normal,0,580,0,A304
9,2015-01-06,327518,11,Rotura,0.00,Venta_Normal,0,831,0,F336


In [18]:
data_final['dia'] = data_final['fecha'].map(lambda x: x.day)
data_final['mes'] = data_final['fecha'].map(lambda x: x.month)
data_final['año'] = data_final['fecha'].map(lambda x: x.year)
data_final['finde'] = data_final['fecha'].map(lambda x: x.weekday())

In [39]:
from sklearn.compose import ColumnTransformer

y = data_final["unidades_vendidas"].copy()

df = data_final.loc[:, ~data_final.columns.isin(['fecha', 'id', 'unidades_vendidas'])]
num_attribs = ['dia','mes','año','finde','visitas', 'precio', 'antiguedad']
cat_attribs = ['estado','dia_atipico', 'campaña','categoria']

full_pipeline = ColumnTransformer([
    ("num", Normalizer(), num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),])

df_prepared = full_pipeline.fit_transform(df)

data_model = data_final.join(enc_df)

data_model = data_model.drop(columns=['estado','dia_atipico','categoria_uno','categoria_dos'])

data_model

In [40]:
X = df_prepared.toarray()
X

array([[1.06856432e-03, 1.78094054e-04, 3.58859519e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.86142620e-03, 4.76904366e-04, 9.60962298e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.86145711e-03, 4.76909519e-04, 9.60972680e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [4.28949671e-03, 5.71932895e-03, 9.60847263e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.12723609e-03, 5.50298146e-03, 9.24500885e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.35099339e-03, 5.80132453e-03, 9.74622520e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [41]:
pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X) 

In [42]:
sum(pca.explained_variance_ratio_)

0.8850618646375382

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size = 0.3, random_state = 25)

In [44]:
import xgboost as xgb

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 20)



In [45]:
xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=20, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=10,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [46]:
preds = xg_reg.predict(X_test)

In [48]:
performance(y_test, preds)

CF: 0.7686140141042986
MSE: 259.5162909353722
RMSE: 16.10950933254555
rRMSE: 5.36983644418185


3.8283013066960048

In [35]:
preds

array([ 2.0797001e+01,  1.3238306e+00,  1.9826710e+00, ...,
        2.1914554e+00,  1.7607181e+00, -9.4900131e-03], dtype=float32)

In [49]:
# Instantiate model with 50 decision trees
rf = RandomForestRegressor(n_estimators = 25, random_state = 42, n_jobs = -1)

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf, X_train, y_train,
                         scoring="neg_mean_squared_error", n_jobs=5,cv=5)
tree_rmse_scores = np.sqrt(-scores)
tree_rmse_scores

In [50]:
rf.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=25, n_jobs=-1, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [51]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)

In [52]:
# Performance RandomForest 
performance(y_test, predictions)

CF: 0.7939566544446841
MSE: 229.9427279028702
RMSE: 15.1638625654175
rRMSE: 5.0546208551391665


3.600047602264011

In [None]:
from sklearn.svm import SVR

svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)

svr_rbf.fit(X_train, y_train)

## Outliers

No debe haber outliers


## Preparation of Test Dataset

In [None]:
dtypes = {
    "fecha": "str",
    "id": "category",
    "visitas": "Int64",
    "categoria_uno": "category",
    "categoria_dos": "category",
    "estado": "category",
    "precio": "Float64",
    "dia_atipico": "category",
    "campaña": "category",
    "antiguedad": "object" 
}

# Read the data
# data_test = pd.read_csv('./data/Estimar_UH2021.txt', delimiter="|", dtype=dtypes, decimal=',')

### Data Preprocessing Pipeline

In [None]:
"""data_prep_pipeline = Pipeline([
         ('drop_dup', drop_dup()),
         ('time_format', time_format(document='Modelar'))
])


data_test_prepared = data_prep_pipeline.fit_transform(data_test)"""

# Select a model and train it

# "Estimar" Dataset

## Read Data

In [34]:
est_dtypes = {
    "fecha": "str",
    "id": "category",
    "visitas": "Int64",
    "categoria_uno": "category",
    "categoria_dos": "category",
    "estado": "category",
    "precio": "Float64",
    "dia_atipico": "category",
    "campaña": "category",
    "antiguedad": "str"
}

# Read the data
data_estimar = pd.read_csv('./data/Estimar_UH2021.txt', delimiter="|", dtype=dtypes, decimal=',', na_values='-')

In [35]:
data_estimar.head()

Unnamed: 0,fecha,id,visitas,categoria_uno,categoria_dos,estado,precio,dia_atipico,campaña,antiguedad
0,2016-10-01,21972,5,C,75,No Rotura,5.84,0,0,5150
1,2016-10-02,21972,15,C,75,No Rotura,5.84,0,0,5150
2,2016-10-03,21972,5,C,75,No Rotura,5.84,0,0,5150
3,2016-10-04,21972,5,C,75,No Rotura,5.84,0,0,5150
4,2016-10-05,21972,0,C,75,No Rotura,5.84,0,0,5150


In [36]:
data_estimar.shape

(218263, 10)

In [40]:
data_estimar.isnull().sum()

fecha              0
id                 0
visitas            0
categoria_uno      0
categoria_dos    438
estado             0
precio             0
dia_atipico        0
campaña            0
antiguedad         0
dtype: int64

In [39]:
# Get the median value
median = data_estimar['antiguedad'].median()
data_estimar['antiguedad'].fillna(median, inplace=True)

In [41]:
data_estimar.isnull().sum()

fecha              0
id                 0
visitas            0
categoria_uno      0
categoria_dos    438
estado             0
precio             0
dia_atipico        0
campaña            0
antiguedad         0
dtype: int64

In [42]:
# "Antiguedad" imputer
#last_values = dict([ (i, 0) for i in data_final.id.unique() ])
list_values = []
df_index = 0
null_count = 0

for index, value in zip(data_estimar.id, data_estimar.categoria_dos):
    if  pd.notna(float(value)):
        # last_values[index] = value
        list_values.append(value)
    else:
        null_count += 1
        list_values.append(list_values[len(list_values)-1])
   
    df_index += 1
    
print(len(list_values), null_count)
data_estimar['categoria_dos'] = np.array(list_values)
data_estimar['categoria_dos'] = data_estimar['categoria_dos'].astype('category')

218263 438


In [43]:
# concatenamos las dos columnas 
data_estimar['categoria'] = data_estimar["categoria_uno"].str.cat(data_estimar.categoria_dos, sep ="") 
data_estimar['categoria']
data_estimar.drop(columns=['categoria_uno', 'categoria_dos'], inplace = True)
data_estimar

Unnamed: 0,fecha,id,visitas,estado,precio,dia_atipico,campaña,antiguedad,categoria
0,2016-10-01,21972,5,No Rotura,5.84,0,0,5150,C75
1,2016-10-02,21972,15,No Rotura,5.84,0,0,5150,C75
2,2016-10-03,21972,5,No Rotura,5.84,0,0,5150,C75
3,2016-10-04,21972,5,No Rotura,5.84,0,0,5150,C75
4,2016-10-05,21972,0,No Rotura,5.84,0,0,5150,C75
...,...,...,...,...,...,...,...,...,...
218258,2016-12-26,458660,1085,No Rotura,70.85,1,0,35,K340
218259,2016-12-27,458660,965,No Rotura,70.85,1,0,35,K340
218260,2016-12-28,458660,1005,No Rotura,70.85,1,0,35,K340
218261,2016-12-29,458660,745,No Rotura,70.85,1,0,35,K340


In [44]:
data_estimar['dia_atipico'] = data_estimar['dia_atipico'].replace(['0', '1', '-1'], ["Venta_Normal", "Venta_alta", "Venta_Baja"])


In [45]:
df_estimar = data_estimar.loc[:, ~data_estimar.columns.isin(['fecha', 'id'])]
num_attribs = ['visitas', 'precio', 'antiguedad']
cat_attribs = ['estado', 'campaña', 'dia_atipico','categoria']

In [46]:
data_estimar.isnull().sum()

fecha          0
id             0
visitas        0
estado         0
precio         0
dia_atipico    0
campaña        0
antiguedad     0
categoria      0
dtype: int64

In [47]:
full_pipeline = ColumnTransformer([
    ("num", Normalizer(), num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),])

df_estimated = full_pipeline.fit_transform(df_estimar)

In [48]:
X_estimar = df_estimated.toarray()
X_estimar

array([[9.70872705e-04, 1.13397932e-03, 9.99998886e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.91260713e-03, 1.13397504e-03, 9.99995115e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.70872705e-04, 1.13397932e-03, 9.99998886e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [9.96922891e-01, 7.02805839e-02, 3.47187077e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.94421408e-01, 9.45701433e-02, 4.67177843e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.97925764e-01, 5.77167677e-02, 2.85121647e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [61]:
pca_estimar = PCA(n_components=20, random_state=42)
X_estimar_pca = pca_estimar.fit_transform(X_estimar)

In [157]:
X_estimar_pca.

(218263, 10)

In [60]:
predictions = rf.predict(X_estimar_pca)

ValueError: Number of features of the model must match the input. Model n_features is 5 and input n_features is 20 

In [53]:
sum(pca_estimar.explained_variance_ratio_)

0.7217377761906145

In [54]:
def transform(data):
        """
        Take the dataframe and trate the 'fecha' attribute depending of the document.
        If it is the 'Modelar' document, the variable will be equal to the content
        before the first space.
        If it it the 'Estimar' document, the variable will be transformed to the
        original format

        Original Format: 'DD/MM/AAAA' in (text format)


        """

        temp = ''

        
        data['fecha'] = pd.to_datetime(data['fecha'],infer_datetime_format=True).apply(lambda x : x.strftime('%d/%m/%Y'))
       


        return data

In [55]:
data_estimar.head()

Unnamed: 0,fecha,id,visitas,estado,precio,dia_atipico,campaña,antiguedad,categoria
0,2016-10-01,21972,5,No Rotura,5.84,Venta_Normal,0,5150,C75
1,2016-10-02,21972,15,No Rotura,5.84,Venta_Normal,0,5150,C75
2,2016-10-03,21972,5,No Rotura,5.84,Venta_Normal,0,5150,C75
3,2016-10-04,21972,5,No Rotura,5.84,Venta_Normal,0,5150,C75
4,2016-10-05,21972,0,No Rotura,5.84,Venta_Normal,0,5150,C75


In [56]:
df_results = transform(data_estimar)

In [57]:
df_results

Unnamed: 0,fecha,id,visitas,estado,precio,dia_atipico,campaña,antiguedad,categoria
0,01/10/2016,21972,5,No Rotura,5.84,Venta_Normal,0,5150,C75
1,02/10/2016,21972,15,No Rotura,5.84,Venta_Normal,0,5150,C75
2,03/10/2016,21972,5,No Rotura,5.84,Venta_Normal,0,5150,C75
3,04/10/2016,21972,5,No Rotura,5.84,Venta_Normal,0,5150,C75
4,05/10/2016,21972,0,No Rotura,5.84,Venta_Normal,0,5150,C75
...,...,...,...,...,...,...,...,...,...
218258,26/12/2016,458660,1085,No Rotura,70.85,Venta_alta,0,35,K340
218259,27/12/2016,458660,965,No Rotura,70.85,Venta_alta,0,35,K340
218260,28/12/2016,458660,1005,No Rotura,70.85,Venta_alta,0,35,K340
218261,29/12/2016,458660,745,No Rotura,70.85,Venta_alta,0,35,K340


In [59]:
df_results["UNIDADES"] = predictions
df_results["FECHA"] = df_results['fecha']
df_results["ID"] = df_results['id']

ValueError: Length of values does not match length of index

In [185]:
df_results = df_results[["FECHA", "ID", "UNIDADES"]]

In [186]:
df_results.head()

Unnamed: 0,FECHA,ID,UNIDADES
0,10/01/2016,21972,475.1
1,10/02/2016,21972,475.1
2,10/03/2016,21972,475.1
3,10/04/2016,21972,475.1
4,10/05/2016,21972,475.1


In [187]:
# Save the results
df_results.to_csv(r'Atmira_Universitat Oberta de Catalunya_UOC Data Miners_numeral.txt', index=None, sep='|', mode='a')

In [188]:
df_results.shape

(218263, 3)