In [1]:
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
from tensorflow.keras import layers
from tensorflow.keras import backend as K

*Cargar Dataset y preparar el dataset*

In [2]:
dataset = pd.read_excel('bike_train.xlsx')

In [3]:
dataset.head(3)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32


In [4]:
dataset['day']=pd.DatetimeIndex(dataset['dteday']).day
dataset['month']=pd.DatetimeIndex(dataset['dteday']).month
dataset['year']=pd.DatetimeIndex(dataset['dteday']).year

In [5]:
#sacamos aquellas columnas que no son útiles para calcular el cnt
dataset.drop(columns = ['instant','dteday','mnth','casual','registered'], inplace=True)

In [6]:
#dividimos el dataset en entrenamiento y test, para el test simplesmente sacamos todo aquello que sea entrenamiento
train_dataset = dataset.sample(frac=0.8,random_state=42)
test_dataset = dataset.drop(train_dataset.index)

In [7]:
# Normalización
# creamos variables con los valores del describe() del dataset, que nos permitirá realizar una normalizacion en base a la media y desviación estandar
# así los datos serán relativos y no absolutos
train_stats = train_dataset.describe()
train_stats.pop("cnt")
train_stats = train_stats.transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
season,9599.0,2.206063,1.087274,1.0,1.0,2.0,3.0,4.0
yr,9599.0,0.277737,0.447906,0.0,0.0,0.0,1.0,1.0
hr,9599.0,11.606834,6.901898,0.0,6.0,12.0,18.0,23.0
holiday,9599.0,0.028961,0.167707,0.0,0.0,0.0,0.0,1.0
weekday,9599.0,3.018023,2.000778,0.0,1.0,3.0,5.0,6.0
workingday,9599.0,0.686217,0.464053,0.0,0.0,1.0,1.0,1.0
weathersit,9599.0,1.437337,0.650233,1.0,1.0,1.0,2.0,4.0
temp,9599.0,0.466313,0.188624,0.02,0.32,0.46,0.62,0.96
atemp,9599.0,0.449532,0.169553,0.0,0.303,0.4545,0.5909,1.0
hum,9599.0,0.625166,0.200811,0.0,0.47,0.62,0.79,1.0


In [8]:
#separamos la variable de predicción
train_labels = train_dataset.pop('cnt')
test_labels = test_dataset.pop('cnt')

In [9]:
# normalizacion por medio de la standarizacion, para no caer en datos absolutos distantes, y que sean más relativos
def norm(x):
    return (x - train_stats['mean'])/train_stats['std']

#creamos nuevos objetos para manipular los datos normalizados
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

In [10]:
normed_train_data.head(3)

Unnamed: 0,season,yr,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,day,month,year
1935,-0.189523,-0.620079,-0.23281,-0.172691,1.490409,-1.478746,-0.672586,-1.093778,-1.131988,-1.071486,-0.253872,1.212388,-0.734666,-0.620079
6494,1.649941,-0.620079,-1.102137,-0.172691,-1.008619,0.676178,-0.672586,-0.669655,-0.595873,1.219225,-0.728461,-1.420708,1.304511,-0.620079
1720,-1.109255,-0.620079,-0.522586,-0.172691,0.490798,0.676178,-0.672586,-0.563624,-0.595873,0.173466,-0.491564,0.182046,-0.734666,-0.620079


*MODELO*

In [11]:
# construimos una función para generar el rmse
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [16]:
#construimos el modelo de la red neuronal con 2 capas de 32 neuronas, con el metodo Dense, que conectará todos las neuronas a las subsiguientes.
#se usa una capa adicional para establacer un constraint que evitará el peso negativo en las neuronas
#y compilamos los resultados con las métricas deseadas para observar los niveles de rmse y accuracy para detectar overfitting
#intentamos remover algunas neuronas con dropout para ver si dan mejores resultados, pero no ocurre
def build_model():
    model = keras.Sequential([
        layers.Dense(32 , activation='relu',input_shape=[len(train_dataset.keys())]),
        layers.Dense(32, activation='relu'),
        layers.Dense(1,kernel_constraint=tf.keras.constraints.NonNeg()),
        tf.keras.layers.Dropout(0)
    ])

    optimizer = tf.keras.optimizers.RMSprop(0.001)

    model.compile(loss=root_mean_squared_error,optimizer=optimizer,metrics=[root_mean_squared_error,'accuracy','mae','mse'])

    return model

In [17]:
# instanciamos el modelo
model = build_model()

In [18]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 32)                480       
                                                                 
 dense_4 (Dense)             (None, 32)                1056      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
 dropout_1 (Dropout)         (None, 1)                 0         
                                                                 
Total params: 1,569
Trainable params: 1,569
Non-trainable params: 0
_________________________________________________________________


*Entrenamiento*

In [19]:
# convertimos el tipo de dato de la columna cnt de INT a FLOAT para que pueda ser aceptado por el model.fit() del proximo paso
train_labels = pd.to_numeric(train_labels, errors='coerce',downcast='float')
train_labels

1935     125.0
6494       8.0
1720     299.0
9120      72.0
360      104.0
         ...  
5439     232.0
11085      4.0
8964      81.0
11660    121.0
5510     207.0
Name: cnt, Length: 9599, dtype: float32

In [20]:
#seleccionamos y variamos la cantidad de epoch para buscar mejores resultados

#podemos ver las diferencias entre accuracy y val_accuracy, que por no tener una diferencia significativa, nos demuestra que no estamos cayendo en overfitting
EPOCHS = 1000

history = model.fit(
    normed_train_data, train_labels,
    epochs=EPOCHS,validation_split = 0.2, verbose = 0,
    callbacks=[tfdocs.modeling.EpochDots()]
)


Epoch: 0, accuracy:0.0132,  loss:196.2192,  mae:139.4221,  mse:39805.9219,  root_mean_squared_error:196.2113,  val_accuracy:0.0068,  val_loss:168.5376,  val_mae:115.3875,  val_mse:29555.3184,  val_root_mean_squared_error:168.5376,  
....................................................................................................
Epoch: 100, accuracy:0.0132,  loss:111.1956,  mae:82.0166,  mse:12776.0645,  root_mean_squared_error:111.1920,  val_accuracy:0.0068,  val_loss:118.9081,  val_mae:86.5584,  val_mse:14684.0918,  val_root_mean_squared_error:118.9081,  
....................................................................................................
Epoch: 200, accuracy:0.0132,  loss:92.9470,  mae:65.8724,  mse:8931.7100,  root_mean_squared_error:92.9483,  val_accuracy:0.0068,  val_loss:99.4991,  val_mae:68.8787,  val_mse:10433.2021,  val_root_mean_squared_error:99.4991,  
....................................................................................................
Ep

In [21]:
#creamos una tabla con los resultados de cada ciclo epoch

import math
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.sort_values(by='root_mean_squared_error')
#hist['rmse'] = math.sqrt(hist['mse'])

Unnamed: 0,loss,root_mean_squared_error,accuracy,mae,mse,val_loss,val_root_mean_squared_error,val_accuracy,val_mae,val_mse,epoch
987,31.323847,31.323483,0.011330,21.093271,1055.192383,37.898502,37.898502,0.004688,25.380842,1535.607056,987
989,31.341402,31.341776,0.010939,21.088840,1063.417358,37.608631,37.608631,0.005729,25.205936,1512.067627,989
978,31.357409,31.361240,0.011069,21.092829,1063.548340,37.822712,37.822712,0.004688,25.488209,1527.217651,978
999,31.366541,31.368221,0.010809,21.115416,1064.456055,37.663837,37.663837,0.006250,25.368200,1519.287231,999
995,31.377419,31.376509,0.010678,21.091789,1060.539917,37.575645,37.575645,0.005729,25.163109,1514.189941,995
...,...,...,...,...,...,...,...,...,...,...,...
4,115.772987,115.770988,0.013153,86.022346,13811.627930,121.849579,121.849579,0.006771,90.640816,15370.568359,4
3,116.091400,116.095016,0.013153,86.398117,13945.483398,122.490280,122.490280,0.006771,90.273849,15558.394531,3
2,117.798477,117.800102,0.013153,88.049385,14263.772461,123.098175,123.098175,0.006771,90.565903,15731.562500,2
1,128.142410,128.140472,0.013153,92.547508,17186.955078,124.347099,124.347099,0.006771,92.816376,16026.858398,1


In [23]:
test_labels = pd.to_numeric(test_labels, errors='coerce',downcast='float')

In [24]:
#evaluamos las metricas con los datos de entrenamiento
loss, root_mean_squared_error, accuracy, mae, mse = model.evaluate(normed_train_data,train_labels, verbose=2)

300/300 - 0s - loss: 32.1201 - root_mean_squared_error: 32.1207 - accuracy: 0.0100 - mae: 21.6639 - mse: 1123.2024 - 288ms/epoch - 960us/step


In [25]:
#realizamos las predicciones en de los modelos para los datasets de entrenamiento y de test
test_predictions = model.predict(normed_test_data).flatten()
train_predictions = model.predict(normed_train_data).flatten()



In [26]:
#verificamos
train_predictions[0:10]

array([121.14637 ,   2.965047, 285.3516  ,  92.37122 , 109.3204  ,
       146.79878 , 377.42047 ,  75.66825 ,  92.16716 ,  55.556202],
      dtype=float32)

In [27]:
test_predictions[0:10]

array([ 10.621135 ,   6.6151733,   6.657875 ,  62.67168  , 114.88935  ,
       125.179756 , 126.323845 ,  66.681725 ,  57.491245 , 119.38607  ],
      dtype=float32)

Poninendo los resultados en el csv de test

In [24]:
#A partir de aquí realizaremos los pasos iniciales para cargar y normalizar la data (del dataset bike_test.xlsx) para finalmente subir las predicciones del cnt en el dataset bike_test.xlsx a un nuevo csv
dataset_test = pd.read_excel('bike_test.xlsx')

In [25]:
dataset_test['day'] = pd.DatetimeIndex(dataset_test['dteday']).day
dataset_test['month'] = pd.DatetimeIndex(dataset_test['dteday']).month
dataset_test['year'] = pd.DatetimeIndex(dataset_test['dteday']).year
original_test_dataset_instant  = dataset_test.pop('instant')
dataset_test.drop(columns = ['dteday', 'mnth'],inplace=True)

dataset_test.head(3)

Unnamed: 0,season,yr,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,day,month,year
5375,1,1,19,0,1,1,2,0.26,0.2576,0.6,0.1642,31,12,2012
5376,1,1,20,0,1,1,2,0.26,0.2576,0.6,0.1642,31,12,2012
5377,1,1,21,0,1,1,1,0.26,0.2576,0.6,0.1642,31,12,2012
5378,1,1,22,0,1,1,1,0.26,0.2727,0.56,0.1343,31,12,2012
5379,1,1,23,0,1,1,1,0.26,0.2727,0.65,0.1343,31,12,2012


In [26]:
dataset_test_normed = norm(dataset_test)
dataset_test_normed.head(3)

Unnamed: 0,season,yr,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,day,month,year
0,-0.189523,1.61253,-1.102137,-0.172691,-1.508424,-1.478746,-0.672586,0.284623,0.297653,0.273062,-0.846909,0.525493,-0.152044,1.61253
1,-0.189523,1.61253,-0.957249,-0.172691,-1.508424,-1.478746,-0.672586,0.178592,0.208005,0.472254,-0.728461,0.525493,-0.152044,1.61253
2,-0.189523,1.61253,-0.812361,-0.172691,-1.508424,-1.478746,-0.672586,0.178592,0.208005,0.024072,-0.491564,0.525493,-0.152044,1.61253


In [27]:
dataset_test_predictions = model.predict(dataset_test_normed).flatten()



In [28]:
#realizamos las predicciones para el dataset bike_test.xlsx redondeando los decimales a 0, ya que los registros deberían ser numeros enteros
dataset_test_predictions = dataset_test_predictions.round(decimals=0)

In [29]:
# No tiene sentido registros negativos, pero se mantendrán para no desbalancear la cantidad de registros
def noNegatives(x):
  if x<0:
   return 0
  else:
   return x

In [30]:
# sumamos las predicciones a la tabla del dataset de test para tenerlo completo
dataset_test_complete = dataset_test

dataset_test_complete['pred'] = dataset_test_predictions.reshape(-1,1)

#dataset_test_complete['pred'].apply(lambda x : noNegatives(x))

dataset_test_complete

Unnamed: 0,season,yr,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,day,month,year,pred
0,2,1,4,0,0,0,1,0.52,0.5000,0.68,0.0896,20,5,2012,45.0
1,2,1,5,0,0,0,1,0.50,0.4848,0.72,0.1045,20,5,2012,43.0
2,2,1,6,0,0,0,1,0.50,0.4848,0.63,0.1343,20,5,2012,66.0
3,2,1,7,0,0,0,1,0.52,0.5000,0.68,0.1940,20,5,2012,113.0
4,2,1,8,0,0,0,1,0.56,0.5303,0.56,0.1642,20,5,2012,181.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5375,1,1,19,0,1,1,2,0.26,0.2576,0.60,0.1642,31,12,2012,431.0
5376,1,1,20,0,1,1,2,0.26,0.2576,0.60,0.1642,31,12,2012,345.0
5377,1,1,21,0,1,1,1,0.26,0.2576,0.60,0.1642,31,12,2012,338.0
5378,1,1,22,0,1,1,1,0.26,0.2727,0.56,0.1343,31,12,2012,256.0


Cargar un csv con las predicciones del cnt de bike_test

In [32]:
# ya que queremos un csv con SOLO las predicciones de cnt, crearemos un nuevo csv
cnt_pred = dataset_test_complete['pred']
cnt_pred.to_csv('Wolfgang9333.csv',index=False)