
## Caso práctico consumos


In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing, model_selection, metrics

ModuleNotFoundError: No module named 'sklearn'

# Leer Datos

In [None]:
# Load the energy dataset from a CSV file (relative path) into a DataFrame
data = pd.read_csv("../input/KAG_energydata_complete.csv")

In [None]:
data.head()

In [None]:
data.info()

# Resumen Datos

In [None]:
data.describe()

In [None]:
# Report the dataset dimensions: number of rows, then number of columns
n_rows, n_cols = data.shape
print('El Dataset tiene', n_rows, 'filas')
print('El Dataset tiene', n_cols, 'columnas')

In [None]:
# Valores null
data.isnull().sum().sort_values(ascending = True)

### Limpieza de datos!

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing and keep 75% for training;
# random_state is fixed so the split is the same on every run.
train, test = train_test_split(data,test_size=0.25,random_state=40)

In [None]:
train.describe()

###  De momento podemos ignorar la columna de la fecha y hora

In [None]:
# Column groups that remain once the date/time column is set aside

# Columns grouped as temperatures (T1..T9)
col_temp = ["T1","T2","T3","T4","T5","T6","T7","T8","T9"]

# Columns grouped as humidities (RH_1..RH_9)
col_hum = ["RH_1","RH_2","RH_3","RH_4","RH_5","RH_6","RH_7","RH_8","RH_9"]

# Columns grouped as weather readings
col_weather = ["T_out", "Tdewpoint","RH_out","Press_mm_hg",
                "Windspeed","Visibility"] 
# Lights consumption column
col_light = ["lights"]

# Columns grouped as "randoms" (presumably noise variables -- confirm against the dataset description)
col_randoms = ["rv1", "rv2"]

# Column to be predicted
col_target = ["Appliances"]

In [None]:
# Select the feature columns and the target column from the training set
feature_vars = train[col_temp + col_hum + col_weather + col_light + col_randoms ]
target_vars = train[col_target]

In [None]:
feature_vars.describe()

In [None]:
# Vamos a mirar alguna distribución de los datos!
feature_vars.lights.value_counts()

In [None]:
target_vars.describe()

###  Primeras preguntas

1. Mirar rangos temperaturas

2. Mirar rangos humedades

3. Distribución del consumo de aparatos eléctricos

4. ¿Nos puede dar información útil la columna del consumo de luces?

In [None]:
# Remove the 'lights' column from the feature set.
# The result is reassigned instead of dropping in place: `feature_vars` was
# selected from `train`, so an in-place drop can raise SettingWithCopyWarning.
# errors='ignore' keeps the cell safe to re-run after the column is gone.
feature_vars = feature_vars.drop(columns=['lights'], errors='ignore')

In [None]:
feature_vars.head(2)

# Visualización 

In [None]:
# plotly
# `plotly.plotly` only exists in plotly < 4 (it later moved to the separate
# `chart_studio` package). Nothing below uses `py`, so a missing module must
# not prevent the offline plots from rendering.
try:
    import plotly.plotly as py
except ImportError:
    py = None
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Time series of appliance consumption over the whole dataset
visData = go.Scatter(x=data.date, y=data.Appliances, mode="lines")
layout = go.Layout(
    title='Appliance energy consumption measurement',
    xaxis=dict(title='Date'),
    yaxis=dict(title='(Wh)'),
)
fig = go.Figure(data=[visData], layout=layout)

iplot(fig)

In [None]:
# Add a flag column: 0.0 for Monday-Friday readings, 1.0 for Saturday/Sunday.
# Note that despite its name, WEEKDAY == 1 marks the weekend.
day_index = pd.to_datetime(data['date']).dt.dayofweek
data['WEEKDAY'] = (day_index >= 5).astype(float)
# Number of readings per flag value (the original author noted 5472 weekend readings)
data['WEEKDAY'].value_counts()

In [None]:
# Rows whose WEEKDAY flag is 0 (readings taken Monday to Friday)
temp_weekday =  data[data['WEEKDAY'] == 0]
# Plot appliance consumption against the date for those rows
visData = go.Scatter( x= temp_weekday.date  ,  mode = "lines", y = temp_weekday.Appliances )
layout = go.Layout(title = 'Appliance energy consumption measurement on weekdays' , xaxis=dict(title='Date'), yaxis=dict(title='(Wh)'))
fig = go.Figure(data=[visData],layout=layout)

iplot(fig)

In [None]:
# Rows whose WEEKDAY flag is 1 (readings taken on Saturday or Sunday)
temp_weekend =  data[data['WEEKDAY'] == 1]
# Plot appliance consumption against the date for those rows
visData = go.Scatter( x= temp_weekend.date  ,  mode = "lines", y = temp_weekend.Appliances )
layout = go.Layout(title = 'Appliance energy consumption measurement on weekend' , xaxis=dict(title='Date'), yaxis=dict(title='(Wh)'))
fig = go.Figure(data=[visData],layout=layout)

iplot(fig)

In [None]:
# Histograma de "features"
feature_vars.hist(bins = 20 , figsize= (12,16)) ;

In [None]:
# RH_6, RH_out, Visibility and Windspeed show irregular distributions;
# draw each of them in its own panel of a 2x2 grid (row by row).
f, ax = plt.subplots(2, 2, figsize=(12, 8))
irregular_cols = ["RH_6", "RH_out", "Visibility", "Windspeed"]
for panel, column in zip(ax.ravel(), irregular_cols):
    sns.distplot(feature_vars[column], bins=10, ax=panel)

In [None]:
# Distribution of the "Appliances" target in the training set
f = plt.figure(figsize=(12,5))
plt.xlabel('Appliance consumption in Wh')
plt.ylabel('Frequency')
sns.distplot(target_vars , bins=10 ) ;

### Observaciones

1. Temperature - ¿Qué distribuciones vemos?
2. Humidity  - ¿Qué distribuciones vemos?
3. Appliance - ¿Es simétrica? ¿Hay outliers?
4. Visibility - ¿Es simétrica? 
5. Windspeed - ¿Es simétrica? 


In [None]:
# Percentage of training readings whose appliance consumption is at most 200 Wh
low_share = (target_vars <= 200).sum() / len(target_vars) * 100
print('Percentage of the appliance consumption is less than 200 Wh')
print(low_share)

### Correlation Plots

In [None]:
# Pairwise correlations between the training features and the target
train_corr = train[col_temp + col_hum + col_weather +col_target+col_randoms]
corr = train_corr.corr()

# Hide the diagonal and the upper triangle, which only repeat the lower one.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(16, 14))
# Draw the heat map on the axes created above
sns.heatmap(corr, annot=True, fmt=".2f", mask=mask, ax=ax)

# Heatmap cells are centred on half-integer coordinates, so the ticks are
# shifted by 0.5 to line each label up with its row/column.
tick_positions = np.arange(len(corr.columns)) + 0.5
ax.set_xticks(tick_positions)
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(corr.columns, rotation=0)

plt.show()

In [None]:
# Helper used to keep only one half of the correlation matrix
def get_redundant_pairs(df):
    '''Return the label pairs on and below the diagonal of df's correlation matrix.

    Each pair is (columns[i], columns[j]) with j <= i, i.e. the diagonal plus
    one of the two symmetric triangles. The result is a set of tuples.
    '''
    labels = df.columns
    return {
        (labels[row], labels[col])
        for row in range(df.shape[1])
        for col in range(row + 1)
    }

# Rank column pairs by the absolute value of their correlation

def get_top_abs_correlations(df, n=5):
    '''Return the n column pairs of df with the largest absolute correlation.

    The absolute correlation matrix is flattened into a Series indexed by
    (column, column) pairs; the diagonal and mirrored pairs reported by
    get_redundant_pairs are removed before sorting in descending order.
    '''
    abs_corr = df.corr().abs().unstack()
    ranked = abs_corr.drop(labels=get_redundant_pairs(df)).sort_values(ascending=False)
    return ranked[0:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(train_corr, 40))

### Observaciones basadas en las correlaciones

1. Temperature

2. Weather attributes - (Visibility, Tdewpoint, Press_mm_hg)  

3. Humidity 

4. Variables aleatorias 


# Data Pre Processing

In [None]:
# Select the feature and target columns of the training set, using the
# column names currently held by feature_vars and target_vars
train_X = train[feature_vars.columns]
train_y = train[target_vars.columns]

In [None]:
# Select the same feature and target columns from the test set
test_X = test[feature_vars.columns]
test_y = test[target_vars.columns]

In [None]:
# Remove the columns judged uninformative in the analysis above (training set).
# The result is reassigned instead of dropping in place: train_X was selected
# from `train`, so an in-place drop can raise SettingWithCopyWarning.
# errors="ignore" keeps the cell safe to re-run after the columns are gone.
train_X = train_X.drop(columns=["rv1","rv2","Visibility","T6","T9"], errors="ignore")

In [None]:
# Remove the same uninformative columns from the test set.
# The result is reassigned instead of dropping in place: test_X was selected
# from `test`, so an in-place drop can raise SettingWithCopyWarning.
# errors="ignore" keeps the cell safe to re-run after the columns are gone.
test_X = test_X.drop(columns=["rv1","rv2","Visibility","T6","T9"], errors="ignore")

In [None]:
train_X.columns

In [None]:
test_X.columns

In [None]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()

# Keep only the selected feature columns plus the Appliances target
train = train[list(train_X.columns.values) + col_target ]

test = test[list(test_X.columns.values) + col_target ]

# Standardize both sets with statistics learned from the TRAINING set only.
# The scaler is fitted once on `train` and then applied to `test`; fitting it
# again on `test` would scale the test data with its own mean/std, so the two
# sets would not share a scale and test information would leak into the
# evaluation.
# The target column is standardized together with the features, so every
# score computed later is expressed in standardized units.
sc_train = pd.DataFrame(sc.fit_transform(train), columns=train.columns, index=train.index)

sc_test = pd.DataFrame(sc.transform(test), columns=test.columns, index=test.index)


In [None]:
sc_train.head()

In [None]:
sc_test.head()

In [None]:
# Separate the standardized Appliances target from the standardized features
train_X, train_y = sc_train.drop(columns=['Appliances']), sc_train['Appliances']

test_X, test_y = sc_test.drop(columns=['Appliances']), sc_test['Appliances']

In [None]:
train_X.head()

In [None]:
train_y.head()

# Implementacion del modelo

Probaremos los siguientes modelos:

**Linear regression models**

1.Ridge regression 

2.Lasso regression 

**Support Vector Machine**

3.Support vector regression 

**Nearest neighbour Regressor**

4.KNeighborsRegressor

**Ensemble models**

5.Random Forest Regressor

6.Gradient Boosting Regressor

7.ExtraTrees Regressor

**Neural Network**

8.Multi Layer Perceptron Regressor



In [None]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
from sklearn import neighbors
from sklearn.svm import SVR


In [None]:
# Candidate models to compare, as [display label, estimator] pairs.
# Two labels were corrected so that they name the estimator actually used:
# GradientBoostingRegressor (previously labelled "...Classifier") and
# ExtraTreesRegressor (previously labelled "ExtraTreeRegressor", which is a
# different scikit-learn class).
models = [
           ['Lasso: ', Lasso()],
           ['Ridge: ', Ridge()],
           ['KNeighborsRegressor: ',  neighbors.KNeighborsRegressor()],
           ['SVR:' , SVR(kernel='rbf')],
           ['RandomForest ',RandomForestRegressor()],
           ['ExtraTreesRegressor :',ExtraTreesRegressor()],
           ['GradientBoostingRegressor: ', GradientBoostingRegressor()] ,
           ['XGBRegressor: ', xgb.XGBRegressor()] ,
           ['MLPRegressor: ', MLPRegressor(  activation='relu', solver='adam',learning_rate='adaptive',max_iter=1000,learning_rate_init=0.01,alpha=0.01)]
         ]


In [None]:
# Fit every candidate model and collect its training time and scores
import time
from math import sqrt
from sklearn.metrics import mean_squared_error

model_data = []
for name, curr_model in models:
    # Seed only the estimators that expose a `random_state` parameter;
    # going through set_params avoids attaching a meaningless attribute to
    # estimators that do not have one.
    if 'random_state' in curr_model.get_params():
        curr_model.set_params(random_state=78)

    start = time.time()
    curr_model.fit(train_X, train_y)
    train_time = time.time() - start

    # Predict on the test set once and reuse the result for both test metrics
    test_pred = curr_model.predict(test_X)

    model_data.append({
        "Name": name,
        "Train_Time": train_time,
        "Train_R2_Score": metrics.r2_score(train_y, curr_model.predict(train_X)),
        "Test_R2_Score": metrics.r2_score(test_y, test_pred),
        "Test_RMSE_Score": sqrt(mean_squared_error(test_y, test_pred)),
    })

In [None]:
model_data

In [None]:
# Turn the list of per-model result dicts into a DataFrame (one row per model)
df = pd.DataFrame(model_data)

In [None]:
df

In [None]:
# Bar chart of test R2, train R2 and test RMSE per model (the title only mentions R2)
df.plot(x="Name", y=['Test_R2_Score' , 'Train_R2_Score' , 'Test_RMSE_Score'], kind="bar" , title = 'R2 Score Results' , figsize= (10,8)) ;

### Observaciones
1. Mejor modelo: Extra Trees Regressor, con un R2 score de 0.57
2. Menor RMSE score: también Extra Trees Regressor, con 0.65
3. ¡Lasso regularization es la peor!


# Configurar parámetros 

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = [{
              'max_depth': [80, 150, 200,250],
              'n_estimators' : [100,150,200,250],
              # "auto" was removed from scikit-learn in 1.3. For regressors it
              # meant "use all features", which 1.0 (a fraction of the features)
              # expresses on both old and new versions.
              'max_features': [1.0, "sqrt", "log2"]
            }]
reg = ExtraTreesRegressor(random_state=40)
# Exhaustive search over the grid with 5-fold cross-validation, scored by R2,
# using all available cores
grid_search = GridSearchCV(estimator = reg, param_grid = param_grid, cv = 5, n_jobs = -1 , scoring='r2' , verbose=2)
grid_search.fit(train_X, train_y)

In [None]:
# Tuned parameter set
grid_search.best_params_

In [None]:
# Mejores parámetros para ExtraTreesRegressor
grid_search.best_estimator_

In [None]:
# R2 score en training
grid_search.best_estimator_.score(train_X,train_y)

In [None]:
# R2 score en test
grid_search.best_estimator_.score(test_X,test_y)

In [None]:
# RMSE score en test
np.sqrt(mean_squared_error(test_y, grid_search.best_estimator_.predict(test_X)))

### Observaciones


1. Mejor combinación - 'max_depth': 80, 'max_features': 'sqrt', 'n_estimators': 200

    
2. Training set  R2 score de 1.0 (overfitting?)


3. Test set R2 score de 0.63 mejora el de 0.57 


4. Test set RMSE score de 0.60 mejora el de 0.65 




### Features 

In [None]:
# Positions of the tuned model's features sorted by ascending importance
# NOTE(review): feature_indices is not used by any later cell visible here
feature_indices = np.argsort(grid_search.best_estimator_.feature_importances_)

In [None]:
# Rank the features of the tuned model from most to least important
importances = grid_search.best_estimator_.feature_importances_
indices = importances.argsort()[::-1]
names = list(train_X.columns[indices])

# Bar chart of the importances, with one bar per feature in ranked order
bar_positions = range(len(names))
plt.figure(figsize=(10,6))
plt.title("Feature Importance")
plt.bar(bar_positions, importances[indices])
# Feature names as x-axis labels, rotated so they do not overlap
plt.xticks(bar_positions, names, rotation=90)
plt.show()

In [None]:
# top 5
names[0:5]

In [None]:
# ultimas 5
names[-5:]

In [None]:
# Restrict both sets to the five most important features
top_features = names[:5]
train_important_feature = train_X[top_features]
test_important_feature = test_X[top_features]

In [None]:
# Train again, this time on the reduced feature set

from sklearn.base import clone
# clone() builds an unfitted estimator with the same hyperparameters as the tuned one
cloned_model = clone(grid_search.best_estimator_)
cloned_model.fit(train_important_feature , train_y)

In [None]:
# Scores of the model trained on the reduced feature set

train_pred = cloned_model.predict(train_important_feature)
test_pred = cloned_model.predict(test_important_feature)
print('Training set R2 Score - ', metrics.r2_score(train_y, train_pred))
print('Testing set R2 Score - ', metrics.r2_score(test_y, test_pred))
print('Testing set RMSE Score - ', np.sqrt(mean_squared_error(test_y, test_pred)))


### Observaciones

1. Features

    a. top 5 - 'RH_out', 'RH_8', 'RH_1', 'T3', 'RH_3'
    
    b. peores 5 - 'T7','Tdewpoint','Windspeed','T1','T5'
    

2. R2 baja mucho, así que es mejor no usar el dataset reducido