In [1]:
%matplotlib inline
import pandas as pd
import numpy as np

# Cosas de sklearn
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import median_absolute_error, mean_squared_error
from sklearn.decomposition import PCA
from pygam.utils import generate_X_grid

import matplotlib.pyplot as plt
import seaborn as sn

# Global plotting defaults for the notebook.
# plt.rcParams["figure.figsize"] = (10, 6)  # figure size (currently disabled)
plt.rcParams["figure.dpi"] = 200 # figure resolution
sn.set_style('darkgrid')

# Install a blanket 'ignore' filter for warnings.
# NOTE(review): this can also hide useful diagnostics from the model fits below;
# consider filtering specific warning categories instead.
import warnings
warnings.filterwarnings('ignore')

from pygam import LinearGAM

In [2]:
# Read the raw CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='stedin_electricity_2019.csv')
df.head(n=5)

Unnamed: 0,net_manager,purchase_area,street,zipcode_from,zipcode_to,city,num_connections,delivery_perc,perc_of_active_connections,type_conn_perc,type_of_connection,annual_consume,annual_consume_lowtarif_perc,smartmeter_perc
0,8716874000009,Stedin Utrecht,Egelshoek,1213RC,1231AB,HILVERSUM,30,90.0,100,80,3x25,5909,90.0,53.33
1,8716874000009,Stedin Utrecht,Rembrandtlaan,1231AC,1231AC,LOOSDRECHT,17,100.0,100,41,1x25,3062,94.12,76.47
2,8716874000009,Stedin Utrecht,Rembrandtlaan,1231AD,1231AD,LOOSDRECHT,17,100.0,100,47,3x25,3959,82.35,52.94
3,8716874000009,Stedin Utrecht,Jan Steenlaan,1231AE,1231AE,LOOSDRECHT,20,100.0,100,45,3x25,4230,90.0,45.0
4,8716874000009,Stedin Utrecht,Pieter de Hooghlaan,1231AG,1231AG,LOOSDRECHT,17,94.12,100,71,1x25,3180,94.12,88.24


In [3]:
# Show the dtype pandas inferred for each column, to tell numeric from text columns.
df.dtypes

net_manager                       int64
purchase_area                    object
street                           object
zipcode_from                     object
zipcode_to                       object
city                             object
num_connections                   int64
delivery_perc                   float64
perc_of_active_connections        int64
type_conn_perc                    int64
type_of_connection               object
annual_consume                    int64
annual_consume_lowtarif_perc    float64
smartmeter_perc                 float64
dtype: object

In [4]:
# Number of distinct street names (missing values are not counted).
# NOTE(review): presumably this cardinality is why `street` is not one-hot
# encoded in the next cell -- confirm.
df['street'].nunique()

26612

In [5]:
# One-hot encode the two categorical predictors. Both columns share the same
# 'dummy' prefix, so the resulting column names do not record which source
# column they came from.
# NOTE(review): the preview shows several spellings of what looks like the same
# city (e.g. "'S GRAVENHAGE" / "'S-GRAVENHAGE"); each gets its own dummy column.
categorical_cols = ['city', 'type_of_connection']
df_dummies_categorical = pd.get_dummies(data=df[categorical_cols], prefix='dummy')
df_dummies_categorical.head(n=5)

Unnamed: 0,dummy_'S GRAVENHAGE,dummy_'S-GRAVENDEEL,dummy_'S-GRAVENHAGE,dummy_'T GOY,dummy_ABBENBROEK,dummy_ABCOUDE,dummy_ACHTERVELD,dummy_ACHTHUIZEN,dummy_ACQUOY,dummy_ALBLASSERDAM,...,dummy_ZWARTEWAAL,dummy_ZWIJNDRECHT,dummy_1x25,dummy_1x35,dummy_1x50,dummy_3x25,dummy_3x35,dummy_3x50,dummy_3x63,dummy_3x80
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [6]:
# Assemble the design matrix (three numeric columns + one-hot dummies) and split
# it 70/30 into train and test sets with a fixed seed, using `annual_consume`
# as the target.
numeric_cols = ['num_connections', 'delivery_perc', 'perc_of_active_connections']
model_inputs = pd.concat([df[numeric_cols], df_dummies_categorical], axis=1)

x_train_pre, x_test_pre, y_train, y_test = train_test_split(
    model_inputs,
    df['annual_consume'],
    test_size=.30,
    random_state=12,
)

# Give every split a fresh 0..n-1 index so later column-wise concatenations
# line up row by row.
for _split in (x_train_pre, x_test_pre, y_train, y_test):
    _split.reset_index(inplace=True, drop=True)

In [7]:
# Fit the standardiser on the training rows only, so no test-set statistics
# leak into the scaling parameters.
numeric_cols = ['num_connections', 'delivery_perc', 'perc_of_active_connections']
std_scaler = StandardScaler().fit(x_train_pre[numeric_cols])


In [8]:
# Standardise the numeric training columns with the already-fitted scaler and
# wrap the resulting array back into a labelled DataFrame.
numeric_cols = ['num_connections', 'delivery_perc', 'perc_of_active_connections']
x_train_numeric_scaled = pd.DataFrame(
    std_scaler.transform(x_train_pre[numeric_cols]),
    columns=numeric_cols,
)

x_train_numeric_scaled.reset_index(drop=True, inplace=True)

In [9]:
# Apply the same (training-fitted) scaler to the numeric test columns and wrap
# the resulting array back into a labelled DataFrame.
numeric_cols = ['num_connections', 'delivery_perc', 'perc_of_active_connections']
x_test_numeric_scaled = pd.DataFrame(
    std_scaler.transform(x_test_pre[numeric_cols]),
    columns=numeric_cols,
)

x_test_numeric_scaled.reset_index(drop=True, inplace=True)

In [10]:
# Remove the unscaled numeric columns so that only the dummy columns remain;
# the scaled versions live in x_train_numeric_scaled / x_test_numeric_scaled.
# The frames are reassigned (not mutated in place) and missing columns are
# tolerated, so re-running this cell is a no-op instead of raising KeyError.
numeric_cols = ['num_connections', 'delivery_perc', 'perc_of_active_connections']
x_train_pre = x_train_pre.drop(columns=numeric_cols, errors='ignore')
x_test_pre = x_test_pre.drop(columns=numeric_cols, errors='ignore')


In [11]:
# Sanity check: (rows, columns) of the dummy-only training frame.
x_train_pre.shape

(57542, 271)

In [12]:
# Sanity check: the scaled numeric frame must have the same row count as the dummy frame.
x_train_numeric_scaled.shape

(57542, 3)

In [13]:
# Build the final design matrices: scaled numeric columns first, then the
# dummy columns. Column-wise concatenation aligns on the index, which is why
# both sides were given a fresh 0..n-1 index beforehand.
x_train = pd.concat(objs=[x_train_numeric_scaled, x_train_pre], axis='columns')
x_test = pd.concat(objs=[x_test_numeric_scaled, x_test_pre], axis='columns')

In [14]:
# Fit the three regularised linear models, each tuning its penalty strength by
# 10-fold cross-validation over the estimator's default alpha grid.
# The generator is consumed in order, so the models are fitted as
# Ridge -> Lasso -> ElasticNet.
ridge, lasso, elastic_net = (
    estimator_cls(cv=10).fit(x_train, y_train)
    for estimator_cls in (RidgeCV, LassoCV, ElasticNetCV)
)


In [15]:
# Report hold-out performance for each linear model: median absolute error,
# RMSE and the penalty strength selected by cross-validation.
for _label, _model in (('RIDGE', ridge), ('LASSO', lasso), ('ELASTIC-NET', elastic_net)):
    _test_pred = _model.predict(x_test)  # predict once, reuse for both metrics
    print('\n################ {} #################'.format(_label))
    print('Median Absolute Error: {}'.format(median_absolute_error(y_test, _test_pred)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, _test_pred))))
    print('Parámetro/s encontrados: {}'.format(_model.alpha_))


################ RIDGE #################
Median Absolute Error: 855.8472695399123
RMSE: 2497.208721557891
Parámetro/s encontrados: 1.0

################ LASSO #################
Median Absolute Error: 851.7989472327813
RMSE: 2497.8736074333647
Parámetro/s encontrados: 0.4099269680263932

################ ELASTIC-NET #################
Median Absolute Error: 1033.3370421027473
RMSE: 2933.3716712117916
Parámetro/s encontrados: 0.8198539360527864



--------
## GAM
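Antes de entrenar los GAM voy a reducir la dimensionalidad con PCA: las columnas originales se sustituyen por las componentes principales que explican el 95 % de la varianza.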

In [16]:
# Fit PCA on the training matrix only, keeping as many components as needed to
# explain 95% of the variance, then project both splits onto those components.
pca = PCA(n_components=.95, svd_solver='full').fit(x_train)
x_train_pca, x_test_pca = (pca.transform(_matrix) for _matrix in (x_train, x_test))

In [17]:
# Check how many principal components were kept (second element of the shape).
x_train_pca.shape

(57542, 59)

In [18]:
%%time
gam = LinearGAM(lam = .3).fit(x_train_pca, y_train)


  return matrix(data, dtype=dtype, copy=False)


CPU times: user 2min 55s, sys: 7.88 s, total: 3min 3s
Wall time: 1min 43s


In [19]:
# Print pyGAM's fit report (effective DoF, AIC, GCV, pseudo R-squared, per-term table).
gam.summary()

LinearGAM                                                                                                 
Distribution:                        NormalDist Effective DoF:                                    410.4769
Link Function:                     IdentityLink Log Likelihood:                               -947491.7514
Number of Samples:                        57542 AIC:                                          1895806.4565
                                                AICc:                                         1895812.3983
                                                GCV:                                           5723359.774
                                                Scale:                                        5649916.8702
                                                Pseudo R-Squared:                                    0.375
Feature Function   Data Type      Num Splines   Spline Order  Linear Fit  Lambda     P > x      Sig. Code 
feature 1          numerical      25 

In [20]:
# Hold-out metrics for the fixed-lambda GAM, in the same format as the linear
# models so the numbers can be compared directly.
gam_test_pred = gam.predict(x_test_pca)

print('\n################ LinearGAM #################')
print('Median Absolute Error: {}'.format(median_absolute_error(y_test, gam_test_pred)))
print('RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, gam_test_pred))))
print('Lambda: {}'.format(gam.lam))



################ LinearGAM #################
Median Absolute Error: 789.4770881133586
RMSE: 2464.1188229383934
Lambda: 0.3


### Gridsearch

In [35]:
# Preview the candidate lambda values used by the grid search: 5 evenly spaced points in [0, 100].
np.linspace(0, 100, 5)

array([  0.,  25.,  50.,  75., 100.])

In [34]:
%%time
search_lambda_params = {'lam': np.linspace(0, 100, 5)}
gridsearch_gam = LinearGAM().gridsearch(x_train_pca, y_train, **search_lambda_params)
gridsearch_gam.summary()

  return matrix(data, dtype=dtype, copy=False)
  return matrix(data, dtype=dtype, copy=False)
  return matrix(data, dtype=dtype, copy=False)
  return matrix(data, dtype=dtype, copy=False)
100% (5 of 5) |##########################| Elapsed Time: 0:06:10 Time:  0:06:10


LinearGAM                                                                                                 
Distribution:                        NormalDist Effective DoF:                                    207.5441
Link Function:                     IdentityLink Log Likelihood:                               -948519.3536
Number of Samples:                        57542 AIC:                                          1897455.7954
                                                AICc:                                         1897457.3198
                                                GCV:                                           5789294.786
                                                Scale:                                        5751721.1002
                                                Pseudo R-Squared:                                   0.3614
Feature Function   Data Type      Num Splines   Spline Order  Linear Fit  Lambda     P > x      Sig. Code 
feature 1          numerical      25 

In [22]:
# Hold-out metrics for the grid-searched GAM.
# NOTE(review): the recorded output reports lam = 5.0, which is not one of the
# grid values (0, 25, 50, 75, 100), and the execution counts around this cell
# are out of order -- the stored output looks stale; re-run top to bottom.
gridsearch_test_pred = gridsearch_gam.predict(x_test_pca)

print('\n################ LinearGAM #################')
print('Median Absolute Error: {}'.format(median_absolute_error(y_test, gridsearch_test_pred)))
print('RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, gridsearch_test_pred))))
print('Parámetro/s encontrados: {}'.format(gridsearch_gam.lam))



################ LinearGAM #################
Median Absolute Error: 792.1920210345654
RMSE: 2473.069835517191
Parámetro/s encontrados: 5.0
