In [12]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

Pre-processing

In [8]:
# Read ./Data/processed.csv, then separate the KWH column (the value the
# models below are fitted to) from the remaining predictor columns.
df = pd.read_csv(r'./Data/processed.csv')
y = df['KWH']
X = df.drop(columns=['KWH'])

In [9]:
def _seeded_mutual_info(features, target):
    """Score features by mutual information using a fixed random seed.

    mutual_info_regression draws random noise internally unless random_state
    is given, so without a seed the set of selected columns can change from
    one run to the next.
    """
    return mutual_info_regression(features, target, random_state=42)


# Keep the 100 columns with the highest mutual-information score w.r.t. y.
# NOTE(review): the selector is fitted on the full dataset, i.e. before the
# train/test split, so test rows influence which features are kept. Consider
# fitting it on the training split only.
best_df = SelectKBest(_seeded_mutual_info, k=100).fit_transform(X, y)

In [10]:
# Hold out 30% of the rows for testing; shuffling with a fixed seed keeps the
# split repeatable across runs.
split_options = dict(test_size=0.3, shuffle=True, random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(best_df, y, **split_options)

In [14]:
# Fit the min-max scaler on the training split only, then apply that same
# fitted scaler to both splits.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
# Linear-regression baseline: print its score on the test split first, then
# on the training split.
reg = LinearRegression()
reg.fit(X_train, Y_train)
for features, target in ((X_test, Y_test), (X_train, Y_train)):
    print(reg.score(features, target))

0.9858133554933989
0.986690471221065


Model implementation

In [6]:
# Baseline SGD regressor with the library's default loss and penalty.
# Early stopping is enabled with validation_fraction=0.2 and a fixed seed.
sgd = SGDRegressor(
    random_state=42,
    shuffle=True,
    early_stopping=True,
    validation_fraction=0.2,
)

# The targets produced by train_test_split are named Y_train / Y_test
# (capital Y); use those names so the cell runs on a fresh kernel.
sgd.fit(X_train, Y_train)
print("Score: {}".format(sgd.score(X_test, Y_test)))


Score: 0.8251079463800479


In [None]:
# sgd = SGDRegressor(
#     loss = 'squared_loss',
#     penalty = 'l1',
#     alpha= 0.001,
#     max_iter = 1000,
#     tol = 1e-4,
#     shuffle= True,
#     verbose = 0,
#     random_state = 42,
#     learning_rate = 'adaptive',
#     early_stopping= True ,
#     validation_fraction= 0.2,
# )

In [9]:
# Hyper-parameter grid explored for the SGD regressor.
# NOTE(review): 'squared_loss' was renamed to 'squared_error' in
# scikit-learn 1.0 and removed in 1.2 — switch the name when running on a
# newer release.
params = {
    'loss': ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'penalty': ['l1', 'l2'],
    # Each value is listed once: a repeated alpha only re-fits candidates
    # that are identical to ones already in the grid.
    'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
    'learning_rate': ['optimal', 'invscaling', 'adaptive'],
    'tol': [1e-3, 1e-4, 1e-5],
}

# Exhaustive 5-fold cross-validated search over the grid, ranked by R2.
model = GridSearchCV(sgd, param_grid=params, cv=5, n_jobs=1, scoring='r2', verbose=3)
# The training targets from train_test_split are named Y_train (capital Y).
model.fit(X_train, Y_train)


 loss=squared_epsilon_insensitive, penalty=l2, tol=0.0001; total time=   0.0s
[CV 1/5] END alpha=1e-05, learning_rate=invscaling, loss=squared_epsilon_insensitive, penalty=l2, tol=1e-05; total time=   0.0s
[CV 2/5] END alpha=1e-05, learning_rate=invscaling, loss=squared_epsilon_insensitive, penalty=l2, tol=1e-05; total time=   0.0s
[CV 3/5] END alpha=1e-05, learning_rate=invscaling, loss=squared_epsilon_insensitive, penalty=l2, tol=1e-05; total time=   0.0s
[CV 4/5] END alpha=1e-05, learning_rate=invscaling, loss=squared_epsilon_insensitive, penalty=l2, tol=1e-05; total time=   0.0s
[CV 5/5] END alpha=1e-05, learning_rate=invscaling, loss=squared_epsilon_insensitive, penalty=l2, tol=1e-05; total time=   0.0s
[CV 1/5] END alpha=1e-05, learning_rate=adaptive, loss=squared_loss, penalty=l1, tol=0.001; total time=   0.2s
[CV 2/5] END alpha=1e-05, learning_rate=adaptive, loss=squared_loss, penalty=l1, tol=0.001; total time=   0.2s
[CV 3/5] END alpha=1e-05, learning_rate=adaptive, loss=squar

NameError: name 'estimator' is not defined

In [10]:
# Print the search object together with the parameter combination it selected.
best_params = model.best_params_
summary_template = "\n\n Model: {} \n\n Best params: {} "
print(summary_template.format(model, best_params))



 Model: GridSearchCV(cv=5,
             estimator=SGDRegressor(early_stopping=True, random_state=42,
                                    validation_fraction=0.2),
             n_jobs=1,
             param_grid={'alpha': [1, 0.1, 0.01, 0.001, 0.001, 0.0001, 1e-05],
                         'learning_rate': ['optimal', 'invscaling', 'adaptive'],
                         'loss': ['squared_loss', 'huber',
                                  'epsilon_insensitive',
                                  'squared_epsilon_insensitive'],
                         'penalty': ['l1', 'l2'],
                         'tol': [0.001, 0.0001, 1e-05]},
             scoring='r2', verbose=3) 

 Best params: {'alpha': 1e-05, 'learning_rate': 'adaptive', 'loss': 'squared_epsilon_insensitive', 'penalty': 'l1', 'tol': 0.001} 


In [11]:
# Re-create the SGD regressor with the parameter combination reported as best
# by the grid search, keeping the same seed and early-stopping settings.
sgd = SGDRegressor(
    random_state=42,
    shuffle=True,
    early_stopping=True,
    validation_fraction=0.2,
    loss='squared_epsilon_insensitive',
    penalty='l1',
    alpha=1e-05,
    learning_rate='adaptive',
    tol=0.001,
)
# The training targets from train_test_split are named Y_train (capital Y).
sgd.fit(X_train, Y_train)

SGDRegressor(alpha=1e-05, early_stopping=True, learning_rate='adaptive',
             loss='squared_epsilon_insensitive', penalty='l1', random_state=42,
             validation_fraction=0.2)

In [13]:
# Report R2, MSE and MAE of the tuned `sgd` regressor on both splits.
# All three metrics are taken from the same estimator (`sgd`) so the numbers
# describe one model, and the targets use the Y_train / Y_test names created
# by train_test_split.
print("\n\n Train summary: \n\n")
print("\n\n R2 Score: {} ".format(sgd.score(X_train, Y_train)))

y_train_pred = sgd.predict(X_train)
print("\n\n MSE Score: {} ".format(mean_squared_error(Y_train, y_train_pred)))
print("\n\n MAE Score: {} ".format(mean_absolute_error(Y_train, y_train_pred)))

print("\n\n Test summary: \n\n")
y_test_pred = sgd.predict(X_test)

print("\n\n R2 Test Score: {} ".format(sgd.score(X_test, Y_test)))

print("\n\n MSE Score: {} ".format(mean_squared_error(Y_test, y_test_pred)))
print("\n\n MAE Score: {} ".format(mean_absolute_error(Y_test, y_test_pred)))



 Train summary: 




 R2 Score: 0.9499688690451299 


 MSE Score: 3039695.1716091353 


 MAE Score: 1124.2350642041351 


 Test summary: 




 R2 Test Score: 0.9503455658056125 


 MSE Score: 2619006.1415181463 


 MAE Score: 1085.7797531133097 


In [14]:
# Compare the first ten test predictions with the corresponding true targets.
# The test targets from train_test_split are named Y_test (capital Y).
print(y_test_pred[:10])
print(list(Y_test[:10]))

[13233.9367092   2708.7482805  27548.10541351 23462.20094868
  3662.84483576 10571.55515894  3285.0298734  20020.33122668
  1400.71838817 14443.87892858]
[13796, 2467, 29209, 24099, 4501, 10469, 3997, 21627, 3101, 13540]


In [19]:
# Repeat the experiment on every predictor column (no SelectKBest step):
# reload, split, scale, fit the tuned SGD configuration and print its test score.
df = pd.read_csv(r'./Data/processed.csv')
y = df['KWH']
X = df.drop(columns=['KWH'])

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# Scaling statistics come from the training split only.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

sgd_settings = dict(
    random_state=42,
    shuffle=True,
    early_stopping=True,
    validation_fraction=0.2,
    loss='squared_epsilon_insensitive',
    penalty='l1',
    alpha=1e-05,
    learning_rate='adaptive',
    tol=0.001,
)
sgd = SGDRegressor(**sgd_settings)
sgd.fit(X_train, Y_train)

test_score = sgd.score(X_test, Y_test)
print("Score: {}".format(test_score))

Score: 0.9515730125936807


In [21]:
# Report R2, MSE and MAE of `sgd` on the training and test splits.
# Every metric uses the Y_train / Y_test targets produced by the split that
# `sgd` was fitted on, so the cell does not depend on a leftover `y_test`.
print("\n\n Train summary: \n\n")
print("\n\n R2 Score: {} ".format(sgd.score(X_train, Y_train)))

y_train_pred = sgd.predict(X_train)
print("\n\n MSE Score: {} ".format(mean_squared_error(Y_train, y_train_pred)))
print("\n\n MAE Score: {} ".format(mean_absolute_error(Y_train, y_train_pred)))

print("\n\n Test summary: \n\n")
y_test_pred = sgd.predict(X_test)

print("\n\n R2 Test Score: {} ".format(sgd.score(X_test, Y_test)))

print("\n\n MSE Score: {} ".format(mean_squared_error(Y_test, y_test_pred)))
print("\n\n MAE Score: {} ".format(mean_absolute_error(Y_test, y_test_pred)))



 Train summary: 




 R2 Score: 0.9599462478664899 


 MSE Score: 2433508.7902546925 


 MAE Score: 1048.8571534582022 


 Test summary: 




 R2 Test Score: 0.9515730125936807 


 MSE Score: 2554264.8806721875 


 MAE Score: 1123.5811108095636 
