In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer

In [2]:
df=pd.read_csv("models_cc_data/df_cc_init.csv")
df.drop('Unnamed: 0', axis=1, inplace=True)
df

Unnamed: 0,id,device_id,active_power,direct_power,charge_capacity,date_time,T,U,Ff,RRR,...,date_time_b,T_b,U_b,Ff_b,RRR_b,DD_WE_b,DD_NS_b,Po(p)_b,POA_b,hour
0,19128,6,65.24,66.29,195.2,2023-09-14 11:00:00,31.2,60.0,2.0,0.0,...,2023-09-14 08:00:00,27.5,83.0,0.0,0.0,-1.0,0.0,996.39,441.37,11
1,19129,7,72.97,74.12,196.3,2023-09-14 11:00:00,31.2,60.0,2.0,0.0,...,2023-09-14 08:00:00,27.5,83.0,0.0,0.0,-1.0,0.0,996.39,441.37,11
2,19130,8,65.92,66.98,199.0,2023-09-14 11:00:00,31.2,60.0,2.0,0.0,...,2023-09-14 08:00:00,27.5,83.0,0.0,0.0,-1.0,0.0,996.39,441.37,11
3,19131,9,72.45,73.67,232.2,2023-09-14 11:00:00,31.2,60.0,2.0,0.0,...,2023-09-14 08:00:00,27.5,83.0,0.0,0.0,-1.0,0.0,996.39,441.37,11
4,19132,10,60.01,60.98,182.0,2023-09-14 11:00:00,31.2,60.0,2.0,0.0,...,2023-09-14 08:00:00,27.5,83.0,0.0,0.0,-1.0,0.0,996.39,441.37,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2390,50268,6,8.44,8.78,55.4,2024-02-22 14:00:00,8.0,96.0,2.0,4.0,...,2024-02-22 11:00:00,8.5,98.0,2.0,5.0,1.0,0.0,1011.46,959.15,14
2391,50269,7,7.66,7.94,55.2,2024-02-22 14:00:00,8.0,96.0,2.0,4.0,...,2024-02-22 11:00:00,8.5,98.0,2.0,5.0,1.0,0.0,1011.46,959.15,14
2392,50270,8,8.21,8.45,54.5,2024-02-22 14:00:00,8.0,96.0,2.0,4.0,...,2024-02-22 11:00:00,8.5,98.0,2.0,5.0,1.0,0.0,1011.46,959.15,14
2393,50271,9,9.55,9.84,58.0,2024-02-22 14:00:00,8.0,96.0,2.0,4.0,...,2024-02-22 11:00:00,8.5,98.0,2.0,5.0,1.0,0.0,1011.46,959.15,14


In [3]:
from sklearn.model_selection import train_test_split

features = ['T', 'U', 'Ff',	'RRR', 'DD_WE', 'DD_NS', 'POA', 'Po(p)', 'hour',
            'T_b', 'U_b', 'Ff_b', 'RRR_b', 'DD_WE_b', 'DD_NS_b', 'POA_b', 'Po(p)_b', 'device_id']

X = df[features]
y = df['cc_diff']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=100)

X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())
y_train = y_train.fillna(y_train.mean())
y_test = y_test.fillna(y_test.mean())

In [6]:
def neg_mse_non_negative(y_true, y_pred):
    y_pred_non_negative = np.maximum(0, y_pred)
    return -mean_squared_error(y_true, y_pred_non_negative)
non_negative_mse_scorer = make_scorer(neg_mse_non_negative, greater_is_better=True)

In [7]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print('knn:')
print('MAE:', mean_absolute_error(y_test, y_pred))
# print('MAPE:', np.mean(np.abs((y_test - y_pred)/y_test)))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test, y_pred))

knn:
MAE: 10.206638830897704
MSE: 383.7861402922756
RMSE: 19.59046044104823
R2: 0.9305002796126414


In [11]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_dist = {
    'n_neighbors': randint(1, 20),  # uniform discrete random variables
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'leaf_size': randint(20, 50),
    'p': [1, 2]
}

knn = KNeighborsRegressor()

scorer = make_scorer(mean_squared_error, greater_is_better=False)

knn_random = RandomizedSearchCV(knn, param_distributions=param_dist, n_iter=100, cv=5, scoring=scorer, n_jobs=-1)
knn_random.fit(X_train, y_train)

print("Best parameters:", knn_random.best_params_)
print("RMSE(cv):", np.sqrt(-knn_random.best_score_))

knn_best = knn_random.best_estimator_

knn_best.fit(X_train, y_train)
y_pred = knn_best.predict(X_test)

print("knn_after_adjust:")
print('MAE:', mean_absolute_error(y_test, y_pred))
# print('MAPE:', np.mean(np.abs((y_test - y_pred)/y_test)))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test, y_pred))
print()

Best parameters: {'leaf_size': 34, 'metric': 'manhattan', 'n_neighbors': 11, 'p': 1, 'weights': 'distance'}
RMSE(cv): 15.885831040038612
knn_after_adjust:
MAE: 8.502548371743911
MSE: 303.21617402165566
RMSE: 17.41310351492966
R2: 0.9450906713426882



In [9]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=100)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
# params = dt.get_params()
# print(params)

print("dt:")
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test, y_pred))

dt:
MAE: 5.640292275574113
MSE: 238.5300835073069
RMSE: 15.444419170279824
R2: 0.9568046566374018


In [12]:
param_distributions = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11),
    'max_features': [None, 1, 'sqrt', 'log2'],
    'criterion': ['squared_error']
}

dt = DecisionTreeRegressor(random_state=100)

dt_random = RandomizedSearchCV(
    estimator=dt,
    param_distributions=param_distributions,
    n_iter=100,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=100,
    verbose=1 
)

dt_random.fit(X_train, y_train)

print("Best parameters:", dt_random.best_params_)
print("RMSE(cv):", np.sqrt(-dt_random.best_score_))

best_model = dt_random.best_estimator_

y_pred = best_model.predict(X_test)
print("dt_after_adjust:")
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test, y_pred))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters: {'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 40, 'criterion': 'squared_error'}
RMSE(cv): 13.966638719526618
dt_after_adjust:
MAE: 6.248155880306194
MSE: 249.08870157736024
RMSE: 15.782544204828328
R2: 0.9548925995657557


In [13]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("rf:")
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test, y_pred))

rf:
MAE: 4.614807933194157
MSE: 103.44614066597066
RMSE: 10.17084758837584
R2: 0.98126696851022


In [14]:
# Define a distribution of parameters to search
param_distributions = {
    'n_estimators': [10, 50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1, 'sqrt', 'log2']
}

rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions,
                               n_iter=100, cv=5, verbose=2, random_state=100, n_jobs=-1,
                               scoring=make_scorer(mean_squared_error, greater_is_better=False))

rf_random.fit(X_train, y_train)

print("Best parameters:", rf_random.best_params_)
print("RMSE(cv):", np.sqrt(-rf_random.best_score_))

best_model = rf_random.best_estimator_

y_pred = best_model.predict(X_test)
print("rf_after_adjust:")
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test, y_pred))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': None}
RMSE(cv): 13.235320831516841
rf_after_adjust:
MAE: 7.0208953190569545
MSE: 208.87700907080273
RMSE: 14.45257793858254
R2: 0.9621745232521605


In [15]:
from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor(n_estimators=500, learning_rate=1, max_depth=5, random_state=100)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)

print("gb:")
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test, y_pred))


gb:
MAE: 5.943131638072594
MSE: 140.24473324573034
RMSE: 11.842496917699846
R2: 0.9746031220956675


In [16]:
param_distributions = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['sqrt', 'log2', None]
}

gb = GradientBoostingRegressor()

gb_random = RandomizedSearchCV(gb, param_distributions=param_distributions, 
                                   n_iter=100, cv=5, verbose=2, random_state=42, 
                                   n_jobs=-1, scoring=make_scorer(mean_squared_error, greater_is_better=False))

gb_random.fit(X_train, y_train)

print("Best parameters:", gb_random.best_params_)
print("RMSE(cv):", np.sqrt(-gb_random.best_score_))

best_model = gb_random.best_estimator_

y_pred = best_model.predict(X_test)
print("gb_after_adjust:")
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test, y_pred))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters: {'n_estimators': 400, 'min_samples_split': 8, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 6, 'learning_rate': 0.2}
RMSE(cv): 6.804324606306268
gb_after_adjust:
MAE: 3.552306611159305
MSE: 79.04694499514233
RMSE: 8.8908348874075
R2: 0.9856854117492271


In [17]:
import lightgbm as lgb
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.2,
    'feature_fraction': 0.9,
}
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)

gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=[lgb_test])
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

print("gbm:")
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test, y_pred))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000606 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1512
[LightGBM] [Info] Number of data points in the train set: 1916, number of used features: 18
[LightGBM] [Info] Start training from score 64.584864
gbm:
MAE: 3.972775193264801
MSE: 83.74132101995828
RMSE: 9.151028413241775
R2: 0.984835308561891


In [18]:
param_distributions = {
    'num_leaves': [20, 30, 40, 50, 60],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [-1, 5, 10, 15],
    'min_child_samples': [20, 30, 40],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

gbm = lgb.LGBMRegressor()

# RandomizedSearchCV setup
gbm_random = RandomizedSearchCV(estimator=gbm, param_distributions=param_distributions,
                                   n_iter=100, cv=5, verbose=2, random_state=100, 
                                   n_jobs=-1, scoring='neg_mean_squared_error')

gbm_random.fit(X_train, y_train)

print("Best parameters:", gbm_random.best_params_)
print("RMSE(cv):", np.sqrt(-gbm_random.best_score_))

best_model = gbm_random.best_estimator_
y_pred = best_model.predict(X_test)

print("gbm_after_adjust:")
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test, y_pred))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   0.4s
[CV] END max_depth=None, max_features=1, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   0.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   0.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.4s
[CV] END max_depth=50, max_features=1, min_samples_leaf=2, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=500; total time=   1.3s
[CV] END max_depth=30, max_features=1, min_samples_leaf=1, min_samples_split=2, n_estimators=10