In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [3]:
df=pd.read_csv("models_cc_data/df_cc_init.csv")
df.drop('Unnamed: 0', axis=1, inplace=True)
df

Unnamed: 0,id,device_id,active_power,direct_power,charge_capacity,T,U,Ff,RRR,DD_WE,...,active_power_b,T_b,U_b,Ff_b,RRR_b,DD_WE_b,DD_NS_b,Po(p)_b,POA_b,hour
0,19128,6,65.24,66.29,195.2,31.2,60.0,2.0,0.0,1,...,31.25,27.5,83.0,0.0,0.0,-1.0,0.0,996.39,441.37,11
1,19129,7,72.97,74.12,196.3,31.2,60.0,2.0,0.0,1,...,35.22,27.5,83.0,0.0,0.0,-1.0,0.0,996.39,441.37,11
2,19130,8,65.92,66.98,199.0,31.2,60.0,2.0,0.0,1,...,39.62,27.5,83.0,0.0,0.0,-1.0,0.0,996.39,441.37,11
3,19131,9,72.45,73.67,232.2,31.2,60.0,2.0,0.0,1,...,50.39,27.5,83.0,0.0,0.0,-1.0,0.0,996.39,441.37,11
4,19132,10,60.01,60.98,182.0,31.2,60.0,2.0,0.0,1,...,25.24,27.5,83.0,0.0,0.0,-1.0,0.0,996.39,441.37,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2390,50268,6,8.44,8.78,55.4,8.0,96.0,2.0,4.0,1,...,4.75,8.5,98.0,2.0,5.0,1.0,0.0,1011.46,959.15,14
2391,50269,7,7.66,7.94,55.2,8.0,96.0,2.0,4.0,1,...,4.74,8.5,98.0,2.0,5.0,1.0,0.0,1011.46,959.15,14
2392,50270,8,8.21,8.45,54.5,8.0,96.0,2.0,4.0,1,...,4.91,8.5,98.0,2.0,5.0,1.0,0.0,1011.46,959.15,14
2393,50271,9,9.55,9.84,58.0,8.0,96.0,2.0,4.0,1,...,5.23,8.5,98.0,2.0,5.0,1.0,0.0,1011.46,959.15,14


In [4]:
from sklearn.model_selection import train_test_split

features = ['T', 'U', 'Ff',	'RRR', 'DD_WE', 'DD_NS', 'POA', 'Po(p)', 'hour',
            'T_b', 'U_b', 'Ff_b', 'RRR_b', 'DD_WE_b', 'DD_NS_b', 'POA_b', 'Po(p)_b', 'device_id']

X = df[features]
y = df['cc_diff']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=100)

X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())
y_train = y_train.fillna(y_train.mean())
y_test = y_test.fillna(y_test.mean())

In [14]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print('knn:')
print('MAE:', mean_absolute_error(y_test, y_pred))
# print('MAPE:', np.mean(np.abs((y_test - y_pred)/y_test)))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test, y_pred))
print()

knn:
MAE: 10.202630480167015
MSE: 383.5932584551149
RMSE: 19.585536971324398
R2: 0.9305352085283658



In [10]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from scipy.stats import randint

# Define the parameter distribution
param_dist = {
    'n_neighbors': randint(1, 20),  # uniform discrete random variables
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'leaf_size': randint(20, 50),
    'p': [1, 2]
}

# Define the model
knn = KNeighborsRegressor()

# Define the scorer
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Setup the randomized search
random_search = RandomizedSearchCV(knn, param_distributions=param_dist, n_iter=100, cv=5, scoring=scorer, n_jobs=-1)

# Fit the random search to the data
random_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters:", random_search.best_params_)
print("Best score (negative mean squared error):", random_search.best_score_)

# Use the best parameters for your model
knn_best = random_search.best_estimator_

Best parameters: {'leaf_size': 46, 'metric': 'manhattan', 'n_neighbors': 12, 'p': 2, 'weights': 'distance'}
Best score (negative mean squared error): -252.45984234741908


In [17]:
knn_best.fit(X_train, y_train)
y_pred = knn_best.predict(X_test)

print("knn_after_adjust:")
print('MAE:', mean_absolute_error(y_test, y_pred))
# print('MAPE:', np.mean(np.abs((y_test - y_pred)/y_test)))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test, y_pred))
print()

knn_after_adjust:
MAE: 8.55702128380056
MSE: 303.12851378319976
RMSE: 17.410586256160354
R2: 0.9451065457097436



In [15]:
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor(random_state=100)
dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_test)

print("dtr:")
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test, y_pred))

dtr:
MAE: 5.717745302713988
MSE: 245.89887265135698
RMSE: 15.681162987844907
R2: 0.9554702447570903


In [25]:
from sklearn.model_selection import train_test_split

# Define the parameter space
param_distributions = {
    'max_depth': [None, 10, 20, 30, 40, 50],  # None means no maximum depth
    'min_samples_split': np.arange(2, 11),    # Minimum number of samples required to split an internal node
    'min_samples_leaf': np.arange(1, 11),     # Minimum number of samples required to be at a leaf node
    'max_features': [None, 1, 'sqrt', 'log2'],  # Number of features to consider when looking for the best split
    'criterion': ['squared_error']  # Function to measure the quality of a split
}

# Create a DecisionTreeRegressor instance
dtr = DecisionTreeRegressor(random_state=100)

# Set up the RandomizedSearchCV instance
random_search = RandomizedSearchCV(
    estimator=dtr,
    param_distributions=param_distributions,
    n_iter=100,  # Number of parameter settings that are sampled
    scoring='neg_mean_squared_error',  # Can change to other metrics
    cv=5,  # Number of folds in cross-validation
    random_state=42,
    verbose=1  # For logging output
)

# Fit the RandomizedSearchCV instance to the training data
random_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
print("Best parameters:", random_search.best_params_)
print("Best score (negative MSE):", random_search.best_score_)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters: {'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 40, 'criterion': 'squared_error'}
Best score (negative MSE): -192.003362685875


In [26]:
# Retrieve the best model (retrained on the whole training set)
best_model = random_search.best_estimator_

# Optionally, evaluate the best model on the test set
y_pred = best_model.predict(X_test)
print("dtr_after_adjust:")
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test, y_pred))

dtr_after_adjust:
MAE: 6.8113937767173685
MSE: 274.82323724525304
RMSE: 16.57779349748491
R2: 0.9502323399955299
