In [2]:
# Standard-library, third-party and plotting imports used across the notebook
import json
import time
from tqdm import tqdm
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
# Use a larger default font size for every matplotlib figure
plt.rcParams.update({'font.size': 18})

import sys
# Put the parent directory first on the import path before importing `utils`
# (presumably utils.py lives one level above this notebook — confirm)
sys.path.insert(0, '..')
from utils import create_dir, sample_grouped_train_data, grouped_train_test_split
# Print NumPy arrays without summarization (threshold=sys.maxsize), on very wide
# lines, with 2 digits of floating-point precision
np.set_printoptions(threshold=sys.maxsize, edgeitems=30, linewidth=100000, precision=2)

# Load configuration

In [3]:
# Read the run configuration (JSON) that drives the rest of the notebook
with open('config_regression.json') as config_file:
    config = json.load(config_file)

# Load the full data table from the path given in the configuration
raw_data = pd.read_csv(config['data_path'])

# Column layout: design variables first, then wavevector and mode as extra
# inputs; 'Frequency' is the regression target
des_var_names = config['design_variable_names']
inp_var_names = des_var_names + ['Wavevector', 'Mode']
feature_names = inp_var_names + ['Frequency']

# Sizes taken from the configuration; group_size is passed to the grouped
# train/test split as the number of rows per group
n_mode = config['num_mode']
n_k = config['num_k']
n_des_vars = len(des_var_names)
group_size = n_mode * n_k

# Inputs and target as plain NumPy arrays
data_x = raw_data[inp_var_names].to_numpy()
data_y = raw_data['Frequency'].to_numpy()

FileNotFoundError: [Errno 2] No such file or directory: 'band_info_2D.csv'

# Train/Test split

In [3]:
# All artifacts of this notebook are written under this directory
folder_name = 'results_rf_regression'
create_dir(folder_name)

# 80/20 split via the project helper, used here in place of a plain row-wise
# sklearn train_test_split (presumably it keeps each block of `group_size`
# rows together — see utils.grouped_train_test_split to confirm).
x_train, y_train, x_test, y_test, train_rows, test_rows = grouped_train_test_split(
    data_x, data_y, train_fraction=0.8, group_size=group_size
)
# Sizes are reported from the arrays as returned by the helper, before reshaping
print('# designs for training:', x_train.shape[0])
print('# designs for testing:', x_test.shape[0])

# Collapse to a 2-D (rows, n_inp) input matrix and a 1-D target vector
n_inp = len(inp_var_names)
x_train, x_test = (arr.reshape(-1, n_inp) for arr in (x_train, x_test))
y_train, y_test = (arr.flatten() for arr in (y_train, y_test))

# Get ranges of design variables: one [min, max] row per design variable,
# computed on the training inputs only. The last two input columns
# ('Wavevector', 'Mode') are not design variables and are excluded.
des_vars_train = x_train[:, :-2]
des_var_bounds = np.vstack([des_vars_train.min(axis=0), des_vars_train.max(axis=0)]).T
np.save(f'{folder_name}/des_var_bounds.npy', des_var_bounds)

# designs for training: 366000
# designs for testing: 91500


# Hyperparameter tuning (max tree depth)

In [None]:
# NOTE: this whole cell is disabled (every line is commented out). When enabled it
# fits one RandomForestRegressor per entry in `depths`, prints train/test MAPE and
# MSE for each depth, and writes the table to rf_depth.csv.
# The output kept below this cell matches these print statements and is presumably
# from an earlier run with the code enabled; the final model further down uses
# max_depth=20. Uncomment the cell to reproduce the sweep.

# import matplotlib.pyplot as plt
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
# import pandas as pd
# from joblib import dump 

# depths = [10, 12, 15, 20, 25, 50, 100]
# train_mape_list = []
# test_mape_list = []
# train_mse_list = []
# test_mse_list = []

# for d in depths:
#     model = RandomForestRegressor(
#         n_estimators=config['num_estimators'],
#         criterion='squared_error',
#         max_depth=d,
#         min_samples_split=2,
#         min_samples_leaf=1,
#         random_state=0
#     )
#     model.fit(x_train, y_train)

#     y_pred_train = model.predict(x_train)
#     y_pred_test = model.predict(x_test)

#     # MAPE
#     mape_train = mean_absolute_percentage_error(y_train, y_pred_train)
#     mape_test = mean_absolute_percentage_error(y_test, y_pred_test)

#     # MSE
#     mse_train = mean_squared_error(y_train, y_pred_train)
#     mse_test = mean_squared_error(y_test, y_pred_test)

#     train_mape_list.append(mape_train)
#     test_mape_list.append(mape_test)
#     train_mse_list.append(mse_train)
#     test_mse_list.append(mse_test)

#     print(f"max_depth={d}, "
#           f"Train MAPE={mape_train:.4f}, Test MAPE={mape_test:.4f}, "
#           f"Train MSE={mse_train:.4f}, Test MSE={mse_test:.4f}")

#     # dump(model, f'{folder_name}/forest_depth_{d}.joblib')

# results_df = pd.DataFrame({
#     "max_depth": depths,
#     "train_mape": train_mape_list,
#     "test_mape": test_mape_list,
#     "train_mse": train_mse_list,
#     "test_mse": test_mse_list
# })
# results_df.to_csv("rf_depth.csv", index=False)
# print("Tree depth tuning result saved to rf_depth.csv")

max_depth=10, Train MAPE=0.0242, Test MAPE=0.0275, Train MSE=1.6857, Test MSE=2.2229
max_depth=12, Train MAPE=0.0149, Test MAPE=0.0194, Train MSE=0.7043, Test MSE=1.2255
max_depth=15, Train MAPE=0.0065, Test MAPE=0.0129, Train MSE=0.1594, Test MSE=0.6395
max_depth=20, Train MAPE=0.0016, Test MAPE=0.0102, Train MSE=0.0108, Test MSE=0.4839
max_depth=25, Train MAPE=0.0009, Test MAPE=0.0100, Train MSE=0.0035, Test MSE=0.4772
max_depth=50, Train MAPE=0.0009, Test MAPE=0.0100, Train MSE=0.0033, Test MSE=0.4774
max_depth=100, Train MAPE=0.0009, Test MAPE=0.0100, Train MSE=0.0033, Test MSE=0.4774
Tree depth tuning result saved to rf_depth.csv


# Final model

## Training

In [4]:
# Train the final random forest and record how long fitting takes.
# max_depth=20: in the depth-sweep results recorded in this notebook, test error
# barely changes beyond depth 20 (test MAPE 0.0102 at 20 vs 0.0100 at 25 and above).
# time.perf_counter() is used because it is a monotonic clock intended for
# measuring elapsed intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
reg = RandomForestRegressor(
    n_estimators=config['num_estimators'],
    criterion='squared_error',
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=0,  # fixed seed so repeated runs build the same forest
)
reg.fit(x_train, y_train)  # fit() trains in place and returns the estimator itself
training_time = time.perf_counter() - start_time
print('Training time: {:.2f}s'.format(training_time))

Training time: 59.71s


## Performance evaluation

In [5]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score


def _regression_metrics(y_true, y_pred):
    """Return (MSE, MAE, MAPE, R2) of `y_pred` measured against `y_true`, in that order."""
    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# Evaluate the model on training data
y_pred_train = reg.predict(x_train)
mse_train, mae_train, mape_train, r2_train = _regression_metrics(y_train, y_pred_train)

# Evaluate the model on test data
y_pred_test = reg.predict(x_test)
mse_test, mae_test, mape_test, r2_test = _regression_metrics(y_test, y_pred_test)

# (label, value) pairs in the order they are printed and written to file
metric_report = [
    ('Train MSE', mse_train),
    ('Train MAE', mae_train),
    ('Train MAPE', mape_train),
    ('Train R2', r2_train),
    ('Test MSE', mse_test),
    ('Test MAE', mae_test),
    ('Test MAPE', mape_test),
    ('Test R2', r2_test),
]

# Print results at full precision
for label, value in metric_report:
    print(f'{label}:', value)

# Save results to file: training time first, then every metric rounded to 4 decimals
lines = ['Training Time: {:.2f}s'.format(training_time)]
lines += ['{}: {:.4f}'.format(label, value) for label, value in metric_report]

with open(f'{folder_name}/forest_regression_acc.txt', 'w') as f:
    f.write('\n'.join(lines))


Train MSE: 0.010792286268588072
Train MAE: 0.06469141636542966
Train MAPE: 0.0015859390131624945
Train R2: 0.9999741538684452
Test MSE: 0.48387141808004325
Test MAE: 0.43845454234644915
Test MAPE: 0.010195670560847473
Test R2: 0.9988311004399985


## Model saving

In [6]:
# Persist the fitted forest with joblib. `dump` is left as the last expression,
# so the list of written file names it returns is shown as the cell output.
model_path = f'{folder_name}/forest.joblib'
dump(reg, model_path)

['results_rf_regression/forest.joblib']

# Plot Decision tree

Note: rendering the tree is time-consuming when the tree is large.

In [None]:
# NOTE: this whole cell is disabled (every line is commented out). When enabled it
# takes one tree of the fitted forest (`reg.estimators_[2]`), exports it to Graphviz
# DOT with the input variable names as feature labels, and renders it to a PDF named
# "decision_tree". It needs the `graphviz` Python package (and presumably the
# Graphviz binaries — confirm). Uncomment the cell to run it.

# from sklearn.tree import export_graphviz
# import graphviz
# from IPython.display import display

# tree = reg.estimators_[2]

# dot_data = export_graphviz(tree, out_file=None, 
#                            feature_names=inp_var_names,        
#                            filled=True, rounded=True, 
#                            special_characters=True)  

# graph = graphviz.Source(dot_data)  

# graph.render("decision_tree", format="pdf", cleanup=True)  

# # display(graph)
