# Analysis Tuning Parameters

## Imports

In [11]:
import pandas as pd
import os

# Solo grid search: one results table per tuned hyperparameter.
# The generator reads the five CSVs in the same order as the names on the left.
(mtry_df,
 num_trees_df,
 min_node_size_df,
 replace_df,
 sample_fraction_df) = (
    pd.read_csv(f'resources/results_solo_grid/{stem}.csv')
    for stem in ('mtry', 'num_trees', 'min_node_size', 'replace', 'sample_fraction')
)

# Pairwise random search: every CSV in the directory becomes a module-level
# DataFrame named "<file stem>_df" (later cells look these names up via globals()).
directory = "resources/results_pairwise_random"
for filename in os.listdir(directory):
    if not filename.endswith('.csv'):
        continue  # ignore every directory entry that is not a .csv
    filepath = os.path.join(directory, filename)
    var_name = f"{filename[:-4]}_df"  # strip the ".csv" suffix
    # The 'Unnamed: 0' column is removed right after loading; drop() raises
    # KeyError if a file does not contain it.
    df = pd.read_csv(filepath).drop('Unnamed: 0', axis=1)
    globals()[var_name] = df

# All five hyperparameters tuned together
results_all_bayesian_df = pd.read_csv('resources/results_all_search_algo/tuning_results.csv')
results_all_random_df = pd.read_csv('resources/results_all_random/results.csv')

# Replace '_' with '.' in the random-search column names, leaving the two
# runtime columns untouched. Columns without an underscore map to themselves.
runtime_columns = ['runtime_training', 'runtime_prediction']
column_mapping = {
    col: col.replace('_', '.')
    for col in results_all_random_df.columns
    if col not in runtime_columns
}
results_all_random_df.rename(columns=column_mapping, inplace=True)
results_all_random_df.columns

Index(['Unnamed: 0', 'num.trees', 'sample.fraction', 'mtry', 'replace',
       'min.node.size', 'mse', 'runtime_training', 'runtime_prediction'],
      dtype='object')

## Solo

### MSE

In [None]:
import plotly.graph_objects as go

def _solo_mse_figure(frame, x_column, label):
    """Line chart of ``frame['mse']`` against ``frame[x_column]``.

    ``label`` is used for the x-axis title and as the prefix of the figure title.
    """
    figure = go.Figure(data=[go.Scatter(x=frame[x_column], y=frame['mse'], mode='lines', name='mse')])
    figure.update_layout(title=f'{label} vs mse', xaxis_title=label, yaxis_title='mse')
    return figure


# One PNG per hyperparameter; the figures are written to disk, not displayed.
fig1 = _solo_mse_figure(mtry_df, 'mtry', 'mtry')
fig1.write_image('resources/results_solo_grid/pictures/mtry_vs_mse.png')

# The num_trees table uses an underscore column name but is labelled 'num.trees'.
fig2 = _solo_mse_figure(num_trees_df, 'num_trees', 'num.trees')
fig2.write_image('resources/results_solo_grid/pictures/num_trees_vs_mse.png')

fig3 = _solo_mse_figure(min_node_size_df, 'min.node.size', 'min.node.size')
fig3.write_image('resources/results_solo_grid/pictures/min_node_size_vs_mse.png')

fig4 = _solo_mse_figure(sample_fraction_df, 'sample.fraction', 'sample.fraction')
fig4.write_image('resources/results_solo_grid/pictures/sample_fraction_vs_mse.png')

# 'replace' is drawn as a bar chart rather than a line.
fig5 = go.Figure(data=[go.Bar(x=replace_df['replace'], y=replace_df['mse'])])
fig5.update_layout(title='replace vs mse', xaxis_title='replace', yaxis_title='mse')
fig5.write_image('resources/results_solo_grid/pictures/replace_vs_mse.png')


### Runtime training & prediction

In [None]:
import plotly.graph_objects as go


def _solo_runtime_figure(frame, x_column, label):
    """Line chart with one trace per runtime column of ``frame``, plotted against ``frame[x_column]``.

    ``label`` is used for the x-axis title and as the prefix of the figure title.
    """
    figure = go.Figure()
    for runtime_column in ('runtime_training', 'runtime_prediction'):
        figure.add_trace(go.Scatter(x=frame[x_column], y=frame[runtime_column], mode='lines', name=runtime_column))
    figure.update_layout(title=f'{label} vs runtime', xaxis_title=label, yaxis_title='runtime')
    return figure


# NOTE: the figures are only constructed here. The export calls are commented
# out, so running this cell neither displays nor saves anything.
fig1 = _solo_runtime_figure(mtry_df, 'mtry', 'mtry')
# fig1.write_image("resources/results_solo_grid/pictures/mtry_runtime.png")

fig2 = _solo_runtime_figure(num_trees_df, 'num_trees', 'num.trees')
# fig2.write_image("resources/results_solo_grid/pictures/num_trees_runtime.png")

fig3 = _solo_runtime_figure(min_node_size_df, 'min.node.size', 'min.node.size')
# fig3.write_image("resources/results_solo_grid/pictures/min_node_size_runtime.png")

fig4 = _solo_runtime_figure(sample_fraction_df, 'sample.fraction', 'sample.fraction')
# fig4.write_image("resources/results_solo_grid/pictures/sample_fraction_runtime.png")

# 'replace' gets two bar traces (training and prediction runtime) instead of lines.
fig5 = go.Figure(data=[
    go.Bar(x=replace_df['replace'], y=replace_df[runtime_column], name=runtime_column)
    for runtime_column in ('runtime_training', 'runtime_prediction')
])
fig5.update_layout(title='replace vs runtime', xaxis_title='replace', yaxis_title='runtime')
# fig5.write_image("resources/results_solo_grid/pictures/replace_runtime.png")


### Sensitivity MSE

In [26]:
def _scaled_by_max(frame, column):
    """Return a copy of ``frame`` in which ``column`` is divided by that column's maximum."""
    scaled = frame.copy()
    scaled[column] = scaled[column] / scaled[column].max()
    return scaled


# Put every numeric hyperparameter on a common axis by dividing it by its largest tested value.
normalized_mtry_df = _scaled_by_max(mtry_df, 'mtry')
normalized_num_trees_df = _scaled_by_max(num_trees_df, 'num_trees')
normalized_min_node_size_df = _scaled_by_max(min_node_size_df, 'min.node.size')
normalized_sample_fraction_df = _scaled_by_max(sample_fraction_df, 'sample.fraction')

# 'replace' is copied without any rescaling.
normalized_replace_df = replace_df.copy()

# Stack the five tables for plotting. Each table contributes its own
# hyperparameter column, so the other hyperparameter columns are NaN on its rows.
combined_df = pd.concat([
    normalized_mtry_df,
    normalized_num_trees_df,
    normalized_min_node_size_df,
    normalized_sample_fraction_df,
    normalized_replace_df,
])

In [28]:
# Every column of combined_df that is not a metric is treated as a hyperparameter
# and drawn as its own line against mse.
metric_columns = ("mse", "runtime_training", "runtime_prediction")
fig = go.Figure(data=[
    go.Scatter(x=combined_df[col], y=combined_df['mse'], mode='lines', name=col)
    for col in combined_df.columns
    if col not in metric_columns
])

fig.update_layout(
    title='Hyperparameters vs mse',
    xaxis_title='Hyperparameters',
    yaxis_title='mse',
    legend_title='Hyperparameters',
)

fig.show()

In [27]:
# Sensitivity of the training runtime: one line per hyperparameter column of
# combined_df, each plotted against the 'runtime_training' column.
fig = go.Figure()

# Every column that is not one of the three metrics is treated as a hyperparameter.
for col in combined_df.columns:
    if col not in ("mse", "runtime_training", "runtime_prediction"):
        fig.add_trace(go.Scatter(x=combined_df[col], y=combined_df['runtime_training'], mode='lines', name=col))

# The y axis shows runtime_training; it was previously mislabelled 'mse'.
fig.update_layout(
    title='Hyperparameters vs Training Runtime',
    xaxis_title='Hyperparameters',
    yaxis_title='runtime_training',
    legend_title='Hyperparameters'
)

# Show the figure
fig.show()

In [29]:
# Sensitivity of the prediction runtime: one line per hyperparameter column of
# combined_df, each plotted against the 'runtime_prediction' column.
fig = go.Figure()

# Every column that is not one of the three metrics is treated as a hyperparameter.
for col in combined_df.columns:
    if col not in ("mse", "runtime_training", "runtime_prediction"):
        fig.add_trace(go.Scatter(x=combined_df[col], y=combined_df['runtime_prediction'], mode='lines', name=col))

# The y axis shows runtime_prediction; it was previously mislabelled 'mse'.
fig.update_layout(
    title='Hyperparameters vs Prediction Runtime',
    xaxis_title='Hyperparameters',
    yaxis_title='runtime_prediction',
    legend_title='Hyperparameters'
)

# Show the figure
fig.show()

## Pairwise

### mse

In [None]:
import plotly.graph_objects as go

# Names of the module-level pairwise DataFrames created by the loading cell.
df_names = [
    'num_trees_min_node_size_df',
    'num_trees_replace_df',
    'num_trees_mtry_df',
    'num_trees_sample_fraction_df',
    'mtry_sample_fraction_df',
    'mtry_replace_df',
    'mtry_min_node_size_df',
    'min_node_size_replace_df',
    'sample_fraction_min_node_size_df',
    'sample_fraction_replace_df',
]

for df_name in df_names:
    df = globals()[df_name]

    # Among the first six columns, keep those that hold at least one value;
    # the first two survivors are used as the x and y axes.
    populated_columns = df.iloc[:, :6].dropna(axis=1, how='all').columns
    hyperparameter1, hyperparameter2 = populated_columns[:2]

    # 3D scatter: the two hyperparameters on x/y, mse on z and as marker colour.
    points = go.Scatter3d(
        x=df[hyperparameter1],
        y=df[hyperparameter2],
        z=df['mse'],
        mode='markers',
        marker=dict(size=4, color=df['mse'], colorscale='Viridis', opacity=0.8),
    )
    fig = go.Figure(data=[points])
    fig.update_layout(
        scene=dict(
            xaxis_title=hyperparameter1,
            yaxis_title=hyperparameter2,
            zaxis_title='MSE',
        ),
        width=700,
        margin=dict(r=20, b=10, l=10, t=10),
    )

    # Export as a standalone HTML file (one per pairwise table).
    fig.write_html(f"resources/results_pairwise_random/pictures/{df_name}.html")

### runtime training

In [None]:
import plotly.graph_objects as go

# Names of the module-level pairwise DataFrames created by the loading cell.
df_names = [
    'num_trees_min_node_size_df', 'num_trees_replace_df',
    'num_trees_mtry_df', 'num_trees_sample_fraction_df',
    'mtry_sample_fraction_df', 'mtry_replace_df',
    'mtry_min_node_size_df', 'min_node_size_replace_df',
    'sample_fraction_min_node_size_df', 'sample_fraction_replace_df',
]

for df_name in df_names:
    df = globals()[df_name]

    # Take the first six columns, discard the ones that are entirely NaN,
    # and use the first two remaining column names as the plotted hyperparameters.
    leading = df[df.columns[:6]]
    non_empty = leading.dropna(axis=1, how='all')
    hyperparameter1, hyperparameter2 = non_empty.columns[:2]

    # 3D scatter coloured by the same value that is shown on the z axis.
    training_runtime = df['runtime_training']
    marker_style = {"size": 4, "color": training_runtime, "colorscale": 'Viridis', "opacity": 0.8}
    fig = go.Figure(data=[
        go.Scatter3d(
            x=df[hyperparameter1],
            y=df[hyperparameter2],
            z=training_runtime,
            mode='markers',
            marker=marker_style,
        )
    ])

    scene_titles = {
        "xaxis_title": hyperparameter1,
        "yaxis_title": hyperparameter2,
        "zaxis_title": 'Runtime Training',
    }
    fig.update_layout(scene=scene_titles, width=700, margin={"r": 20, "b": 10, "l": 10, "t": 10})

    # One standalone HTML file per pairwise table.
    fig.write_html(f"resources/results_pairwise_random/pictures/runtime_training/{df_name}.html")

### runtime prediction

In [None]:
import plotly.graph_objects as go

# Names of the module-level pairwise DataFrames created by the loading cell.
df_names = (
    'num_trees_min_node_size_df num_trees_replace_df num_trees_mtry_df '
    'num_trees_sample_fraction_df mtry_sample_fraction_df mtry_replace_df '
    'mtry_min_node_size_df min_node_size_replace_df '
    'sample_fraction_min_node_size_df sample_fraction_replace_df'
).split()

for df_name in df_names:
    df = globals()[df_name]

    # The two plotted hyperparameters are the first two of the leading six
    # columns that are not completely NaN.
    hyperparameter1, hyperparameter2 = (
        df.iloc[:, :6].dropna(axis=1, how='all').columns[:2]
    )

    # 3D scatter: prediction runtime on the z axis and as the marker colour.
    fig = go.Figure()
    fig.add_trace(go.Scatter3d(
        x=df[hyperparameter1],
        y=df[hyperparameter2],
        z=df['runtime_prediction'],
        mode='markers',
        marker=dict(
            size=4,
            color=df['runtime_prediction'],
            colorscale='Viridis',
            opacity=0.8,
        ),
    ))

    fig.update_layout(
        scene=dict(
            xaxis_title=hyperparameter1,
            yaxis_title=hyperparameter2,
            zaxis_title='Runtime Prediction',
        ),
        width=700,
        margin=dict(r=20, b=10, l=10, t=10),
    )

    # One standalone HTML file per pairwise table.
    fig.write_html(f"resources/results_pairwise_random/pictures/runtime_prediction/{df_name}.html")

## All

### Random Search

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Hyperparameters plus the target metric. The loading cell renames the
# random-search columns from '_' to '.', so the dotted names must be used here
# (the underscore names raise KeyError).
df = results_all_random_df[['num.trees', 'mtry', 'sample.fraction', 'min.node.size', 'replace', 'mse']]

# parallel_coordinates hands out colormap colours in order of first appearance
# of each class value. Sorting by mse makes the colour ramp run from the lowest
# to the highest mse, matching the direction of the colour bar below.
df = df.sort_values('mse')

# Rescale every column to [0, 1] so all axes share one vertical scale.
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Parallel Coordinates Plot (each distinct mse value is its own colour class)
plt.figure(figsize=(15, 6))
pd.plotting.parallel_coordinates(df_normalized, 'mse', colormap='RdBu')
plt.title('Parallel Coordinates Plot - Hyperparameter Combinations')
plt.xlabel('Hyperparameters')
plt.ylabel('Relative Value')

# Colour bar over the raw (unscaled) mse range. Line colours are spaced by rank,
# so the bar indicates direction rather than an exact value lookup.
sm = plt.cm.ScalarMappable(cmap='RdBu', norm=plt.Normalize(vmin=df['mse'].min(), vmax=df['mse'].max()))
plt.colorbar(sm, ax=plt.gca())
# Remove the per-line legend (one entry per distinct mse value)
plt.legend().remove()
# To export, enable the next line (it must run before plt.show()):
# plt.savefig('resources/results_all_random/parallel_coordinates_plot.png')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# The loading cell renames the random-search columns from '_' to '.', so the
# dotted names must be used here (the underscore names raise KeyError).
df = results_all_random_df[['num.trees', 'mtry', 'sample.fraction', 'min.node.size', 'replace', 'mse']]

# Keep the 100 rows with the lowest mse (returned in ascending mse order)
df = df.nsmallest(100, 'mse')

# Rescale every column to [0, 1] so all axes share one vertical scale.
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Parallel Coordinates Plot (each distinct mse value is its own colour class)
plt.figure(figsize=(15, 6))
pd.plotting.parallel_coordinates(df_normalized, 'mse', colormap='RdBu')
plt.title('Parallel Coordinates Plot - Top 100 Hyperparameter Combinations')
plt.xlabel('Hyperparameters')
plt.ylabel('Relative Value')
# Remove the per-line legend
plt.legend().remove()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# The loading cell renames the random-search columns from '_' to '.', so the
# dotted names must be used here (the underscore names raise KeyError).
df = results_all_random_df[['num.trees', 'mtry', 'sample.fraction', 'min.node.size', 'replace', 'mse']]

# The 100 rows with the lowest mse
df_best = df.nsmallest(100, 'mse')

# The 100 rows with the highest mse
df_worst = df.nlargest(100, 'mse')

# Concatenate the best and worst dataframes
df_combined = pd.concat([df_best, df_worst])

scaler = MinMaxScaler()

# Rescale every column of the combined dataframe to [0, 1]
df_normalized = pd.DataFrame(scaler.fit_transform(df_combined), columns=df_combined.columns)

# Parallel Coordinates Plot for the best and worst 100
plt.figure(figsize=(15, 6))
pd.plotting.parallel_coordinates(df_normalized, 'mse', colormap='RdBu')
plt.title('Parallel Coordinates Plot - Top and Worst 100 Hyperparameter Combinations regarding MSE')
plt.xlabel('Hyperparameters')
plt.ylabel('Relative Value')
# Remove the per-line legend
plt.legend().remove()
plt.show()

### Runtime training

In [None]:
# The loading cell renames the random-search hyperparameter columns from '_' to
# '.' (the two runtime columns keep their underscores), so the dotted names must
# be used here; the underscore hyperparameter names raise KeyError.
df = results_all_random_df[['num.trees', 'mtry', 'sample.fraction', 'min.node.size', 'replace', 'runtime_training']]

scaler = MinMaxScaler()

# Rescale every column to [0, 1] so all axes share one vertical scale.
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Parallel Coordinates Plot (each distinct runtime value is its own colour class)
plt.figure(figsize=(15, 6))
pd.plotting.parallel_coordinates(df_normalized, 'runtime_training', colormap='RdBu')
plt.title('Parallel Coordinates Plot - Hyperparameter Combinations - Runtime Training')
plt.xlabel('Hyperparameters')
plt.ylabel('Relative Value')
# Remove the per-line legend
plt.legend().remove()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# The loading cell renames the random-search hyperparameter columns from '_' to
# '.' (the two runtime columns keep their underscores), so the dotted names must
# be used here; the underscore hyperparameter names raise KeyError.
df = results_all_random_df[['num.trees', 'mtry', 'sample.fraction', 'min.node.size', 'replace', 'runtime_training']]

# The 100 rows with the shortest training runtime
df_best = df.nsmallest(100, 'runtime_training')

# The 100 rows with the longest training runtime
df_worst = df.nlargest(100, 'runtime_training')

# Concatenate the best and worst dataframes
df_combined = pd.concat([df_best, df_worst])

scaler = MinMaxScaler()

# Rescale every column of the combined dataframe to [0, 1]
df_normalized = pd.DataFrame(scaler.fit_transform(df_combined), columns=df_combined.columns)

# Parallel Coordinates Plot for the best and worst 100
plt.figure(figsize=(15, 6))
pd.plotting.parallel_coordinates(df_normalized, 'runtime_training', colormap='RdBu')
plt.title('Parallel Coordinates Plot - Top and Worst 100 Hyperparameter Combinations regarding Runtime Training')
plt.xlabel('Hyperparameters')
plt.ylabel('Relative Value')
# Remove the per-line legend
plt.legend().remove()
plt.show()

### Runtime Prediction

In [None]:
# The loading cell renames the random-search hyperparameter columns from '_' to
# '.' (the two runtime columns keep their underscores), so the dotted names must
# be used here; the underscore hyperparameter names raise KeyError.
df = results_all_random_df[['num.trees', 'mtry', 'sample.fraction', 'min.node.size', 'replace', 'runtime_prediction']]

scaler = MinMaxScaler()

# Rescale every column to [0, 1] so all axes share one vertical scale.
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Parallel Coordinates Plot (each distinct runtime value is its own colour class)
plt.figure(figsize=(15, 6))
pd.plotting.parallel_coordinates(df_normalized, 'runtime_prediction', colormap='RdBu')
plt.title('Parallel Coordinates Plot - Hyperparameter Combinations - Runtime Prediction')
plt.xlabel('Hyperparameters')
plt.ylabel('Relative Value')
# Remove the per-line legend
plt.legend().remove()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# The loading cell renames the random-search hyperparameter columns from '_' to
# '.' (the two runtime columns keep their underscores), so the dotted names must
# be used here; the underscore hyperparameter names raise KeyError.
df = results_all_random_df[['num.trees', 'mtry', 'sample.fraction', 'min.node.size', 'replace', 'runtime_prediction']]

# The 100 rows with the shortest prediction runtime
df_best = df.nsmallest(100, 'runtime_prediction')

# The 100 rows with the longest prediction runtime
df_worst = df.nlargest(100, 'runtime_prediction')

# Concatenate the best and worst dataframes
df_combined = pd.concat([df_best, df_worst])

scaler = MinMaxScaler()

# Rescale every column of the combined dataframe to [0, 1]
df_normalized = pd.DataFrame(scaler.fit_transform(df_combined), columns=df_combined.columns)

# Parallel Coordinates Plot for the best and worst 100
plt.figure(figsize=(15, 6))
pd.plotting.parallel_coordinates(df_normalized, 'runtime_prediction', colormap='RdBu')
plt.title('Parallel Coordinates Plot - Top and Worst 100 Hyperparameter Combinations regarding Runtime Prediction')
plt.xlabel('Hyperparameters')
plt.ylabel('Relative Value')
# Remove the per-line legend
plt.legend().remove()
plt.show()

### Bayesian Optimization

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Hyperparameter columns plus mse from the Bayesian-optimisation results.
df = results_all_bayesian_df[['num.trees', 'mtry', 'sample.fraction', 'min.node.size', 'replace', 'mse']]

# Min-max scale every column so all axes share the [0, 1] range.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)
df_normalized = pd.DataFrame(scaled_values, columns=df.columns)

# Draw on an explicitly created axes; 'mse' is the class column that drives the line colours.
ax = plt.figure(figsize=(15, 6)).add_subplot(111)
pd.plotting.parallel_coordinates(df_normalized, 'mse', colormap='RdBu', ax=ax)
ax.set_title('Parallel Coordinates Plot - Hyperparameter Combinations')
ax.set_xlabel('Hyperparameters')
ax.set_ylabel('Relative Value')
# A legend is created and immediately removed, leaving the plot without one.
ax.legend().remove()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Hyperparameter columns plus the training time from the Bayesian-optimisation results.
selected_columns = ['num.trees', 'mtry', 'sample.fraction', 'min.node.size', 'replace', 'training_time']
df = results_all_bayesian_df[selected_columns]

# Min-max scale every column so all axes share the [0, 1] range.
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# 'training_time' is the class column that drives the line colours.
plt.figure(figsize=(15, 6))
pd.plotting.parallel_coordinates(df_normalized, 'training_time', colormap='RdBu')
plt.gca().set(
    title='Parallel Coordinates Plot - Hyperparameter Combinations regarding Runtime Training',
    xlabel='Hyperparameters',
    ylabel='Relative Value',
)
# A legend is created and immediately removed, leaving the plot without one.
plt.legend().remove()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Hyperparameter columns plus the prediction time from the Bayesian-optimisation results.
df = results_all_bayesian_df[['num.trees', 'mtry', 'sample.fraction', 'min.node.size', 'replace', 'prediction_time']]

# Rescale every column to [0, 1] so all axes share one vertical scale.
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Parallel Coordinates Plot ('prediction_time' is the class column that drives the line colours)
plt.figure(figsize=(15, 6))
pd.plotting.parallel_coordinates(df_normalized, 'prediction_time', colormap='RdBu')
# Title typo fixed: 'Predcition' -> 'Prediction'
plt.title('Parallel Coordinates Plot - Hyperparameter Combinations regarding Runtime Prediction')
plt.xlabel('Hyperparameters')
plt.ylabel('Relative Value')
# Remove the per-line legend
plt.legend().remove()
plt.show()

### Pareto

In [19]:
# Merge the Bayesian-optimisation and random-search results into one table.

# Bayesian results: drop the "Round" column and give the two timing columns
# the names used by the random-search table.
df_bayes_temp = results_all_bayesian_df.drop(columns=["Round"]).rename(
    columns={
        "training_time": "runtime_training",
        "prediction_time": "runtime_prediction",
    }
)

# Random-search results: drop the "Unnamed: 0" column.
df_random_temp = results_all_random_df.drop(columns=["Unnamed: 0"])

# Stack row-wise; each part keeps its original index labels.
df_results_all = pd.concat([df_bayes_temp, df_random_temp], axis=0)
df_results_all

(560, 8)


In [22]:
import plotly.graph_objects as go

# calculate pareto front
def is_pareto_front(row, frame=None):
    """Return True when ``row`` is not dominated by any row of ``frame``.

    Both objectives ('mse' and 'runtime_prediction') are minimised. Another row
    dominates ``row`` when it is no worse on both objectives and strictly better
    on at least one of them.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide 'mse' and 'runtime_prediction'.
    frame : pandas.DataFrame, optional
        Candidate rows to compare against. Defaults to the module-level
        ``df_results_all`` so the function can be passed directly to
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    bool
        True if no row of ``frame`` dominates ``row``, False otherwise.
    """
    candidates = df_results_all if frame is None else frame
    mse = candidates['mse']
    runtime = candidates['runtime_prediction']

    no_worse = (mse <= row['mse']) & (runtime <= row['runtime_prediction'])
    strictly_better = (mse < row['mse']) | (runtime < row['runtime_prediction'])

    # The previous version returned True when a dominating row existed, i.e. it
    # flagged exactly the dominated points as the front.
    return not (no_worse & strictly_better).any()
# Flag every row with is_pareto_front and store the result as a new column.
df_results_all['pareto_front'] = df_results_all.apply(is_pareto_front, axis=1)

# Split the rows by flag once instead of re-filtering for every axis.
flagged_rows = df_results_all[df_results_all['pareto_front'] == True]
unflagged_rows = df_results_all[df_results_all['pareto_front'] == False]


def _marker_trace(rows, colour, label):
    """Scatter trace of mse (x) against runtime_prediction (y) for ``rows``."""
    return go.Scatter(
        x=rows['mse'],
        y=rows['runtime_prediction'],
        mode='markers',
        marker=dict(color=colour),
        name=label,
    )


# Flagged rows in red, all remaining rows in blue.
pareto_trace = _marker_trace(flagged_rows, 'red', 'Pareto Front')
other_points_trace = _marker_trace(unflagged_rows, 'blue', 'Other Points')

fig = go.Figure(data=[pareto_trace, other_points_trace])
fig.update_layout(
    title='mse vs runtime_prediction',
    xaxis_title='mse',
    yaxis_title='runtime_prediction',
    legend_title='Points Type',
)

fig.show()