In [10]:
import os
import json
import numpy as np
import plotly.graph_objects as go
from scipy.stats import sem, t
from sklearn.linear_model import LinearRegression

In [11]:
def load_jet_colorscale():
    return [
        [0.0, 'rgb(0,0,131)'], [0.125, 'rgb(0,60,170)'], [0.25, 'rgb(5,255,255)'],
        [0.375, 'rgb(0,220,60)'], [0.5, 'rgb(255,255,0)'], [0.625, 'rgb(255,130,0)'],
        [0.75, 'rgb(255,0,0)'], [0.875, 'rgb(128,0,0)'], [1.0, 'rgb(128,0,0)']
    ]

num_features = []
num_nonzero_features = []
mean_metrics = []

directory = 'dt_grid_search_results'

In [12]:
print("Starting to process JSON files...")
for filename in os.listdir(directory):
    if filename.endswith('.json'):
        with open(os.path.join(directory, filename), 'r') as f:
            data = json.load(f)
            feature_importance = data["feature_importances"]
            total_features = len(feature_importance)
            non_zero_features = sum(1 for v in feature_importance.values() if v > 0)
            mean_score = np.mean(data["cross_validation_scores"])
            num_features.append(total_features)
            num_nonzero_features.append(non_zero_features)
            mean_metrics.append(mean_score)
        print(f"Processed {filename}: Total Features={total_features}, Non-Zero Features={non_zero_features}, Mean Score={mean_score}")

Starting to process JSON files...
Processed experiment_740.json: Total Features=610, Non-Zero Features=53, Mean Score=0.42031950126631595
Processed experiment_1290.json: Total Features=60, Non-Zero Features=19, Mean Score=0.40940970192869663
Processed experiment_310.json: Total Features=1040, Non-Zero Features=18, Mean Score=0.414767192674849
Processed experiment_83.json: Total Features=1267, Non-Zero Features=23, Mean Score=0.4146697837521917
Processed experiment_605.json: Total Features=745, Non-Zero Features=6, Mean Score=0.42031950126631595
Processed experiment_255.json: Total Features=1095, Non-Zero Features=27, Mean Score=0.42577440093512564
Processed experiment_486.json: Total Features=864, Non-Zero Features=35, Mean Score=0.43756088057666087
Processed experiment_1156.json: Total Features=194, Non-Zero Features=33, Mean Score=0.40395480225988695
Processed experiment_193.json: Total Features=1157, Non-Zero Features=9, Mean Score=0.42041691018897326
Processed experiment_1013.json:

In [13]:
sorted_indices = np.argsort(num_features)
num_features_sorted = np.array(num_features)[sorted_indices]
mean_metrics_sorted = np.array(mean_metrics)[sorted_indices]

X = num_features_sorted.reshape(-1, 1)
y = mean_metrics_sorted
regressor = LinearRegression()
regressor.fit(X, y)
y_pred = regressor.predict(X)
confidence = 0.95
n = len(y)
mean_x = np.mean(X)
t_value = t.ppf((1 + confidence) / 2., n - 1)

y_err = y - y_pred
s_err = np.sqrt(np.sum(y_err**2) / (n - 2))
conf_interval = t_value * s_err * np.sqrt(1/n + (X - mean_x)**2 / np.sum((X - mean_x)**2))
upper_bound = y_pred + conf_interval.flatten()
lower_bound = y_pred - conf_interval.flatten()

fig1 = go.Figure()
fig1.add_trace(go.Scatter(x=num_features_sorted, y=mean_metrics_sorted, mode='markers', marker=dict(color='blue'), name='Data Points'))
fig1.add_trace(go.Scatter(x=num_features_sorted, y=y_pred, mode='lines', line=dict(color='red'), name='Regression Line'))
fig1.add_trace(go.Scatter(x=np.concatenate([num_features_sorted, num_features_sorted[::-1]]),
                          y=np.concatenate([upper_bound, lower_bound[::-1]]),
                          fill='toself', fillcolor='rgba(255, 0, 0, 0.2)', line=dict(color='rgba(255, 0, 0, 0)'), 
                          opacity=0.5, showlegend=False))

fig1.update_layout(title='Number of Features in Decision Tree vs Classification Report', title_x=0.46, xaxis_title='Number of Features in Decision Tree', yaxis_title='Weighted F1 Average', width=1400, height=700)
fig1.show()

In [14]:
assert len(num_features) == len(set(num_features))

In [15]:
sorted_indices = np.argsort(num_nonzero_features)
num_nonzero_features_sorted = np.array(num_nonzero_features)[sorted_indices]
mean_metrics_sorted = np.array(mean_metrics)[sorted_indices]

X = num_nonzero_features_sorted.reshape(-1, 1)
y = mean_metrics_sorted
regressor = LinearRegression()
regressor.fit(X, y)
y_pred = regressor.predict(X)

confidence = 0.95
n = len(y)
mean_x = np.mean(X)
t_value = t.ppf((1 + confidence) / 2., n - 1)
y_err = y - y_pred
s_err = np.sqrt(np.sum(y_err**2) / (n - 2))
conf_interval = t_value * s_err * np.sqrt(1/n + (X - mean_x)**2 / np.sum((X - mean_x)**2))

upper_bound = y_pred + conf_interval.flatten()
lower_bound = y_pred - conf_interval.flatten()

fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=num_nonzero_features_sorted, y=mean_metrics_sorted, mode='markers', marker=dict(color='blue'), name='Data Points'))
fig2.add_trace(go.Scatter(x=num_nonzero_features_sorted, y=y_pred, mode='lines', line=dict(color='red'), name='Regression Line'))
fig2.add_trace(go.Scatter(x=np.concatenate([num_nonzero_features_sorted, num_nonzero_features_sorted[::-1]]),
                          y=np.concatenate([upper_bound, lower_bound[::-1]]),
                          fill='toself', fillcolor='rgba(255, 0, 0, 0.2)', line=dict(color='rgba(255, 0, 0, 0)'), 
                          opacity=0.5, showlegend=False))

fig2.update_layout(title='Number of Informative Features in Decision Tree vs Classification Report', title_x=0.46, 
                   xaxis_title='Number of Informative Features in Decision Tree', yaxis_title='Weighted F1 Average', 
                   width=1400, height=700)
fig2.show()

In [16]:
import plotly.figure_factory as ff

fig = ff.create_distplot([mean_metrics], group_labels=['Metric Values'], show_hist=True, show_curve=True, show_rug=False, bin_size=0.006)

fig.data[0].marker.color = 'blue'  
fig.data[1].line.color = 'red'     

fig.update_layout(
    title='Distribution of Weighted F1',
    title_x=0.46,
    xaxis_title='Weighted F1 Average Value',
    yaxis_title='Density',
    width=1400,
    height=700
)

fig.show()

In [17]:
fig_box = go.Figure()
fig_box.add_trace(go.Box(x=mean_metrics, boxpoints='all', jitter=0.3, pointpos=0, fillcolor='rgba(0, 0, 255, 0.3)', line=dict(color='blue'), name='Metric'))

fig_box.update_layout(
    title='Box Plot of Weighted F1',
    title_x=0.5,
    xaxis_title='Weighted F1 Average Value',
    width=1300,
    height=400
)

fig_box.show()

In [18]:
from scipy.stats import kstest, shapiro, anderson
import numpy as np

mean_metrics_array = np.array(mean_metrics)
ks_statistic, ks_p_value = kstest(mean_metrics_array, 'norm', args=(mean_metrics_array.mean(), mean_metrics_array.std()))
print("Kolmogorov-Smirnov Test:")
print(f"Statistic: {ks_statistic}, P-value: {ks_p_value}")
if ks_p_value < 0.05:
    print("Result: Reject the null hypothesis - Data is not normally distributed.")
else:
    print("Result: Fail to reject the null hypothesis - Data may be normally distributed.")
print("\n")

sw_statistic, sw_p_value = shapiro(mean_metrics_array)
print("Shapiro-Wilk Test:")
print(f"Statistic: {sw_statistic}, P-value: {sw_p_value}")
if sw_p_value < 0.05:
    print("Result: Reject the null hypothesis - Data is not normally distributed.")
else:
    print("Result: Fail to reject the null hypothesis - Data may be normally distributed.")
print("\n")

ad_result = anderson(mean_metrics_array, dist='norm')
print("Anderson-Darling Test:")
print(f"Statistic: {ad_result.statistic}")
for i, significance_level in enumerate(ad_result.significance_level):
    print(f"At {significance_level}% significance level: Critical Value = {ad_result.critical_values[i]}")
    if ad_result.statistic > ad_result.critical_values[i]:
        print(f"Result: Reject the null hypothesis at {significance_level}% significance level - Data is not normally distributed.")
    else:
        print(f"Result: Fail to reject the null hypothesis at {significance_level}% significance level - Data may be normally distributed.")

Kolmogorov-Smirnov Test:
Statistic: 0.07610474171764503, P-value: 3.0532071880019324e-07
Result: Reject the null hypothesis - Data is not normally distributed.


Shapiro-Wilk Test:
Statistic: 0.9868150381472779, P-value: 1.0350664333715496e-09
Result: Reject the null hypothesis - Data is not normally distributed.


Anderson-Darling Test:
Statistic: 6.120383445608013
At 15.0% significance level: Critical Value = 0.574
Result: Reject the null hypothesis at 15.0% significance level - Data is not normally distributed.
At 10.0% significance level: Critical Value = 0.654
Result: Reject the null hypothesis at 10.0% significance level - Data is not normally distributed.
At 5.0% significance level: Critical Value = 0.785
Result: Reject the null hypothesis at 5.0% significance level - Data is not normally distributed.
At 2.5% significance level: Critical Value = 0.915
Result: Reject the null hypothesis at 2.5% significance level - Data is not normally distributed.
At 1.0% significance level: Cri