In [1]:
import os
import json
import numpy as np
import pandas as pd

In [2]:
# Per-experiment summaries; all four lists are appended to together, so entry
# i of each list describes the same experiment.
num_features = []
num_nonzero_features = []
mean_metrics = []
experiment_data = []

# Directory containing experiment JSON files
directory = 'dt_grid_search_results'
print("Starting to process JSON files...")

# Read and process each JSON file.
# os.listdir() returns names in arbitrary order; iterating in sorted order
# keeps `experiment_data` identical from run to run, so any later ordering
# that falls back on list position for ties is reproducible.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.json'):
        continue

    # Only the parsing needs the file handle; release it before processing.
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
        data = json.load(f)

    feature_importance = data["feature_importances"]
    total_features = len(feature_importance)
    # A feature counts as "used" when its importance is strictly positive.
    non_zero_features = sum(1 for v in feature_importance.values() if v > 0)
    mean_score = np.mean(data["cross_validation_scores"])

    num_features.append(total_features)
    num_nonzero_features.append(non_zero_features)
    mean_metrics.append(mean_score)
    experiment_data.append(data)

print("Processing complete.")

Starting to process JSON files...
Processing complete.


In [3]:
# Experiments whose mean CV score reaches the 90th percentile form the top 10%
top_10_percent_threshold = np.percentile(mean_metrics, 90)
top_experiments_indices = np.flatnonzero(
    np.asarray(mean_metrics) >= top_10_percent_threshold
)
top_experiments = [experiment_data[idx] for idx in top_experiments_indices]

# Accumulator mapping feature name -> score
feature_scores = dict()

# Best mean cross-validation score first
sorted_top_experiments = list(top_experiments)
sorted_top_experiments.sort(
    key=lambda exp: np.mean(exp['cross_validation_scores']),
    reverse=True,
)

In [4]:
# Show the selected experiments, best mean cross-validation score first
sorted_top_experiments

[{'experiment_id': 184,
  'used_features': ['ch1_1Hz',
   'ch1_2Hz',
   'ch1_3Hz',
   'ch1_4Hz',
   'ch1_5Hz',
   'ch1_6Hz',
   'ch1_7Hz',
   'ch1_8Hz',
   'ch1_9Hz',
   'ch1_10Hz',
   'ch1_11Hz',
   'ch1_12Hz',
   'ch1_13Hz',
   'ch1_14Hz',
   'ch1_15Hz',
   'ch1_16Hz',
   'ch1_17Hz',
   'ch1_20Hz',
   'ch1_21Hz',
   'ch1_22Hz',
   'ch1_23Hz',
   'ch1_24Hz',
   'ch1_25Hz',
   'ch1_26Hz',
   'ch1_27Hz',
   'ch1_28Hz',
   'ch1_29Hz',
   'ch1_30Hz',
   'ch1_31Hz',
   'ch1_32Hz',
   'ch1_33Hz',
   'ch1_34Hz',
   'ch1_35Hz',
   'ch1_37Hz',
   'ch1_38Hz',
   'ch1_39Hz',
   'ch1_40Hz',
   'ch1_41Hz',
   'ch1_42Hz',
   'ch1_43Hz',
   'ch1_44Hz',
   'ch1_45Hz',
   'ch1_46Hz',
   'ch1_48Hz',
   'ch1_49Hz',
   'ch1_50Hz',
   'ch1_52Hz',
   'ch1_53Hz',
   'ch1_54Hz',
   'ch1_55Hz',
   'ch1_57Hz',
   'ch1_58Hz',
   'ch1_60Hz',
   'ch1_61Hz',
   'ch1_62Hz',
   'ch1_64Hz',
   'ch1_65Hz',
   'ch1_66Hz',
   'ch1_68Hz',
   'ch1_69Hz',
   'ch1_70Hz',
   'ch1_71Hz',
   'ch1_72Hz',
   'ch1_73Hz',
   'ch1_

In [5]:
# Number of experiments that made it into the top-scoring subset
len(sorted_top_experiments)

135

In [6]:
# Start from an empty accumulator so that re-running this cell recomputes the
# scores instead of adding a second pass on top of the previous totals.
feature_scores = {}

for rank, experiment in enumerate(sorted_top_experiments, start=1):
    cv_scores = experiment['cross_validation_scores']
    importances = experiment['feature_importances']

    # An experiment without features has nothing to score, and its feature
    # ratio (C below) would divide by zero.
    if not importances:
        continue

    mean_cv_score = np.mean(cv_scores)
    max_cv_score = max(cv_scores)
    num_non_zero_features = sum(1 for value in importances.values() if value > 0)
    # Deliberately not called `num_features`: that name is the per-experiment
    # list filled by the loading cell and must not be rebound to an int here.
    experiment_feature_count = len(importances)

    # Experiment-level terms, shared by every feature of this experiment:
    #   B - mean CV score relative to the experiment's best fold
    #   C - fraction of the experiment's features with non-zero importance
    #   D - reciprocal of the experiment's rank (1 = best mean CV score)
    B = mean_cv_score / max_cv_score
    C = num_non_zero_features / experiment_feature_count
    D = 1 / rank

    # Iterate over each feature in the experiment.
    # NOTE(review): feature_rank is the feature's position in the stored dict;
    # it only reflects importance order if the JSON was written sorted by
    # importance - confirm against the code that produced the files.
    for feature_rank, (feature, importance) in enumerate(importances.items(), start=1):
        # A gates the score: features with zero importance contribute 0
        A = 1 if importance > 0 else 0
        # E - reciprocal of the feature's position within the experiment
        E = 1 / feature_rank

        # Score is the mean of the four terms, zeroed for unused features
        score = A * ((B + C + D + E) / 4)

        # Zero scores are still added so every seen feature gets an entry
        feature_scores[feature] = feature_scores.get(feature, 0) + score

In [7]:
# One row per feature, highest accumulated score first, with a fresh 0..n-1 index
final_ranking_df = (
    pd.DataFrame(list(feature_scores.items()), columns=['Feature', 'Score'])
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
)

final_ranking_df

Unnamed: 0,Feature,Score
0,ch3_220Hz,3.335106
1,ch2_66Hz,3.113225
2,ch3_314Hz,3.023734
3,ch2_438Hz,2.913382
4,ch1_21Hz,2.654439
...,...,...
1344,ch3_172Hz,0.000000
1345,ch3_210Hz,0.000000
1346,ch3_246Hz,0.000000
1347,ch3_267Hz,0.000000


In [8]:
# Summary statistics of the ranking table's Score column
final_ranking_df.describe()

Unnamed: 0,Score
count,1349.0
mean,0.848732
std,0.551248
min,0.0
25%,0.491365
50%,0.783654
75%,1.200467
max,3.335106


In [25]:
# Features whose accumulated score is at most 1.0
final_ranking_df.loc[final_ranking_df['Score'].le(1.0)]

Unnamed: 0,Feature,Score
522,ch2_414Hz,0.999625
523,ch1_143Hz,0.999425
524,ch2_408Hz,0.996819
525,ch3_356Hz,0.994502
526,ch1_356Hz,0.993765
...,...,...
1344,ch3_172Hz,0.000000
1345,ch3_210Hz,0.000000
1346,ch3_246Hz,0.000000
1347,ch3_267Hz,0.000000


In [9]:
# Features whose accumulated score exceeds 1.0
final_ranking_df.loc[final_ranking_df['Score'].gt(1.0)]

Unnamed: 0,Feature,Score
0,ch3_220Hz,3.335106
1,ch2_66Hz,3.113225
2,ch3_314Hz,3.023734
3,ch2_438Hz,2.913382
4,ch1_21Hz,2.654439
...,...,...
517,ch2_180Hz,1.005529
518,ch1_430Hz,1.003865
519,ch1_421Hz,1.003399
520,ch3_437Hz,1.001641


In [10]:
# Features whose accumulated score exceeds the 2.42 cut-off
final_ranking_df.loc[final_ranking_df['Score'].gt(2.42)]

Unnamed: 0,Feature,Score
0,ch3_220Hz,3.335106
1,ch2_66Hz,3.113225
2,ch3_314Hz,3.023734
3,ch2_438Hz,2.913382
4,ch1_21Hz,2.654439
5,ch3_408Hz,2.608823
6,ch3_340Hz,2.53927
7,ch3_6Hz,2.514012
8,ch2_366Hz,2.491762
9,ch1_288Hz,2.425496


In [27]:
# Features whose accumulated score is at least 3.0
final_ranking_df.loc[final_ranking_df['Score'].ge(3.0)]

Unnamed: 0,Feature,Score
0,ch3_220Hz,3.335106
1,ch2_66Hz,3.113225
2,ch3_314Hz,3.023734


In [37]:
import plotly.figure_factory as ff

# Scores of all features with a strictly positive total
ranks = final_ranking_df.loc[final_ranking_df['Score'] > 0.0, 'Score'].tolist()

fig = ff.create_distplot(
    [ranks],
    group_labels=['Metric Values'],
    show_hist=True,
    show_curve=True,
    show_rug=False,
    bin_size=0.1,
)

# Blue markers for the first trace, red line for the second trace
fig.data[0].marker.color = 'blue'
fig.data[1].line.color = 'red'

# Titles, figure size and legend visibility in a single layout update
fig.update_layout(
    title='Distribution of Rank among EMG features',
    title_x=0.46,
    xaxis_title='Channel-Frequency Rank',
    yaxis_title='Density',
    width=600,
    height=400,
    showlegend=False,
)

fig.show()

In [29]:
# Preview the ten best-scoring features. `final_ranking_df` is already sorted
# by Score in descending order, so the first ten rows are the top ten.
# This reads from the ranking table directly because `df` is only assigned by
# the bar-chart cell further down, so referencing it here raises NameError on
# a fresh kernel.
final_ranking_df.head(10)

Unnamed: 0,Feature,Score
0,ch3_220Hz,3.335106
1,ch2_66Hz,3.113225
2,ch3_314Hz,3.023734
3,ch2_438Hz,2.913382
4,ch1_21Hz,2.654439
5,ch3_408Hz,2.608823
6,ch3_340Hz,2.53927
7,ch3_6Hz,2.514012
8,ch2_366Hz,2.491762
9,ch1_288Hz,2.425496


In [44]:
import plotly.express as px

# Select the ten highest scores directly instead of through a hand-tuned score
# cut-off, so the chart keeps showing exactly ten features when the scores
# change. Rows are then ordered by ascending score - presumably so the best
# feature ends up at the top of the horizontal chart.
df = final_ranking_df.nlargest(10, 'Score').sort_values(by='Score', ascending=True)

fig = px.bar(df, x="Score", y="Feature", orientation='h')

# Single bar trace; colour all bars blue
fig.data[0].marker.color = 'blue'

# Titles, figure size and legend visibility in a single layout update
fig.update_layout(
    title='10 best-ranking features',
    title_x=0.46,
    xaxis_title='Channel-Frequency Rank',
    yaxis_title='Features',
    width=480,
    height=333,
    showlegend=False,
)

fig.show()