In [None]:
import os
import pandas as pd
import re

In [None]:
# Function to process results directory and return a pivot table with feature importance
def process_results_directory(root_dir, meaningful_parts):
    """Build a pivot table of mean feature importance for features starting with 'M'.

    Every sub-directory of ``root_dir`` whose path *relative to* ``root_dir``
    contains 'ABS' or 'MutualInformation' is scanned for tab-separated ``.txt``
    files. Each file must provide a 'Feature' column and one or more columns
    whose name contains 'Feature_Importance'. Rows whose feature name starts
    with 'M' are collected, labelled with a method name derived from the
    directory name (with 'ABS_' removed) plus the importance-column suffix,
    and averaged per (feature, method).

    Parameters
    ----------
    root_dir : str
        Directory whose sub-directories are scanned. The directory itself is
        never treated as a method directory.
    meaningful_parts : dict
        Extra key/value pairs copied onto every record; its keys become
        additional index columns of the result.

    Returns
    -------
    pandas.DataFrame
        One row per feature (and ``meaningful_parts`` combination) and one
        column per method. Known methods appear in a fixed preferred order;
        any other method columns follow, sorted by name. An empty DataFrame
        is returned when no matching data is found.

    Notes
    -----
    Files that cannot be read or parsed are reported with ``print`` and
    skipped; they do not abort the scan.
    """
    # Preferred left-to-right order of the method columns in the output.
    preferred_method_order = [
        'ReliefF10', 'ReliefF10_ABS',
        'ReliefF', 'ReliefF_ABS',
        'MultiSURF', 'MultiSURF_ABS',
        'MultiSURFstar', 'MultiSURFstar_ABS',
        'MutualInformation',
    ]
    all_data = []  # One dict per (feature, method) observation

    for subdir, _dirs, files in os.walk(root_dir):
        if subdir == root_dir:  # Never treat the root itself as a method directory
            continue
        # Match on the path below root_dir only, so that an ancestor directory
        # that happens to contain 'ABS' does not make every sub-directory match.
        relative_subdir = os.path.relpath(subdir, root_dir)
        if 'ABS' not in relative_subdir and 'MutualInformation' not in relative_subdir:
            continue

        method = subdir.split(os.sep)[-1].replace('ABS_', '')  # Method name from the directory name
        for file in files:
            if not file.endswith('.txt'):  # Process only .txt files
                continue
            file_path = os.path.join(subdir, file)
            try:
                data = pd.read_csv(file_path, delimiter='\t')
                # na=False: rows with a missing feature name are simply excluded
                # instead of invalidating the boolean mask for the whole file.
                m_data = data[data['Feature'].str.startswith('M', na=False)]
                importance_cols = [col for col in data.columns if 'Feature_Importance' in col]

                file_entries = []
                for col in importance_cols:
                    importance_type = col.replace('Feature_Importance', '').strip()
                    # A plain 'Feature_Importance' column leaves a trailing '_' that is stripped
                    method_column_name = f"{method}_{importance_type}".rstrip('_')
                    for feature, importance in zip(m_data['Feature'], m_data[col]):
                        file_entries.append({
                            'Feature': feature,
                            'Method': method_column_name,
                            'Importance': importance,
                            **meaningful_parts,
                        })
            except Exception as e:
                # Best effort: report the problem and keep scanning the other files
                print(f"Error reading {file_path}: {e}")
            else:
                all_data.extend(file_entries)

    if not all_data:
        return pd.DataFrame()  # Nothing matched

    non_method_cols = ['Feature', *meaningful_parts.keys()]
    results_df = pd.DataFrame(all_data)
    pivot_df = results_df.pivot_table(
        index=non_method_cols,
        columns='Method',
        values='Importance',
        aggfunc='mean',
    ).reset_index()

    # Order only the method columns that actually exist: selecting the full
    # preferred list would raise KeyError as soon as one method is missing.
    method_cols = [col for col in pivot_df.columns if col not in non_method_cols]
    ordered_method_cols = [col for col in preferred_method_order if col in method_cols]
    ordered_method_cols += sorted(col for col in method_cols if col not in preferred_method_order)
    return pivot_df[non_method_cols + ordered_method_cols]

In [None]:
def extract_numeric_parts(s):
    """Return every run of digits found in ``s`` as a tuple of integers.

    The integers appear in the same order as in the string; a string without
    any digits yields an empty tuple.
    """
    numbers = []
    # Each regex match is one maximal run of consecutive digits
    for digit_run in re.findall(r'\d+', s):
        numbers.append(int(digit_run))
    return tuple(numbers)

In [None]:
def find_and_process_results(start_dir):
    """Process every 'Results' folder below ``start_dir`` and write summary CSVs.

    For each directory under ``start_dir`` that contains a 'Results'
    sub-directory, the path components between ``start_dir`` and that directory
    are inspected to derive descriptors ('X1' from a component containing
    'xor_', otherwise 'X2' from a component containing 'a_'). The 'Results'
    folder is then passed to ``process_results_directory``; a non-empty result
    is saved as 'M_average_feature_importance.csv' inside that folder.

    All per-folder tables are finally concatenated, sorted by the numeric parts
    of 'X1' and then by 'Feature', and written to
    'M_master_feature_importance.csv' in ``start_dir``.

    Parameters
    ----------
    start_dir : str
        Directory at which the recursive search starts.

    Returns
    -------
    None
        Results are written to disk and progress is reported with ``print``.
    """
    collected_frames = []  # Per-folder tables; concatenated once after the walk

    for root, dirs, _files in os.walk(start_dir):
        if 'Results' not in dirs:
            continue

        path_parts = os.path.relpath(root, start_dir).split(os.sep)
        meaningful_parts = {}
        # Will need to update this for new descriptors if needed.
        # NOTE(review): both tests are substring matches, so any non-'xor_'
        # component containing 'a_' (e.g. 'data_1') is stored as X2 — confirm
        # against the directory naming scheme.
        for part in path_parts:
            if 'xor_' in part:
                meaningful_parts['X1'] = part  # Order of epistasis with additional descriptors
            elif 'a_' in part:
                meaningful_parts['X2'] = part  # Feature count

        results_dir = os.path.join(root, 'Results')
        print(f"Processing 'Results' folder at: {results_dir}")
        results_df = process_results_directory(results_dir, meaningful_parts)
        if results_df.empty:
            continue

        csv_path = os.path.join(results_dir, 'M_average_feature_importance.csv')
        results_df.to_csv(csv_path, index=False)
        print(f"Average feature importance results saved to {csv_path}")
        collected_frames.append(results_df)

    if not collected_frames:
        return

    # Concatenating once avoids re-copying the growing table on every folder
    master_df = pd.concat(collected_frames, ignore_index=True)

    if 'X1' in master_df.columns:
        # Folders without an 'xor_' component have no X1 value (NaN after the
        # concat); they get an empty tuple, which sorts before any numbers.
        master_df['X1_numeric'] = master_df['X1'].apply(
            lambda value: extract_numeric_parts(value) if isinstance(value, str) else ()
        )
        master_df = master_df.sort_values(by=['X1_numeric', 'Feature'])
        master_df = master_df.drop(columns='X1_numeric')  # Auxiliary sort key only
    else:
        # No folder provided an X1 descriptor: fall back to sorting by feature
        master_df = master_df.sort_values(by='Feature', kind='stable')

    master_csv_path = os.path.join(start_dir, 'M_master_feature_importance.csv')
    master_df.to_csv(master_csv_path, index=False)
    print(f"Master feature importance results saved to {master_csv_path}")

In [None]:
# Kick off the scan, using the notebook's current working directory as the search root
find_and_process_results(start_dir='.')