In [1]:
ls

Untitled.ipynb               top_25_features_RF.csv
top_25_features_LDA.csv      top_25_features_SVM.csv
top_25_features_LR.csv       top_25_features_XGBoost.csv


In [2]:
import os
import pandas as pd
from collections import Counter

In [36]:
# Folder that is scanned for the per-model feature CSV files.
# The default is the original author's local path; set the TOP_FEATURES_DIR
# environment variable to point the notebook at another location without
# editing this cell.
folder_path = os.environ.get(
    'TOP_FEATURES_DIR',
    '/Users/nafi/Desktop/SmartHealth Research/my_data/ML Pipeline/Top 25 Features/Datasets',
)

In [38]:
# Build a cross-file summary of the features listed in the CSV files found in
# `folder_path`.
#
# Each CSV that has the columns 'feature', 'type_OS' and 'type_MN' contributes
# its rows. The result, `final_df`, has one row per distinct feature with:
#   - type_OS / type_MN : values taken from the first row in which the feature
#                         was seen (files are visited in sorted name order)
#   - frequency         : number of rows, across all files, naming the feature
#   - source            : comma-separated, sorted file names (without '.csv')
# Rows are ordered by descending frequency; ties keep first-seen order.

required_columns = ['feature', 'type_OS', 'type_MN']
output_columns = ['feature', 'type_OS', 'type_MN', 'frequency', 'source']
csv_suffix = '.csv'

# Counter holding how many rows mention each feature
feature_counter = Counter()

# Per-feature metadata: first-seen type_OS / type_MN plus the set of source files
feature_info = {}

# CSV files ignored because at least one required column was missing
skipped_files = []

# os.listdir() returns names in arbitrary order; sorting makes the "first seen"
# metadata and the order of tied rows reproducible from run to run.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(csv_suffix):
        continue

    df = pd.read_csv(os.path.join(folder_path, file_name))
    if not set(required_columns).issubset(df.columns):
        skipped_files.append(file_name)
        continue

    # Strip only the trailing extension; str.replace would also remove any
    # '.csv' occurring elsewhere in the file name.
    source_name = file_name[:-len(csv_suffix)]

    # Iterating the three columns directly avoids the per-row Series that
    # iterrows() builds (and the dtype upcasting that comes with it).
    for raw_feature, type_OS, type_MN in zip(df['feature'], df['type_OS'], df['type_MN']):
        feature = str(raw_feature)  # feature names are always handled as strings
        if feature not in feature_info:
            feature_info[feature] = {
                'type_OS': type_OS,
                'type_MN': type_MN,
                'source': set(),  # a set, so a file is listed once per feature
            }
        feature_info[feature]['source'].add(source_name)
        feature_counter[feature] += 1

# Make skipped inputs visible instead of dropping them silently
if skipped_files:
    print(f"Skipped {len(skipped_files)} CSV file(s) missing required columns: "
          f"{', '.join(skipped_files)}")

# One record per feature, in first-seen order
data = [
    {
        'feature': feature,
        'type_OS': feature_info[feature]['type_OS'],
        'type_MN': feature_info[feature]['type_MN'],
        'frequency': frequency,
        'source': ', '.join(sorted(feature_info[feature]['source'])),
    }
    for feature, frequency in feature_counter.items()
]

# Passing `columns` fixes the column order and keeps the frame well-formed
# (no KeyError) when no usable CSV was found and `data` is empty.
# A stable sort keeps tied features in first-seen order.
final_df = (
    pd.DataFrame(data, columns=output_columns)
    .sort_values(by='frequency', ascending=False, kind='mergesort')
    .reset_index(drop=True)
)

In [39]:
# Show the aggregated feature table (sorted by frequency, highest first).
final_df

Unnamed: 0,feature,type_OS,type_MN,frequency,source
0,SCAU7,Self Report,Non Motor,3,"LDA, LR, SVM"
1,FNCDTCOG,Self Report,Non Motor,3,"LDA, LR, SVM"
2,SCAU26C,Self Report,Non Motor,3,"LDA, LR, SVM"
3,NP3RISNG,Objective,Motor,3,"LDA, LR, RF"
4,NP3KTRMR,Objective,Motor,3,"LDA, LR, XGBoost"
5,NP3KTRML,Objective,Motor,3,"LDA, LR, SVM"
6,SLPINJUR,Self Report,Non Motor,3,"LDA, LR, SVM"
7,NP3TTAPR,Objective,Motor,2,"RF, XGBoost"
8,NP3RIGRU,Objective,Motor,2,"RF, XGBoost"
9,NP3RIGLU,Objective,Motor,2,"RF, XGBoost"


In [41]:
# Write the summary table to a CSV in the current working directory;
# the DataFrame's row index is not written to the file.
final_df.to_csv(path_or_buf='top_feature_frequencies.csv', index=False)