### Import necessary packages

In [1]:
import os
import fnmatch
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from skrebate import ReliefF # for reproducibility, need to install skrebate version 0.62
from skrebate import SURF
from skrebate import SURFstar
from skrebate import MultiSURF
from skrebate import MultiSURFstar
from skrebate import TuRF
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif

### Define function for directory maintenance

In [2]:
def ensure_dir(directory):
    """Create ``directory`` (and any missing parents) if it does not exist yet.

    Parameters
    ----------
    directory : str
        Path of the directory that must exist after the call.

    Raises
    ------
    FileExistsError
        If ``directory`` already exists but is not a directory.
    """
    # exist_ok=True makes the call a no-op for an existing directory and
    # removes the window between a separate existence check and the creation.
    os.makedirs(directory, exist_ok=True)

### Define functions for random shuffle

In [3]:
def random_shuffle(file_path, random_state=None):
    """Write a randomly ordered list of the feature names found in one dataset.

    The tab-separated file at ``file_path`` is read, every column name except
    ``Class`` is collected, the names are randomly permuted, and the result is
    written as a single ``Feature`` column to
    ``<dir of file_path>/Results/RandomShuffle/<base name>_RandShuffle.txt``.

    Parameters
    ----------
    file_path : str
        Path to a tab-separated dataset.
    random_state : int or None, optional
        Seed for the permutation. The default ``None`` draws from NumPy's
        global random state, exactly as before; pass an integer to get the
        same ordering on every run.

    Returns
    -------
    None
        If the file cannot be read, the error is printed and nothing is written.
    """
    # Read first so that an unreadable file does not leave an empty
    # Results/RandomShuffle directory behind.
    try:
        df = pd.read_csv(file_path, sep='\t')  # Assuming tab-separated values; adjust the separator as needed
    except Exception as e:  # deliberate best-effort: report and skip this file
        print(f"Error reading {file_path}: {e}")
        return

    # The 'Class' column is the label, not a feature, so it is left out of the shuffle.
    columns_to_shuffle = df.drop(columns='Class', errors='ignore').columns.tolist()

    # Use the global NumPy generator unless the caller asked for a seeded one.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    shuffled_columns = rng.permutation(columns_to_shuffle)
    new_df = pd.DataFrame(shuffled_columns, columns=['Feature'])

    # Results live next to the input file; exist_ok avoids a check-then-create race.
    results_dir = os.path.join(os.path.dirname(file_path), "Results", "RandomShuffle")
    os.makedirs(results_dir, exist_ok=True)

    # Name the output after the input file.
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    output_path = os.path.join(results_dir, f"{base_name}_RandShuffle.txt")

    new_df.to_csv(output_path, index=False, sep='\t')

def find_and_random_shuffle(root_dir):
    """Run ``random_shuffle`` on every '.txt' file inside an 'EDM' directory.

    ``root_dir`` is walked recursively; a directory qualifies when its own
    name (the last path component) contains 'EDM'.
    """
    for current_dir, _subdirs, entries in os.walk(root_dir):
        # Skip directories whose own name does not contain 'EDM'.
        if "EDM" not in os.path.basename(current_dir):
            continue
        text_files = (entry for entry in entries if entry.endswith('.txt'))
        for entry in text_files:
            random_shuffle(os.path.join(current_dir, entry))

### Define the root data directory (GAMETES_2.2_dev_peter_XOR)

In [3]:
# Root of the directory tree that the find_and_* functions in this notebook walk with os.walk.
root_dir = 'my_dir' # Change this to your parent directory path

### Run random shuffle process on directory

In [5]:
# Walk root_dir and write a randomly ordered feature list for each .txt file found in an 'EDM' directory.
# This is the Random Shuffle that will be presented in the paper.
find_and_random_shuffle(root_dir)

### Define functions for Mutual Information

In [6]:
def process_mutual_info(file_path, random_state=None):
    """Score the features of one dataset with mutual information and save the ranking.

    The tab-separated file at ``file_path`` is loaded, the ``Class`` column is
    used as the label and all other columns as features. Scores are computed
    on the training part of a ``train_test_split`` and written, sorted from
    highest to lowest, to
    ``<dir of file_path>/Results/MutualInformation/<base name>_MIResults.txt``.

    Parameters
    ----------
    file_path : str
        Path to a tab-separated dataset containing a ``Class`` column.
    random_state : int, RandomState instance or None, optional
        Forwarded to both ``train_test_split`` and ``mutual_info_classif``.
        The default ``None`` keeps the original unseeded behaviour; pass an
        integer to make the split and the scores repeatable.

    Raises
    ------
    KeyError
        If the file has no ``Class`` column.
    """
    # Load the genetic data
    genetic_data = pd.read_csv(file_path, sep='\t')  # Ensure correct delimiter is used

    # Separate features from labels once; the column names are reused below.
    feature_frame = genetic_data.drop('Class', axis=1)
    labels = genetic_data['Class'].values

    # Only the training part of the split is scored; the held-out rows are not used.
    X_train, _, y_train, _ = train_test_split(
        feature_frame.values, labels, random_state=random_state
    )

    # Apply Mutual Information
    mi_scores = mutual_info_classif(X_train, y_train, random_state=random_state)

    # Pair feature names with MI scores and rank them, best first.
    Results = pd.DataFrame(
        list(zip(feature_frame.columns, mi_scores)),
        columns=['Feature', 'Feature_Importance'],
    )
    Results = Results.sort_values(by='Feature_Importance', ascending=False)

    # Results are stored next to the input file.
    mi_dir = os.path.join(os.path.dirname(file_path), "Results", "MutualInformation")
    ensure_dir(mi_dir)

    # Name the output after the input file (extension removed).
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    Results.to_csv(os.path.join(mi_dir, f"{base_name}_MIResults.txt"), index=False, sep='\t')

def find_and_mutual_info(root_dir):
    """Run ``process_mutual_info`` on every '.txt' file inside an 'EDM' directory.

    ``root_dir`` is walked recursively; a directory qualifies when its own
    name (the last path component) contains 'EDM'.
    """
    for folder, _children, names in os.walk(root_dir):
        folder_name = os.path.basename(folder)
        if "EDM" not in folder_name:
            continue  # not a data directory
        # Collect the full paths of the .txt files in this directory, in listing order.
        targets = [os.path.join(folder, name) for name in names if name.endswith('.txt')]
        for target in targets:
            process_mutual_info(target)

### Run Mutual Information on directory

In [7]:
# Walk root_dir and compute Mutual Information scores for each .txt file found in an 'EDM' directory.
find_and_mutual_info(root_dir)

### Define functions for ReliefF with 10 NN

In [4]:
def process_relieff10(file_path, random_state=None):
    """Score the features of one dataset with ReliefF (10 neighbors) and save two rankings.

    The tab-separated file at ``file_path`` is loaded, the ``Class`` column is
    used as the label and all other columns as features. ReliefF is fitted on
    the training part of a ``train_test_split``. Two files are written under
    ``<dir of file_path>/Results``:

    * ``ReliefF10/<base name>_Results.txt`` - sorted by score, highest first;
    * ``ABS_ReliefF10/<base name>_ABSResults.txt`` - same scores plus an
      ``ABS_Feature_Importance`` column, sorted by absolute score.

    Parameters
    ----------
    file_path : str
        Path to a tab-separated dataset containing a ``Class`` column.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded split; pass an integer for a repeatable split.

    Raises
    ------
    KeyError
        If the file has no ``Class`` column.
    """
    # Load the genetic data
    genetic_data = pd.read_csv(file_path, sep='\t')

    # Separate features from labels once; the column names are reused below.
    feature_frame = genetic_data.drop('Class', axis=1)
    labels = genetic_data['Class'].values

    # Only the training part of the split is used; the held-out rows are not used.
    X_train, _, y_train, _ = train_test_split(
        feature_frame.values, labels, random_state=random_state
    )

    # Apply ReliefF
    fs = ReliefF(n_features_to_select=2, n_neighbors=10)
    fs.fit(X_train, y_train)

    # Pair feature names with their scores (unsorted).
    scores = pd.DataFrame(
        list(zip(feature_frame.columns, fs.feature_importances_)),
        columns=['Feature', 'Feature_Importance'],
    )

    # Ranking by signed score, and a second ranking by magnitude.
    Results = scores.sort_values(by='Feature_Importance', ascending=False)
    ABSResults = scores.assign(ABS_Feature_Importance=scores['Feature_Importance'].abs())
    ABSResults = ABSResults.sort_values(by='ABS_Feature_Importance', ascending=False)

    # Results are stored next to the input file.
    results_dir = os.path.join(os.path.dirname(file_path), "Results")
    relief_dir = os.path.join(results_dir, "ReliefF10")
    abs_relief_dir = os.path.join(results_dir, "ABS_ReliefF10")
    ensure_dir(relief_dir)
    ensure_dir(abs_relief_dir)

    # Name the outputs after the input file (extension removed).
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    Results.to_csv(os.path.join(relief_dir, f"{base_name}_Results.txt"), index=False, sep='\t')
    ABSResults.to_csv(os.path.join(abs_relief_dir, f"{base_name}_ABSResults.txt"), index=False, sep='\t')

def find_and_relieff10(root_dir):
    """Run ``process_relieff10`` on every '.txt' file inside an 'EDM' directory.

    ``root_dir`` is walked recursively; a directory qualifies when its own
    name (the last path component) contains 'EDM'.
    """
    for current_dir, _subdirs, entries in os.walk(root_dir):
        is_edm_dir = "EDM" in os.path.basename(current_dir)
        if not is_edm_dir:
            continue
        for entry in entries:
            if not entry.endswith('.txt'):
                continue  # only .txt files are datasets
            process_relieff10(os.path.join(current_dir, entry))

### Run ReliefF (10 NN) on directory

In [5]:
# Run through the directory and perform ReliefF 10 NN.
find_and_relieff10(root_dir)

### Define functions for ReliefF with 100 NN

In [8]:
def process_relieff(file_path, random_state=None):
    """Score the features of one dataset with ReliefF (100 neighbors) and save two rankings.

    The tab-separated file at ``file_path`` is loaded, the ``Class`` column is
    used as the label and all other columns as features. ReliefF is fitted on
    the training part of a ``train_test_split``. Two files are written under
    ``<dir of file_path>/Results``:

    * ``ReliefF/<base name>_Results.txt`` - sorted by score, highest first;
    * ``ABS_ReliefF/<base name>_ABSResults.txt`` - same scores plus an
      ``ABS_Feature_Importance`` column, sorted by absolute score.

    Parameters
    ----------
    file_path : str
        Path to a tab-separated dataset containing a ``Class`` column.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded split; pass an integer for a repeatable split.

    Raises
    ------
    KeyError
        If the file has no ``Class`` column.
    """
    # Load the genetic data
    genetic_data = pd.read_csv(file_path, sep='\t')

    # Separate features from labels once; the column names are reused below.
    feature_frame = genetic_data.drop('Class', axis=1)
    labels = genetic_data['Class'].values

    # Only the training part of the split is used; the held-out rows are not used.
    X_train, _, y_train, _ = train_test_split(
        feature_frame.values, labels, random_state=random_state
    )

    # Apply ReliefF
    fs = ReliefF(n_features_to_select=2, n_neighbors=100)
    fs.fit(X_train, y_train)

    # Pair feature names with their scores (unsorted).
    scores = pd.DataFrame(
        list(zip(feature_frame.columns, fs.feature_importances_)),
        columns=['Feature', 'Feature_Importance'],
    )

    # Ranking by signed score, and a second ranking by magnitude.
    Results = scores.sort_values(by='Feature_Importance', ascending=False)
    ABSResults = scores.assign(ABS_Feature_Importance=scores['Feature_Importance'].abs())
    ABSResults = ABSResults.sort_values(by='ABS_Feature_Importance', ascending=False)

    # Results are stored next to the input file.
    results_dir = os.path.join(os.path.dirname(file_path), "Results")
    relief_dir = os.path.join(results_dir, "ReliefF")
    abs_relief_dir = os.path.join(results_dir, "ABS_ReliefF")
    ensure_dir(relief_dir)
    ensure_dir(abs_relief_dir)

    # Name the outputs after the input file (extension removed).
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    Results.to_csv(os.path.join(relief_dir, f"{base_name}_Results.txt"), index=False, sep='\t')
    ABSResults.to_csv(os.path.join(abs_relief_dir, f"{base_name}_ABSResults.txt"), index=False, sep='\t')

def find_and_relieff(root_dir):
    """Run ``process_relieff`` on every '.txt' file inside an 'EDM' directory.

    ``root_dir`` is walked recursively; a directory qualifies when its own
    name (the last path component) contains 'EDM'.
    """
    for folder, _children, names in os.walk(root_dir):
        if "EDM" not in os.path.basename(folder):
            continue  # only directories named with 'EDM' are processed
        for name in filter(lambda candidate: candidate.endswith('.txt'), names):
            dataset_path = os.path.join(folder, name)
            process_relieff(dataset_path)


### Run ReliefF (100 NN) on directory

In [9]:
# Walk root_dir and run ReliefF with 100 nearest neighbors on each .txt file found in an 'EDM' directory.
find_and_relieff(root_dir)

### Define functions for MultiSURF

In [10]:
def process_MultiSurf(file_path, random_state=None):
    """Score the features of one dataset with MultiSURF and save two rankings.

    The tab-separated file at ``file_path`` is loaded, the ``Class`` column is
    used as the label and all other columns as features. MultiSURF is fitted
    on the training part of a ``train_test_split``. Two files are written
    under ``<dir of file_path>/Results``:

    * ``MultiSURF/<base name>_Results.txt`` - sorted by score, highest first;
    * ``ABS_MultiSURF/<base name>_ABSResults.txt`` - same scores plus an
      ``ABS_Feature_Importance`` column, sorted by absolute score.

    Parameters
    ----------
    file_path : str
        Path to a tab-separated dataset containing a ``Class`` column.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded split; pass an integer for a repeatable split.

    Raises
    ------
    KeyError
        If the file has no ``Class`` column.
    """
    # Load the genetic data
    genetic_data = pd.read_csv(file_path, sep='\t')

    # Separate features from labels once; the column names are reused below.
    feature_frame = genetic_data.drop('Class', axis=1)
    labels = genetic_data['Class'].values

    # Only the training part of the split is used; the held-out rows are not used.
    X_train, _, y_train, _ = train_test_split(
        feature_frame.values, labels, random_state=random_state
    )

    # Apply MultiSURF
    fs = MultiSURF()
    fs.fit(X_train, y_train)

    # Pair feature names with their scores (unsorted).
    scores = pd.DataFrame(
        list(zip(feature_frame.columns, fs.feature_importances_)),
        columns=['Feature', 'Feature_Importance'],
    )

    # Ranking by signed score, and a second ranking by magnitude.
    Results = scores.sort_values(by='Feature_Importance', ascending=False)
    ABSResults = scores.assign(ABS_Feature_Importance=scores['Feature_Importance'].abs())
    ABSResults = ABSResults.sort_values(by='ABS_Feature_Importance', ascending=False)

    # Results are stored next to the input file.
    results_dir = os.path.join(os.path.dirname(file_path), "Results")
    multisurf_dir = os.path.join(results_dir, "MultiSURF")
    abs_multisurf_dir = os.path.join(results_dir, "ABS_MultiSURF")
    ensure_dir(multisurf_dir)
    ensure_dir(abs_multisurf_dir)

    # Name the outputs after the input file (extension removed).
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    Results.to_csv(os.path.join(multisurf_dir, f"{base_name}_Results.txt"), index=False, sep='\t')
    ABSResults.to_csv(os.path.join(abs_multisurf_dir, f"{base_name}_ABSResults.txt"), index=False, sep='\t')

def find_and_MultiSurf(root_dir):
    """Run ``process_MultiSurf`` on every '.txt' file inside an 'EDM' directory.

    ``root_dir`` is walked recursively; a directory qualifies when its own
    name (the last path component) contains 'EDM'.
    """
    for current_dir, _subdirs, entries in os.walk(root_dir):
        # Skip directories whose own name does not contain 'EDM'.
        if "EDM" not in os.path.basename(current_dir):
            continue
        dataset_paths = [
            os.path.join(current_dir, entry)
            for entry in entries
            if entry.endswith('.txt')
        ]
        for dataset_path in dataset_paths:
            process_MultiSurf(dataset_path)

### Run MultiSURF on directory

In [11]:
# Walk root_dir and run MultiSURF on each .txt file found in an 'EDM' directory.
find_and_MultiSurf(root_dir)

### Define functions for MultiSURFstar

In [12]:
def process_MultiSurfstar(file_path, random_state=None):
    """Score the features of one dataset with MultiSURFstar and save two rankings.

    The tab-separated file at ``file_path`` is loaded, the ``Class`` column is
    used as the label and all other columns as features. MultiSURFstar is
    fitted on the training part of a ``train_test_split``. Two files are
    written under ``<dir of file_path>/Results``:

    * ``MultiSURFstar/<base name>_Results.txt`` - sorted by score, highest first;
    * ``ABS_MultiSURFstar/<base name>_ABSResults.txt`` - same scores plus an
      ``ABS_Feature_Importance`` column, sorted by absolute score.

    Parameters
    ----------
    file_path : str
        Path to a tab-separated dataset containing a ``Class`` column.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded split; pass an integer for a repeatable split.

    Raises
    ------
    KeyError
        If the file has no ``Class`` column.
    """
    # Load the genetic data
    genetic_data = pd.read_csv(file_path, sep='\t')

    # Separate features from labels once; the column names are reused below.
    feature_frame = genetic_data.drop('Class', axis=1)
    labels = genetic_data['Class'].values

    # Only the training part of the split is used; the held-out rows are not used.
    X_train, _, y_train, _ = train_test_split(
        feature_frame.values, labels, random_state=random_state
    )

    # Apply MultiSURFstar
    fs = MultiSURFstar()
    fs.fit(X_train, y_train)

    # Pair feature names with their scores (unsorted).
    scores = pd.DataFrame(
        list(zip(feature_frame.columns, fs.feature_importances_)),
        columns=['Feature', 'Feature_Importance'],
    )

    # Ranking by signed score, and a second ranking by magnitude.
    Results = scores.sort_values(by='Feature_Importance', ascending=False)
    ABSResults = scores.assign(ABS_Feature_Importance=scores['Feature_Importance'].abs())
    ABSResults = ABSResults.sort_values(by='ABS_Feature_Importance', ascending=False)

    # Results are stored next to the input file.
    results_dir = os.path.join(os.path.dirname(file_path), "Results")
    multisurfstar_dir = os.path.join(results_dir, "MultiSURFstar")
    abs_multisurfstar_dir = os.path.join(results_dir, "ABS_MultiSURFstar")
    ensure_dir(multisurfstar_dir)
    ensure_dir(abs_multisurfstar_dir)

    # Name the outputs after the input file (extension removed).
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    Results.to_csv(os.path.join(multisurfstar_dir, f"{base_name}_Results.txt"), index=False, sep='\t')
    ABSResults.to_csv(os.path.join(abs_multisurfstar_dir, f"{base_name}_ABSResults.txt"), index=False, sep='\t')

def find_and_MultiSurfstar(root_dir):
    """Run ``process_MultiSurfstar`` on every '.txt' file inside an 'EDM' directory.

    ``root_dir`` is walked recursively; a directory qualifies when its own
    name (the last path component) contains 'EDM'.
    """
    for folder, _children, names in os.walk(root_dir):
        folder_name = os.path.basename(folder)
        if "EDM" not in folder_name:
            continue  # not a data directory
        for name in names:
            if not name.endswith('.txt'):
                continue  # only .txt files are datasets
            process_MultiSurfstar(os.path.join(folder, name))

### Run MultiSURFstar on directory

In [13]:
# Run through the directory and perform MultiSURFstar.
find_and_MultiSurfstar(root_dir)