In [None]:
# Feature engineering

In [12]:
import os
import numpy as np
import pandas as pd

# Paths
base_dir = "Network FC matrices"
scores_file = "behavioral_no_missing/general_psychopathology_scores_ids.csv"

# Networks: directory name -> short tag embedded in each matrix file name
networks = {
    "default mode network": "default",
    "dorsal attention network": "dorsal",
    "frontoparietal control network": "frontoparietal",
    "limbic network": "limbic",
    "salience network": "salience"
}
# Tasks: task label -> run directories whose values are averaged for that task
tasks = {
    "Resting-State": ["rest_AP_run01", "rest_AP_run02", "rest_PA_run01", "rest_PA_run02"],
    "Hammer": ["hammer_AP_run01"],
    "Stroop": ["stroop_AP_run01", "stroop_PA_run01"]
}

# Load general psychopathology scores
scores_df = pd.read_csv(scores_file)

# Output layout: participant ID, one feature per task/network pair, then the score
columns = ["participant_id"] + [f"{task}_{network}" for task in tasks for network in networks] + ["general_psychopathology"]

# One record per participant; the DataFrame is built once after the loop,
# because growing it with pd.concat inside the loop recopies every earlier row.
participant_rows = []
# Matrix files whose NaN entries were zeroed, kept so the substitution stays visible
files_with_nan = []

for participant_id, psychopathology_score in zip(scores_df["participant_id"], scores_df["general_psychopathology"]):
    participant_row = {"participant_id": participant_id}

    for task_name, runs in tasks.items():
        for network_full, network_short in networks.items():
            network_dir = os.path.join(base_dir, network_full)

            # Average degree strength of every run that has a matrix file on disk
            task_matrices = []
            for run in runs:
                file_path = os.path.join(network_dir, run, f"{participant_id}_{network_short}_FC_matrix.npy")
                if not os.path.exists(file_path):
                    continue

                fc_matrix = np.load(file_path)
                if np.isnan(fc_matrix).any():
                    # A single NaN would propagate through the sums below and blank
                    # the whole feature, so NaN entries are counted as zero.
                    files_with_nan.append(file_path)
                    fc_matrix = np.nan_to_num(fc_matrix, nan=0.0)

                degree_strength = np.sum(fc_matrix, axis=1)  # one row sum per matrix row
                task_matrices.append(np.mean(degree_strength))

            # Average across runs; NaN marks a task/network with no files at all
            participant_row[f"{task_name}_{network_full}"] = np.mean(task_matrices) if task_matrices else np.nan

    # Add general psychopathology score
    participant_row["general_psychopathology"] = psychopathology_score
    participant_rows.append(participant_row)

results_df = pd.DataFrame(participant_rows, columns=columns)

# Report every file where NaN entries were zeroed
if files_with_nan:
    print(f"NaN entries were replaced with 0 in {len(files_with_nan)} matrix file(s):")
    for nan_file in files_with_nan:
        print(f"  {nan_file}")

# Display the DataFrame for confirmation
print(results_df.head())


     participant_id  Resting-State_default mode network  \
0  NDAR_INVDW733XXB                           13.646667   
1  NDAR_INVEV975LY3                           13.201064   
2  NDAR_INVKV870NBK                            8.893670   
3  NDAR_INVXJ707NAE                           17.706718   
4  NDAR_INVWD467AR0                           13.711805   

   Resting-State_dorsal attention network  \
0                               13.059157   
1                                9.820026   
2                               11.173023   
3                               10.484619   
4                               11.928479   

   Resting-State_frontoparietal control network  Resting-State_limbic network  \
0                                      8.299604                      1.761055   
1                                      7.776814                      1.764223   
2                                      9.898683                      1.703379   
3                                      7.932515   

In [13]:
import numpy as np

# Base directory and the single participant whose Stroop values are recomputed
base_dir = "Network FC matrices"
participant_id = "NDAR_INVHW100CDA"

# Networks to recompute (directory name -> file-name tag) and the Stroop runs
networks_to_fix = {
    "frontoparietal control network": "frontoparietal",
    "limbic network": "limbic"
}
stroop_runs = ["stroop_AP_run01", "stroop_PA_run01"]

# Recompute each network's Stroop value with NaN entries zeroed
for network_full, network_short in networks_to_fix.items():
    task_matrices = []

    for run in stroop_runs:
        file_path = os.path.join(base_dir, network_full, run, f"{participant_id}_{network_short}_FC_matrix.npy")
        print(f"Checking file: {file_path}")

        # Guard clause: report an absent file and move on to the next run
        if not os.path.exists(file_path):
            print(f"File missing: {file_path}")
            continue

        try:
            fc_matrix = np.load(file_path)
            print(f"Matrix shape: {fc_matrix.shape}, Min: {np.nanmin(fc_matrix)}, Max: {np.nanmax(fc_matrix)}")

            # Zero out NaN entries so they do not propagate through the sums
            fc_matrix = np.nan_to_num(fc_matrix, nan=0.0)

            # Row sums of the matrix, then their mean for this run
            degree_strength = fc_matrix.sum(axis=1)
            average_degree_strength = degree_strength.mean()
            task_matrices.append(average_degree_strength)
            print(f"Degree strength for {run} in {network_full}: {average_degree_strength}")
        except Exception as e:
            print(f"Error loading file {file_path}: {e}")

    # Nothing could be loaded for this network: report it and skip the update
    if not task_matrices:
        print(f"No data available for {network_full}. Cannot compute average.")
        continue

    # Average across the runs that loaded successfully
    average_value = np.mean(task_matrices)
    print(f"Computed average for {network_full}: {average_value}")

    # Write the value into this participant's row of the results table
    column_name = f"Stroop_{network_full}"
    results_df.loc[results_df["participant_id"] == participant_id, column_name] = average_value


Checking file: Network FC matrices\frontoparietal control network\stroop_AP_run01\NDAR_INVHW100CDA_frontoparietal_FC_matrix.npy
Matrix shape: (61, 61), Min: -0.619330883026123, Max: 1.0
Degree strength for stroop_AP_run01 in frontoparietal control network: 7.262156009674072
Checking file: Network FC matrices\frontoparietal control network\stroop_PA_run01\NDAR_INVHW100CDA_frontoparietal_FC_matrix.npy
Matrix shape: (61, 61), Min: -0.97288978099823, Max: 1.0
Degree strength for stroop_PA_run01 in frontoparietal control network: 6.50901460647583
Computed average for frontoparietal control network: 6.885585308074951
Checking file: Network FC matrices\limbic network\stroop_AP_run01\NDAR_INVHW100CDA_limbic_FC_matrix.npy
Matrix shape: (50, 50), Min: -0.713406503200531, Max: 1.0
Degree strength for stroop_AP_run01 in limbic network: 1.655250906944275
Checking file: Network FC matrices\limbic network\stroop_PA_run01\NDAR_INVHW100CDA_limbic_FC_matrix.npy
Matrix shape: (50, 50), Min: -0.9763019084

In [14]:
# Overwrite two Stroop values for one participant with hard-coded numbers
# (given to four decimal places).
manual_stroop_values = {
    "Stroop_frontoparietal control network": 6.8856,
    "Stroop_limbic network": 2.1755,
}
patched_participant_mask = results_df["participant_id"] == "NDAR_INVHW100CDA"
for patched_column, patched_value in manual_stroop_values.items():
    results_df.loc[patched_participant_mask, patched_column] = patched_value

print("Missing values successfully updated in results_df.")


Missing values successfully updated in results_df.


In [18]:
# Cast the two manually patched Stroop features to float32.
# Either column name is accepted (with or without the "_AVG" suffix), so the cell
# works whether it runs before or after the network columns are renamed.
for patched_feature in ("Stroop_frontoparietal control network", "Stroop_limbic network"):
    present_names = [
        name
        for name in (f"{patched_feature}_AVG", patched_feature)
        if name in results_df.columns
    ]
    if not present_names:
        # Keep failing loudly (same exception type as a direct column lookup)
        raise KeyError(
            f"Neither '{patched_feature}_AVG' nor '{patched_feature}' is a column of results_df"
        )
    for name in present_names:
        results_df[name] = results_df[name].astype("float32")

In [15]:
# Store the 'general_psychopathology' score as float32
results_df = results_df.astype({"general_psychopathology": "float32"})

# Move the ID and the score to the front; every other column keeps its relative order
leading_columns = ["participant_id", "general_psychopathology"]
columns_order = leading_columns + [col for col in results_df.columns if col not in leading_columns]
results_df = results_df[columns_order]

# Show the first rows of the reordered table
results_df.head()


Unnamed: 0,participant_id,general_psychopathology,Resting-State_default mode network,Resting-State_dorsal attention network,Resting-State_frontoparietal control network,Resting-State_limbic network,Resting-State_salience network,Hammer_default mode network,Hammer_dorsal attention network,Hammer_frontoparietal control network,Hammer_limbic network,Hammer_salience network,Stroop_default mode network,Stroop_dorsal attention network,Stroop_frontoparietal control network,Stroop_limbic network,Stroop_salience network
0,NDAR_INVDW733XXB,96.92334,13.646667,13.059157,8.299604,1.761055,6.146852,17.280886,17.249392,8.036358,1.913847,6.567103,14.991718,13.020707,7.454109,1.793485,8.307344
1,NDAR_INVEV975LY3,119.80352,13.201064,9.820026,7.776814,1.764223,6.102838,11.745605,3.755147,5.996871,2.045903,4.768569,10.988958,8.801472,5.517537,1.613445,5.476304
2,NDAR_INVKV870NBK,113.809578,8.89367,11.173023,9.898683,1.703379,5.638009,11.052229,10.302407,8.655916,1.646078,3.278213,11.536163,9.989977,8.651979,1.370737,3.515229
3,NDAR_INVXJ707NAE,76.470001,17.706718,10.484619,7.932515,1.282942,7.967196,12.004824,12.015065,7.308599,1.365117,4.770614,13.422506,9.964056,4.543045,1.230238,4.777905
4,NDAR_INVWD467AR0,85.819786,13.711805,11.928479,9.333899,2.38388,9.843979,14.002593,9.433897,9.235421,1.54125,4.534468,15.520726,11.021715,7.944976,1.731834,6.728717


In [16]:
# Append '_AVG' to every column whose name contains 'network'.
# Columns already ending in '_AVG' are skipped, so re-running this cell does not
# stack the suffix ('..._AVG_AVG').
results_df = results_df.rename(
    columns={
        col: f"{col}_AVG"
        for col in results_df.columns
        if "network" in col and not col.endswith("_AVG")
    }
)

In [19]:
# Look up the per-column dtypes once and reuse them for both reports
column_dtypes = results_df.dtypes

# Dtype of every column
print("Column DataTypes:")
print(column_dtypes)

# How many columns share each dtype
print("\nSummary of DataTypes:")
print(column_dtypes.value_counts())


Column DataTypes:
participant_id                                       object
general_psychopathology                             float32
Resting-State_default mode network_AVG              float32
Resting-State_dorsal attention network_AVG          float32
Resting-State_frontoparietal control network_AVG    float32
Resting-State_limbic network_AVG                    float32
Resting-State_salience network_AVG                  float32
Hammer_default mode network_AVG                     float32
Hammer_dorsal attention network_AVG                 float32
Hammer_frontoparietal control network_AVG           float32
Hammer_limbic network_AVG                           float32
Hammer_salience network_AVG                         float32
Stroop_default mode network_AVG                     float32
Stroop_dorsal attention network_AVG                 float32
Stroop_frontoparietal control network_AVG           float32
Stroop_limbic network_AVG                           float32
Stroop_salience networ

In [20]:
# Write the feature table to a CSV file, leaving out the row index
output_csv = "baseline_model_data.csv"
results_df.to_csv(output_csv, index=False)

# This is the first dataset ready to be fed into the models.

In [21]:
# Count missing values per column across the entire DataFrame
missing_counts = results_df.isnull().sum()

# Keep only the columns that actually contain missing values
missing_columns = missing_counts[missing_counts > 0]

# Display missing value information
print("Missing values per column:")
print(missing_columns)

# Total number of missing values: the sum of the per-column counts computed above
total_missing = missing_counts.sum()
print(f"\nTotal missing values in the DataFrame: {total_missing}")

# Percentage of missing values per column
missing_percentage = (missing_columns / len(results_df)) * 100
print("\nPercentage of missing values per column:")
print(missing_percentage)

# Identify rows with at least one missing value
rows_with_missing = results_df[results_df.isnull().any(axis=1)]

# Display participant IDs and corresponding columns with missing values.
# The loop uses its own variable names so it does not overwrite the
# `missing_columns` Series above or a notebook-level `participant_id`.
print("\nRows with missing values:")
for _, incomplete_row in rows_with_missing.iterrows():
    row_missing_columns = incomplete_row[incomplete_row.isnull()].index.tolist()
    print(f"Participant ID: {incomplete_row['participant_id']}, Missing Data in Columns: {row_missing_columns}")


Missing values per column:
Series([], dtype: int64)

Total missing values in the DataFrame: 0

Percentage of missing values per column:
Series([], dtype: float64)

Rows with missing values:
