# 2/4: Data preprocessing 1
By Niloufar Shahdoust (niloufar.shahdoust@utah.edu)

In [1]:
import os
import mat73
import numpy as np
import pandas as pd
from matplotlib import cm
from ast import literal_eval
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from visbrain.objects import BrainObj, SceneObj, SourceObj 

In [2]:
input_folder = '1_brain_visualization_data_retrieval'
output_folder = '2_brain_visualization_preProcessing_1'

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)


def parse_coordinates(coord):
    """Parse one 'MNI_Coordinates' cell into a list of floats.

    Parameters
    ----------
    coord : object
        Raw cell value, expected to be the text of a Python list literal
        such as ``"[12.5, -3.0, 40.1]"``.

    Returns
    -------
    list of float or None
        The parsed numbers converted to ``float``, or ``None`` when the cell
        cannot be used: it is not a list literal, it has fewer than three
        elements, or an element is not a real number.  Text such as
        ``"[nan, nan, nan]"`` is not a valid literal for ``literal_eval`` and
        therefore also yields ``None``.
    """
    try:
        parsed = literal_eval(coord)
    except (ValueError, SyntaxError, TypeError):
        # Missing cells (float NaN), bare names like nan, and broken text.
        return None

    # Fewer than three entries would raise IndexError when x/y/z are
    # extracted later, so reject those rows here instead.
    if not isinstance(parsed, list) or len(parsed) < 3:
        return None

    # Accept ints as well as floats (e.g. "[10, -20, 30]"); bool is excluded
    # explicitly because it is a subclass of int.
    for value in parsed:
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return None

    return [float(value) for value in parsed]


def preprocess_dataframe(df):
    """Return a cleaned copy of one table; the input frame is not modified.

    Step 1 (only if an 'Nmm_atlas' column exists): strip whitespace, drop rows
    whose label contains 'Cerebral White Matter' or 'Unknown'
    (case-insensitive), remove the leading ``['`` and trailing ``']``, and add
    an 'area' column holding the label without a leading 'Left ' / 'Right '.

    Step 2 (only if an 'MNI_Coordinates' column exists): parse each cell with
    ``parse_coordinates``, drop rows that cannot be parsed, add
    'coordinate_x', 'coordinate_y' and 'coordinate_z' from the first three
    values, and drop the 'MNI_Coordinates' column.
    """
    # Step 1: Process 'Nmm_atlas' column
    if 'Nmm_atlas' in df.columns:
        atlas = df['Nmm_atlas'].str.strip()  # Remove extra spaces
        keep = ~atlas.str.contains(
            'Cerebral White Matter|Unknown', case=False, na=False
        )

        # Explicit copy: new columns are assigned below, and writing into a
        # filtered view is what triggers pandas' SettingWithCopyWarning.
        df = df.loc[keep].copy()

        # Remove [' and '] from elements
        atlas = atlas.loc[keep].str.replace(r"^\['|'\]$", '', regex=True)
        df['Nmm_atlas'] = atlas

        # Create 'area' column by removing 'Left' or 'Right' from 'Nmm_atlas'
        df['area'] = atlas.str.replace(r'^(Left|Right)\s+', '', regex=True)

    # Step 2: Process 'MNI_Coordinates' column
    if 'MNI_Coordinates' in df.columns:
        parsed = df['MNI_Coordinates'].apply(parse_coordinates)
        valid = parsed.notna()

        df = df.loc[valid].copy()
        parsed = parsed.loc[valid]

        # Split the parsed coordinates into separate columns
        axis_columns = ('coordinate_x', 'coordinate_y', 'coordinate_z')
        for axis, column in enumerate(axis_columns):
            df[column] = [xyz[axis] for xyz in parsed]

        # Drop the original column
        df = df.drop(columns=['MNI_Coordinates'])

    return df


def preprocess_folder(source_dir, target_dir):
    """Clean every ``.csv`` file in ``source_dir`` and write it to ``target_dir``.

    Each output file keeps the name of its input file.  Returns the list of
    file names that were written, in processing (sorted) order.
    """
    os.makedirs(target_dir, exist_ok=True)

    # Get a list of all CSV files in the input folder
    csv_files = sorted(f for f in os.listdir(source_dir) if f.endswith('.csv'))

    for file_name in csv_files:
        df = pd.read_csv(os.path.join(source_dir, file_name))
        cleaned = preprocess_dataframe(df)
        # Save the modified dataframe to the output folder with the same name
        cleaned.to_csv(os.path.join(target_dir, file_name), index=False)

    return csv_files


# __name__ is '__main__' both in a notebook kernel and when run as a script,
# so the processing still happens here; importing the helpers does not run it.
if __name__ == '__main__':
    preprocess_folder(input_folder, output_folder)

# List files to confirm output
os.listdir(output_folder)


['201810.csv',
 '201811.csv',
 '201901.csv',
 '201902.csv',
 '201902r.csv',
 '201903.csv',
 '201905.csv',
 '201909.csv',
 '201910.csv',
 '201911.csv',
 '201913.csv',
 '201914.csv',
 '201915.csv',
 '202001.csv',
 '202002.csv',
 '202003.csv',
 '202004.csv',
 '202005.csv',
 '202006.csv',
 '202006u.csv',
 '202007.csv',
 '202008.csv',
 '202009.csv',
 '202011.csv',
 '202014.csv',
 '202015.csv',
 '202016.csv',
 '202105.csv',
 '202107.csv',
 '202110.csv',
 '202114.csv',
 '202117.csv',
 '202118.csv',
 '202201.csv',
 '202202.csv',
 '202205.csv',
 '202207.csv',
 '202208.csv',
 '202209.csv',
 '202212.csv',
 '202212b.csv',
 '202214.csv',
 '202215.csv',
 '202216.csv',
 '202217.csv',
 '202302.csv',
 '202306.csv',
 '202307.csv',
 '202308.csv',
 '202309.csv',
 '202311.csv',
 '202314a.csv',
 '202314b.csv',
 '202401.csv',
 '202405.csv',
 '202406.csv',
 '202407.csv',
 '202408.csv',
 '202409.csv',
 '202413a.csv',
 '202413b.csv',
 '202414.csv',
 '202417.csv',
 '202418.csv',
 '202418b.csv']