In [None]:
# Function: extract_eids
# Description:
#   On the RAP, imaging data is stored in the Bulk folder of a dispensed project.
#   Each image type/sequence per participant is a zip file with the naming
#   convention *{eid}_{field_id}_{instance_0}.zip* (e.g. 1025811_31016_2_0.zip).
#   For every requested field ID this function queries the project once, checks
#   that the field is a bulk imaging field (its first hit contains "Bulk"),
#   and appends one CSV row per zip file to the output file.
#
# Output columns:
#   filepath - path from "/Bulk/" up to and including ".zip"
#   param    - 4th "/"-separated piece of the listing line
#   eid, field_id, ins - first three "_"-separated parts of the zip file name
#
# Parameters:
#   $1 - output_file: The file where the results will be saved. The header is
#        written only when the file does not exist yet; rows are always appended.
#   $@ - field_ids: Field IDs to search and process. These should be bulk imaging fields.
#
# Returns:
#   1 when no output file is given, otherwise the status of the last command run.
#
# Usage:
#   extract_eids "output.csv" 20204 20254

extract_eids(){
  local output_file="$1"
  if [[ -z "$output_file" ]]; then
    echo "Error: output file is not provided."
    return 1
  fi
  shift
  local field_ids=("$@")
  # Declared local so the loop state does not leak into the notebook shell.
  local field_id listing first_line

  # Print header to the output file if it doesn't exist
  if [ ! -f "$output_file" ]; then
    echo "filepath,param,eid,field_id,ins" > "$output_file"
  fi

  for field_id in "${field_ids[@]}"; do
    # Query the project once and reuse the listing for both the check and the parse.
    listing=$(dx find data --property field_id="$field_id" 2>/dev/null)
    first_line=${listing%%$'\n'*}

    if [[ "$first_line" == *"Bulk"* ]]; then
      # "Bulk" found in the first hit, process this field_id
      printf '%s\n' "$listing" | awk -F'/' '
      {
        # Position of "/Bulk/" and of ".zip" in the listing line
        start_index = index($0, "/Bulk/")
        zip_index = index($0, ".zip")
        # Skip lines that do not describe a zip file below /Bulk/
        if (start_index == 0 || zip_index < start_index) next
        end_index = zip_index + 4
        # Extract filepath starting from "/Bulk/" up to and including ".zip"
        filepath = substr($0, start_index, end_index - start_index)
        split($6, a, "_")
        param = $4
        eid = a[1]
        field_id = a[2]
        ins = a[3]
        print filepath "," param "," eid "," field_id "," ins
      }
      ' >> "$output_file"
    else
      # "Bulk" not found in the first hit (or the query returned nothing)
      echo "Field ID $field_id is not a bulk field."
    fi
  done
}

In [None]:
# Field IDs scanned below (labelled "fMRI_fields" in the original note):
#fMRI_fields = ['31016','31018','31019','31015','31014']
# Rows are appended to imaging_eids.csv and the header is only written when the
# file does not exist yet, so delete the file before re-running to avoid duplicates.
extract_eids "imaging_eids.csv" 31016 31018 31019 31015 31014

In [None]:
# Upload the generated CSV with the dx CLI (default destination — presumably the
# currently selected project folder; confirm against `dx upload --help`).
dx upload "imaging_eids.csv"

In [1]:
# Build file_paths.txt from the first comma-separated column of
# anxiety_fMRI_participants.csv, one value per line.
# NOTE(review): this is a shell command; the SyntaxError recorded below this cell
#   indicates it was executed by a Python kernel. Run it in a bash kernel/cell.
# NOTE(review): the "all" variant does not skip row 1, so if the CSV has a header
#   row (the test variants below use NR > 1) it ends up as the first line of file_paths.txt.
#for test
#awk -F',' 'NR > 1 && NR <= 1000 {print $1}' anxiety_fMRI_participants.csv > file_paths1.txt
#awk -F',' 'NR > 48302 {print $1}' anxiety_fMRI_participants.csv > file_paths.txt
#for all
awk -F',' '{print $1}' anxiety_fMRI_participants.csv > file_paths.txt

SyntaxError: invalid syntax (24921675.py, line 1)

In [None]:
# Function: download_imaging_files()
# Description:
#   This function automates the download of imaging files from a RAP project based
#   on paths specified in a txt file. Each non-empty line is downloaded with its
#   own `dx download --lightweight` call; a failed download is reported and the
#   loop continues with the next path.
#
# Parameters:
#   $1 - A txt file containing file paths to bulk imaging files on the RAP (one per line)
#   $2 - (optional) Local directory to download into. When omitted, a caller-level
#        $output_dir is used if set (the previous implicit behaviour), else ".".
#        The directory is created if it does not exist.
#
# Returns:
#   0 when every download succeeded, 1 on bad arguments, missing project ID,
#   an output directory that cannot be created, or at least one failed download.
#
# Example Usage:
#  download_imaging_files file_paths.txt
#  download_imaging_files file_paths.txt "$PWD/data/"

download_imaging_files() {
  # Check that the path list argument is provided and exists
  if [[ -z "$1" ]]; then
    echo "Error: file_paths.txt is not provided."
    return 1
  fi
  if [[ ! -f "$1" ]]; then
    echo "Error: file '$1' not found."
    return 1
  fi

  # Get the project ID
  local PR
  PR=$(dx env | grep project- | cut -f 2)

  # Check if the project ID was successfully retrieved
  if [[ -z "$PR" ]]; then
    echo "Error: Could not retrieve project ID."
    return 1
  fi

  local file_paths_file="$1"
  # The expansion on the right-hand side is evaluated before the local is created,
  # so "${output_dir:-.}" still sees a caller-level output_dir when $2 is absent.
  local output_dir="${2:-${output_dir:-.}}"
  local file_path status=0

  if ! mkdir -p "$output_dir"; then
    echo "Error: Could not create output directory '$output_dir'."
    return 1
  fi

  # Run one dx download per line. The "|| [[ -n ... ]]" keeps a final line that
  # lacks a trailing newline; arguments are passed directly (no eval) so spaces
  # and shell metacharacters in a path stay inside a single argument.
  while IFS= read -r file_path || [[ -n "$file_path" ]]; do
    [[ -z "$file_path" ]] && continue
    if ! dx download --lightweight "${PR}:${file_path}" -o "$output_dir"; then
      echo "Warning: download failed for ${file_path}"
      status=1
    fi
  done < "$file_paths_file"

  return "$status"
}

In [None]:
# Download every path listed in file_paths.txt into ./data/ below the current directory.
current_dir=$(pwd)
output_dir="${current_dir}/data/"
# Quoted so that a working directory containing spaces is passed as one argument.
download_imaging_files file_paths.txt "$output_dir"

In [None]:
#for f in *.zip; do unzip "$f" -d "${f%.zip}"; done
#dx mkdir data
#for f in *.zip; do dx upload "$f" --path "/data/"; done

In [3]:
import pandas as pd
# Load the participant table; the code below requires a 'filepath' column.
df = pd.read_csv("anxiety_fMRI_participants.csv")
# Split each path on '/' and keep piece 5. For paths shaped like
# '/Bulk/Brain MRI/Functional time series/10/<name>.zip' that piece is the zip file name.
df_name = df['filepath'].str.split(r'[/]', expand=True)[5]
# Write every file name to output.txt, one per line, without index or header.
df_name.to_csv('output.txt', sep='\t', index=False, header=False)
# Keep only the rows from position 48301 onward (presumably a resume point; it lines up
# with the commented `NR > 48302` awk filter used for file_paths.txt — confirm).
df_name_left=df_name.iloc[48301:,]
# NOTE(review): this overwrites the output.txt written two statements above,
# so only the remaining names end up on disk.
df_name_left.to_csv('output.txt', sep='\t', index=False, header=False)

In [None]:
# Function: upload_imaging_files()
# Description:
#   This function automates the upload of local files to a RAP project based
#   on the file names listed in a txt file. Each non-empty line is uploaded with
#   its own `dx upload` call; a failed upload is reported and the loop continues.
#
# Parameters:
#   $1 - A txt file containing the local files to upload (one per line)
#   $2 - (optional) Destination folder in the project; defaults to "/data1/"
#
# Returns:
#   0 when every upload succeeded, 1 on bad arguments, missing project ID,
#   or at least one failed upload.
#
# Example Usage:
#  upload_imaging_files file_paths.txt
#  upload_imaging_files file_paths.txt "/data1/"

upload_imaging_files() {
  # Check that the file list argument is provided and exists
  if [[ -z "$1" ]]; then
    echo "Error: file_paths.txt is not provided."
    return 1
  fi
  if [[ ! -f "$1" ]]; then
    echo "Error: file '$1' not found."
    return 1
  fi

  # Get the project ID. It is not part of the upload command; the lookup is kept
  # as a check that a project is currently selected before uploading anything.
  local PR
  PR=$(dx env | grep project- | cut -f 2)

  # Check if the project ID was successfully retrieved
  if [[ -z "$PR" ]]; then
    echo "Error: Could not retrieve project ID."
    return 1
  fi

  local file_paths_file="$1"
  local destination="${2:-/data1/}"
  local file_path status=0

  # Run one dx upload per line. Arguments are passed directly (no eval) so a file
  # name containing spaces stays a single argument; the "|| [[ -n ... ]]" keeps a
  # final line that lacks a trailing newline.
  while IFS= read -r file_path || [[ -n "$file_path" ]]; do
    [[ -z "$file_path" ]] && continue
    if ! dx upload "$file_path" --path "$destination"; then
      echo "Warning: upload failed for ${file_path}"
      status=1
    fi
  done < "$file_paths_file"

  return "$status"
}

In [None]:
# Upload every local file named in output.txt to the project's /data1/ folder.
upload_imaging_files output.txt

In [None]:
# Fetch the project's `data` entry with the dx CLI.
# NOTE(review): -f / -r are presumably "overwrite existing local files" and
# "recursive (folders)" — confirm against `dx download --help`.
dx download --lightweight -fr data

In [74]:
# List the files that have not been downloaded successfully yet.
# Inputs : file_paths.txt (needs a 'filepath' header line) and the download directory.
# Outputs: file_paths_remain.txt (remote paths still missing) and
#          <derivatives>/output.txt (the matching file names).
result_file_path = os.path.join('/Users/xiaoqianxiao/UKB/data/derivatives', 'output.txt')
df =  pd.read_csv("file_paths.txt")
# Piece 5 of a '/'-split path such as '/Bulk/Brain MRI/Functional time series/10/<name>.zip'
# is the zip file name.
df['filename'] = df['filepath'].str.split(r'[/]', expand=True)[5]
directory = '/Users/xiaoqianxiao/UKB/data/derivatives/downloaded_UKB'
file_names = os.listdir(directory)
# Boolean flag: True when the file is already present locally. The earlier
# `.replace((1, 0), ('True', 'False'))` step is dropped on purpose: string labels
# would make a comparison against the boolean False match nothing.
df['done_files'] = df['filename'].isin(file_names)
# Select rows and column in a single .loc call (no chained indexing).
df.loc[~df['done_files'], 'filepath'].to_csv('file_paths_remain.txt', sep='\t', index=False, header=False)
df.loc[~df['done_files'], 'filename'].to_csv(result_file_path, sep='\t', index=False, header=False)

In [45]:
# Unzip every downloaded archive into its own folder below $timeseries_path.
sourceData_path='/Users/xiaoqianxiao/UKB/data/derivatives/downloaded_UKB2'
timeseries_path='/Users/xiaoqianxiao/UKB/data/derivatives/unziped_UKB'
# Only loop when the cd succeeded; otherwise *.zip would expand in the wrong directory.
if cd "$sourceData_path"; then
  for f in *.zip; do
    # With no matching archive the glob stays literal ("*.zip"); skip it.
    [ -e "$f" ] || continue
    unzip "$f" -d "${timeseries_path}/${f%.zip}"
  done
fi

Unnamed: 0,filepath,filename,done_files
0,/Bulk/Brain MRI/Functional time series/10/1025...,1025811_31016_2_0.zip,True
1,/Bulk/Brain MRI/Functional time series/16/1682...,1682145_31016_2_0.zip,True
2,/Bulk/Brain MRI/Functional time series/34/3481...,3481405_31016_2_0.zip,True
3,/Bulk/Brain MRI/Functional time series/13/1340...,1340911_31016_2_0.zip,True
4,/Bulk/Brain MRI/Functional time series/42/4287...,4287014_31016_2_0.zip,True


In [1]:
#transform to BIDS format
import os
import re
import shutil
# Set the path to your directory containing the files
# base_dir  : scanned by rename_to_bids; each sub-directory holds the files of one unzipped archive.
# target_dir: flat folder that the renamed .csv.gz files are moved into.
# NOTE(review): both are absolute paths on one user's machine; adjust before running elsewhere.
base_dir = "/Users/xiaoqianxiao/UKB/data/derivatives/unziped_UKB"
target_dir = "/Users/xiaoqianxiao/UKB/data/derivatives/timeseries"
# Function to rename files to BIDS format
#session: Instance(2=Imaging visit; 3=First repeat imaging visit)
def rename_to_bids(base_dir, out_dir=None):
    """Move every ``.csv.gz`` file below ``base_dir`` into one folder under a BIDS-style name.

    Each sub-directory of ``base_dir`` is split on ``_``, whitespace and ``;``:
    the first part becomes the subject ID and the third part the session ID
    (instance: 2 = imaging visit, 3 = first repeat imaging visit). For each
    ``.csv.gz`` file inside it, the second ``.``-separated part of the file
    name is used as the space label. The file is then moved to
    ``sub-<subject>_ses-<session>_task-rest_space-<space>.csv.gz``.

    Parameters
    ----------
    base_dir : str
        Directory containing one sub-directory per unzipped archive.
    out_dir : str, optional
        Destination folder. Defaults to the notebook-level ``target_dir``.
        It is created when it does not exist.

    Notes
    -----
    Files are moved with ``os.rename``, so they no longer exist under
    ``base_dir`` afterwards. Sub-directories whose name has fewer than three
    parts, and files that do not end in ``.csv.gz``, are left untouched.
    """
    if out_dir is None:
        out_dir = target_dir  # notebook-level default, kept for existing callers
    os.makedirs(out_dir, exist_ok=True)

    task_name = "rest"  # Modify this if you have different tasks

    for subject_dir in os.listdir(base_dir):
        subject_path = os.path.join(base_dir, subject_dir)
        if not os.path.isdir(subject_path):
            continue

        # Expected folder name: <subject>_<field>_<session>...; skip anything else
        # (e.g. hidden tool folders) instead of failing on a missing part.
        name_parts = re.split(r'[_\s;]+', subject_dir)
        if len(name_parts) < 3:
            continue
        subject_id, session_id = name_parts[0], name_parts[2]

        for filename in os.listdir(subject_path):
            if not filename.endswith('.csv.gz'):
                continue
            # Parsed only after the extension check: a name ending in '.csv.gz'
            # always has at least three '.'-separated parts, so index 1 exists.
            space_id = re.split(r'[.\s;]+', filename)[1]
            new_filename = f"sub-{subject_id}_ses-{session_id}_task-{task_name}_space-{space_id}.csv.gz"

            old_file_path = os.path.join(subject_path, filename)
            new_file_path = os.path.join(out_dir, new_filename)
            os.rename(old_file_path, new_file_path)

# Move and rename every .csv.gz file found below base_dir into target_dir.
# The files are moved (os.rename), so a second run finds nothing left to process.
rename_to_bids(base_dir)

In [69]:
# NOTE(review): leftover interactive lookup of the `re` module documentation;
# it is not part of the pipeline and can be removed together with its output.
help(re)

Help on package re:

NAME
    re - Support for regular expressions (RE).

MODULE REFERENCE
    https://docs.python.org/3.12/library/re.html

    The following documentation is automatically generated from the Python
    source files.  It may be incomplete, incorrect or include features that
    are considered implementation detail and may vary between Python
    implementations.  When in doubt, consult the module reference at the
    location listed above.

DESCRIPTION
    This module provides regular expression matching operations similar to
    those found in Perl.  It supports both 8-bit and Unicode strings; both
    the pattern and the strings being processed can contain null bytes and
    characters outside the US ASCII range.

    Regular expressions can contain both special and ordinary characters.
    Most ordinary characters, like "A", "a", or "0", are the simplest
    regular expressions; they simply match themselves.  You can
    concatenate ordinary characters, so last matc