In [None]:
%%bash
# Function: extract_eids
# Description:
#   On the RAP, imaging data is found in the Bulk folder of a dispensed project.
#   Each image type/sequence per participant is a zip file with the naming
#   convention *{eid}_{field_id}_{instance_0}.zip*.
#   For every field ID given, this function lists the data objects tagged with
#   that field ID. If the first listed object lives under "Bulk", each listed
#   zip path is parsed and appended to the output file as one CSV row:
#     filepath,param,eid,field_id,ins
#   Otherwise a message saying the field is not a bulk field is printed.
#
# Parameters:
#   $1 - output_file: The CSV file the results are appended to. The header row
#        is written only when the file does not exist yet.
#   $@ - field_ids: The remaining arguments are the field IDs to search and
#        process. These field_ids should be bulk imaging fields.
#
# Returns:
#   1 when no output file is given; otherwise the status of the last command.
#
# Usage:
#   extract_eids "output.csv" 20204 20254

extract_eids() {
  local output_file="$1"
  if [[ -z "$output_file" ]]; then
    echo "Usage: extract_eids output_file field_id..." >&2
    return 1
  fi
  shift
  local field_ids=("$@")
  # Keep the loop state local so the caller's variables are not overwritten
  local field_id listing first_line

  # Print header to the output file if it doesn't exist
  if [[ ! -f "$output_file" ]]; then
    echo "filepath,param,eid,field_id,ins" > "$output_file"
  fi

  for field_id in "${field_ids[@]}"; do
    # Query the platform once per field and reuse the listing for both the
    # bulk check and the parsing step
    listing=$(dx find data --property field_id="$field_id" 2>/dev/null)
    first_line=${listing%%$'\n'*}

    if [[ "$first_line" == *"Bulk"* ]]; then
      # 'Bulk' found in the first listed path, process this field_id
      printf '%s\n' "$listing" | awk -F'/' '
      {
        # Find the index of "/Bulk/" and of ".zip"
        start_index = index($0, "/Bulk/")
        zip_index = index($0, ".zip")
        # Skip lines that do not hold a bulk zip path; they would otherwise
        # produce rows with an empty or garbage filepath
        if (start_index == 0 || zip_index < start_index) next
        end_index = zip_index + 4
        # Extract filepath starting from "/Bulk/" up to and including ".zip"
        filepath = substr($0, start_index, end_index - start_index)
        # With "/" as separator, field 4 is the folder used as param and
        # field 6 is the file name {eid}_{field_id}_{instance}...
        split($6, a, "_")
        param = $4
        eid = a[1]
        field_id = a[2]
        ins = a[3]
        print filepath "," param "," eid "," field_id "," ins
      }
      ' >> "$output_file"
    else
      # 'Bulk' not found in the first listed path
      echo "Field ID $field_id is not a bulk field."
    fi
  done
}

In [None]:
# Build imaging_eids.csv for the fMRI time-series bulk fields listed below.
#fMRI_fields = ['31016','31018','31019','31015','31014']
# 31016 fMRI timeseries Glasser	Functional time series
# 31018 fMRI timeseries Schaefer7ns 100p to 1000p
# 31019 fMRI timeseries Tian Subcortex S1 to S4 3T
# 31015 fMRI timeseries aparc	Functional time series
# 31014 fMRI timeseries aparc
# NOTE(review): extract_eids is defined in a separate %%bash cell. Each %%bash
# cell and each `!` command presumably runs in its own shell process, so confirm
# the function is actually defined in the shell this line spawns.

!extract_eids "imaging_eids.csv" 31016 31018 31019 31015 31014

In [None]:
# Upload the generated CSV to the platform with the dx CLI
# (NOTE(review): destination is whatever project/folder dx currently has selected — confirm).
!dx upload "imaging_eids.csv"

In [1]:
#for test
#awk -F',' 'NR > 1 && NR <= 1000 {print $1}' anxiety_fMRI_participants.csv > file_paths1.txt
#awk -F',' 'NR > 48302 {print $1}' anxiety_fMRI_participants.csv > file_paths.txt
#for all
# Write the first CSV column (the file path) to file_paths.txt, one per line.
# NR > 1 skips the header row: need_download_data_set.csv is read elsewhere in
# this notebook with a 'filepath' header column, and without the filter that
# literal column name would be written out as a path to download.
!awk -F',' 'NR > 1 {print $1}' need_download_data_set.csv > file_paths.txt

In [None]:
%%bash
# Function: download_imaging_files()
# Description:
#   This function automates the download of imaging files from a RAP project based
#   on paths specified in a txt file. Each non-empty line of the file is passed
#   to `dx download --lightweight` as <current project>:<path>.
#
# Parameters:
#   $1 - A txt file containing file paths to bulk imaging files on the RAP,
#        one path per line
#   $2 - (optional) Local directory to download into. It is created when it
#        does not exist. When omitted, no -o option is passed to dx download.
#
# Returns:
#   1 if the path file is missing, the project ID cannot be determined, the
#   output directory cannot be created, or any download fails; 0 otherwise.
#
# Example Usage:
#  download_imaging_files file_paths.txt
#  download_imaging_files file_paths.txt "$PWD/data"

download_imaging_files() {
  local file_paths_file=$1
  local output_dir=${2:-}

  # Check if the file_paths.txt argument is provided and exists
  if [[ -z "$file_paths_file" || ! -f "$file_paths_file" ]]; then
    echo "Error: file_paths.txt is not provided."
    return 1
  fi

  # Get the project ID
  local PR
  PR=$(dx env | grep project- | cut -f 2)

  # Check if the project ID was successfully retrieved
  if [[ -z "$PR" ]]; then
    echo "Error: Could not retrieve project ID."
    return 1
  fi

  # Only pass -o when a destination was requested. The directory is created
  # first so that -o is interpreted as a directory rather than a file name.
  local -a dest_args=()
  if [[ -n "$output_dir" ]]; then
    mkdir -p -- "$output_dir" || return 1
    dest_args=(-o "$output_dir")
  fi

  # Run dx download for each line in the file. The command is invoked directly
  # (no eval) so paths with spaces or shell metacharacters stay one argument.
  local file_path
  local rc=0
  while IFS= read -r file_path || [[ -n "$file_path" ]]; do
    # Ignore blank lines
    [[ -z "$file_path" ]] && continue
    dx download --lightweight "${PR}:${file_path}" "${dest_args[@]}" || rc=1
  done < "$file_paths_file"
  return "$rc"
}

In [None]:
# Download every path listed in file_paths.txt into the data/ folder under the
# current working directory.
current_dir=$(pwd)
output_dir="${current_dir}/data"
# Quoted so a working directory containing spaces is passed as a single argument.
download_imaging_files file_paths.txt "$output_dir"

In [3]:
import pandas as pd
import os
# Get the current working directory
current_dir = os.getcwd()
# Load the CSV describing the files to download; a 'filepath' column is required.
df = pd.read_csv("need_download_data_set.csv")
# Split every path on '/' and keep component 5 of the result.
# NOTE(review): presumably this is the zip file name for paths shaped like
# /Bulk/<a>/<b>/<c>/<file>.zip — confirm every row has that depth.
df_name = df['filepath'].str.split(r'[/]', expand=True)[5]
# Write the extracted names, one per line with no header or index, to
# data/output.txt under the current working directory.
data_path = os.path.join(current_dir, 'data/output.txt')
df_name.to_csv(data_path, sep='\t', index=False, header=False)
#only if data did not fully downloaded
#df_name_left=df_name.iloc[48301:,]
#df_name_left.to_csv('output.txt', sep='\t', index=False, header=False)

In [None]:
%%bash
# Function: upload_imaging_files()
# Description:
#   This function automates the upload of local files to the /data/ folder on
#   the RAP, based on file names specified in a txt file. Each non-empty line
#   of the file is passed to `dx upload ... --path /data/`.
#
# Parameters:
#   $1 - A txt file containing the local files to upload, one per line
#
# Returns:
#   1 if the list file is missing, the project ID cannot be determined, or any
#   upload fails; 0 otherwise.
#
# Example Usage:
#  upload_imaging_files file_paths.txt

upload_imaging_files() {
  # Check if the file_paths.txt argument is provided and exists
  if [[ -z "$1" || ! -f "$1" ]]; then
    echo "Error: file_paths.txt is not provided."
    return 1
  fi

  # Get the project ID. It is not part of the upload command; it is only used
  # to stop early when dx reports no project.
  local PR
  PR=$(dx env | grep project- | cut -f 2)

  # Check if the project ID was successfully retrieved
  if [[ -z "$PR" ]]; then
    echo "Error: Could not retrieve project ID."
    return 1
  fi

  # Run dx upload for each line in the file. The command is invoked directly
  # (no eval) so file names with spaces or shell metacharacters stay one
  # argument instead of being split or executed.
  local file_paths_file=$1
  local file_path
  local rc=0
  while IFS= read -r file_path || [[ -n "$file_path" ]]; do
    # Ignore blank lines
    [[ -z "$file_path" ]] && continue
    dx upload "$file_path" --path "/data/" || rc=1
  done < "$file_paths_file"
  return "$rc"
}

In [None]:
# Upload every file named in output.txt.
# NOTE(review): the pandas cell in this notebook writes the list to
# data/output.txt, while this reads output.txt relative to the current
# directory — confirm the working directory. Also confirm upload_imaging_files
# is defined in the shell that `!` spawns.
!upload_imaging_files output.txt