In [88]:
import os
import pandas as pd
import numpy as np
import scanpy as sc

In [83]:
import urllib.request

In [85]:
urllib.request.urlretrieve('ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR11403589/', 'HCAHeart9508627_S1_L001_I1_001.fastq.gz	')

('HCAHeart9508627_S1_L001_I1_001.fastq.gz\t',
 <email.message.Message at 0x7f4b2af52350>)

In [111]:
def batched_processing(
        df,
        sample_column,
        url_column,
        file_name_column,
        batch_size,
        output_path,
):
    """Writes a Number of files containing the sample name and its corresponding download url
    in a csv format according to a specified batch size"""
    samples = df[sample_column].unique()
    num_batches = int(np.ceil(len(samples) / batch_size))
    
    for batch_id in range(num_batches):
        start_id = batch_id * batch_size
        end_id = min((batch_id + 1) * batch_size, len(samples))
        batch_names = samples[start_id: end_id]
        
        with open(f"{output_path}/batch_{start_id}_{end_id}.txt", "w") as file:
            for name in batch_names:
                sample = str(name)
                filtered_df = df[df[sample_column] == name]
                for file_name_id in filtered_df[file_name_column].tolist():
                    file_name = str(filtered_df[filtered_df[file_name_column] == file_name_id][file_name_column].tolist()[0])
                    file_url = filtered_df[filtered_df[file_name_column] == file_name_id][url_column].tolist()[0]
                    
                    #file.write(f"{file_name},{file_url}\n")
                    file.write(f"axel -n 10 --output={file_name} {file_url}\n")

In [2]:
data_path = '../../raw_data/'
meta_data_file = 'E-MTAB-12916.sdrf.txt'

In [16]:
meta_data = pd.read_csv(os.path.join(data_path, meta_data_file), sep='\t')
meta_data

Unnamed: 0,Source Name,Comment[ENA_SAMPLE],Comment[BioSD_SAMPLE],Characteristics[organism],Characteristics[age],Characteristics[developmental stage],Characteristics[sex],Characteristics[individual],Characteristics[organism part],Characteristics[disease],...,Assay Name,Technology Type,Comment[ENA_EXPERIMENT],Scan Name,Comment[SUBMITTED_FILE_NAME],Comment[ENA_RUN],Comment[FASTQ_URI],Comment[read_index],Comment[read_type],Factor Value[organism part]
0,HCAHeart9508627,ERS15408104,SAMEA113412973,Homo sapiens,55 to 60,adult,Male,D3,heart left ventricle,normal,...,HCAHeart9508627,sequencing assay,ERX10811380,HCAHeart9508627_S1_L001_I1_001.fastq.gz,HCAHeart9508627_S1_L001_I1_001.fastq.gz,ERR11403589,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,index1,sample_barcode,heart left ventricle
1,HCAHeart9508627,ERS15408104,SAMEA113412973,Homo sapiens,55 to 60,adult,Male,D3,heart left ventricle,normal,...,HCAHeart9508627,sequencing assay,ERX10811380,HCAHeart9508627_S1_L001_I2_001.fastq.gz,HCAHeart9508627_S1_L001_I2_001.fastq.gz,ERR11403589,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,index2,sample_barcode,heart left ventricle
2,HCAHeart9508627,ERS15408104,SAMEA113412973,Homo sapiens,55 to 60,adult,Male,D3,heart left ventricle,normal,...,HCAHeart9508627,sequencing assay,ERX10811380,HCAHeart9508627_S1_L001_R1_001.fastq.gz,HCAHeart9508627_S1_L001_R1_001.fastq.gz,ERR11403589,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,read1,"cell_barcode,umi_barcode",heart left ventricle
3,HCAHeart9508627,ERS15408104,SAMEA113412973,Homo sapiens,55 to 60,adult,Male,D3,heart left ventricle,normal,...,HCAHeart9508627,sequencing assay,ERX10811380,HCAHeart9508627_S1_L001_R2_001.fastq.gz,HCAHeart9508627_S1_L001_R2_001.fastq.gz,ERR11403589,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,read2,single,heart left ventricle
4,HCAHeart9508628,ERS15408105,SAMEA113412974,Homo sapiens,60 to 65,adult,Male,D7,right cardiac atrium,normal,...,HCAHeart9508628,sequencing assay,ERX10811381,HCAHeart9508628_S1_L001_I1_001.fastq.gz,HCAHeart9508628_S1_L001_I1_001.fastq.gz,ERR11403590,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,index1,sample_barcode,right cardiac atrium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,HCAHeartST13189996,ERS15408149,SAMEA113413018,Homo sapiens,70 to 75,adult,Female,AV13,atrioventriculeft cardiac atriumr node,normal,...,HCAHeartST13189996,sequencing assay,ERX10811425,HCAHeartST13189996_S1_L001_R2_001.fastq.gz,HCAHeartST13189996_S1_L001_R2_001.fastq.gz,ERR11403634,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,read2,single,atrioventriculeft cardiac atriumr node
184,HCAHeartST13189997,ERS15408150,SAMEA113413019,Homo sapiens,70 to 75,adult,Female,AV13,atrioventriculeft cardiac atriumr node,normal,...,HCAHeartST13189997,sequencing assay,ERX10811426,HCAHeartST13189997_S1_L001_I1_001.fastq.gz,HCAHeartST13189997_S1_L001_I1_001.fastq.gz,ERR11403635,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,index1,sample_barcode,atrioventriculeft cardiac atriumr node
185,HCAHeartST13189997,ERS15408150,SAMEA113413019,Homo sapiens,70 to 75,adult,Female,AV13,atrioventriculeft cardiac atriumr node,normal,...,HCAHeartST13189997,sequencing assay,ERX10811426,HCAHeartST13189997_S1_L001_I2_001.fastq.gz,HCAHeartST13189997_S1_L001_I2_001.fastq.gz,ERR11403635,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,index2,sample_barcode,atrioventriculeft cardiac atriumr node
186,HCAHeartST13189997,ERS15408150,SAMEA113413019,Homo sapiens,70 to 75,adult,Female,AV13,atrioventriculeft cardiac atriumr node,normal,...,HCAHeartST13189997,sequencing assay,ERX10811426,HCAHeartST13189997_S1_L001_R1_001.fastq.gz,HCAHeartST13189997_S1_L001_R1_001.fastq.gz,ERR11403635,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,read1,"cell_barcode,umi_barcode",atrioventriculeft cardiac atriumr node


In [134]:
def create_by_sample_mapping_files(
        df,
        sample_colum,
        filename_column,
        download_url_column,
        output_path
):
    for index, row in df.iterrows():
        sample_name = row[sample_colum]
        file_name = row[filename_column]
        file_url = row[download_url_column]
        with open(f'{output_path}/{sample_name}.txt', 'w') as file:
            file.write(f'{file_name}, {file_url}')

In [135]:
output_path = '../../download_batches/'
create_by_sample_mapping_files(meta_data, 'Source Name', 'Scan Name', 'Comment[FASTQ_URI]', output_path)

In [113]:
output_path = '../../download_batches/'
batched_processing(
    df=meta_data,
    sample_column='Source Name',
    url_column='Comment[FASTQ_URI]',
    file_name_column='Comment[SUBMITTED_FILE_NAME]',
    batch_size=5,
    output_path=output_path,
)
os.listdir(output_path)

['batch_35_40.txt',
 'batch_10_15.txt',
 'batch_45_47.txt',
 'batch_20_25.txt',
 'batch_0_5.txt',
 'batch_40_45.txt',
 'batch_25_30.txt',
 'batch_30_35.txt',
 'batch_5_10.txt',
 'batch_15_20.txt',
 '.batch_0_5.txt.swp']

In [114]:
%%bash
folder_path="../../download_batches/"

# Check if the folder exists
if [ ! -d "$folder_path" ]; then
    echo "Folder not found: $folder_path"
    exit 1
fi

# Loop over the files in the folder
for file in "$folder_path"/*; do
    if [ -f "$file" ]; then
        echo "Filename: $file"
        # Loop over each line in the file and print it
        while IFS= read -r line; do
            echo "$line"
        done < "$file"
    fi
done


Filename: ../../download_batches//batch_0_5.txt
axel -n 10 --output=HCAHeart9508627_S1_L001_I1_001.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR11403589/HCAHeart9508627_S1_L001_I1_001.fastq.gz
axel -n 10 --output=HCAHeart9508627_S1_L001_I2_001.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR11403589/HCAHeart9508627_S1_L001_I2_001.fastq.gz
axel -n 10 --output=HCAHeart9508627_S1_L001_R1_001.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR11403589/HCAHeart9508627_S1_L001_R1_001.fastq.gz
axel -n 10 --output=HCAHeart9508627_S1_L001_R2_001.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR11403589/HCAHeart9508627_S1_L001_R2_001.fastq.gz
axel -n 10 --output=HCAHeart9508628_S1_L001_I1_001.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR11403590/HCAHeart9508628_S1_L001_I1_001.fastq.gz
axel -n 10 --output=HCAHeart9508628_S1_L001_I2_001.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR11403590/HCAHeart9508628_S1_L001_I2_001.fastq.gz
axel -n 10 --output=HCAHeart9508628_S1_L