# Adjusting metadata table to make it suitable for downloading and mapping dataset ERP123138

Raw data downloaded from [here](https://www.ebi.ac.uk/ena/browser/view/PRJEB39602)

    Developed by: Christian Eger
    Würzburg Institute for Systems Immunology - Faculty of Medicine - Julius Maximilian Universität Würzburg
    Created on: 240603
    Last modified: 240605

## Import modules

In [2]:
import pandas as pd

## Inspect default table

In [3]:
df = pd.read_csv('.data/meta_data/filereport_read_run_PRJEB39602_tsv.txt', sep='\t', index_col=0)
df

Unnamed: 0_level_0,sample_accession,experiment_accession,run_accession,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp,bam_ftp
study_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
PRJEB39602,SAMEA7249749,ERX4319127,ERR6449746,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449746/H...,,
PRJEB39602,SAMEA7249695,ERX4319157,ERR6449748,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449748/H...,,
PRJEB39602,SAMEA7249707,ERX4319184,ERR6449751,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449751/H...,,
PRJEB39602,SAMEA7249690,ERX4319178,ERR6449752,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449752/H...,,
PRJEB39602,SAMEA7249729,ERX4319192,ERR6449757,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449757/H...,,
...,...,...,...,...,...,...,...,...,...
PRJEB39602,SAMEA7249837,ERX4319213,ERR7423470,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423470/H...,,
PRJEB39602,SAMEA7762099,ERX4733969,ERR7423473,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423473/H...,,
PRJEB39602,SAMEA7249680,ERX4319154,ERR7423476,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423476/H...,,
PRJEB39602,SAMEA7249680,ERX4319154,ERR7423477,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423477/H...,,


## Make Adjustments

Splitting up download link into individual links to download I1, R1 and R2 files

In [7]:
df[['I1_url', 'R1_url', 'R2_url']] = df['submitted_ftp'].str.split(';', expand=True)
df

Unnamed: 0_level_0,sample_accession,experiment_accession,run_accession,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp,bam_ftp,I1_url,R1_url,R2_url
study_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
PRJEB39602,SAMEA7249749,ERX4319127,ERR6449746,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449746/H...,,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449746/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449746/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449746/H...
PRJEB39602,SAMEA7249695,ERX4319157,ERR6449748,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449748/H...,,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449748/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449748/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449748/H...
PRJEB39602,SAMEA7249707,ERX4319184,ERR6449751,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449751/H...,,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449751/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449751/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449751/H...
PRJEB39602,SAMEA7249690,ERX4319178,ERR6449752,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449752/H...,,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449752/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449752/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449752/H...
PRJEB39602,SAMEA7249729,ERX4319192,ERR6449757,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449757/H...,,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449757/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449757/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449757/H...
...,...,...,...,...,...,...,...,...,...,...,...,...
PRJEB39602,SAMEA7249837,ERX4319213,ERR7423470,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423470/H...,,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423470/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423470/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423470/H...
PRJEB39602,SAMEA7762099,ERX4733969,ERR7423473,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423473/H...,,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423473/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423473/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423473/H...
PRJEB39602,SAMEA7249680,ERX4319154,ERR7423476,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423476/H...,,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423476/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423476/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423476/H...
PRJEB39602,SAMEA7249680,ERX4319154,ERR7423477,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423477/H...,,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423477/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423477/H...,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423477/H...


Expand Table Rows such that each filetype (I1, R1 and R2) has its own row, sharing the sample name with eachother, but not sharing url_type, url and filename columns \
Create new columns url_type (describing the file type downloaded in this rows url) and filename (name of the file to be downloaded)

In [8]:
# Initialize an empty list to hold new rows
new_rows = []

# Iterate over the original DataFrame rows
for _, row in df.iterrows():
    # Create a new row for I1_url
    new_row_I1 = row.drop(labels=['R1_url', 'R2_url']).to_dict()
    new_row_I1['url_type'] = 'I1_url'
    new_row_I1['url'] = row['I1_url']
    new_rows.append(new_row_I1)

    # Create a new row for R1_url
    new_row_R1 = row.drop(labels=['I1_url', 'R2_url']).to_dict()
    new_row_R1['url_type'] = 'R1_url'
    new_row_R1['url'] = row['R1_url']
    new_rows.append(new_row_R1)

    # Create a new row for R2_url
    new_row_R2 = row.drop(labels=['I1_url', 'R1_url']).to_dict()
    new_row_R2['url_type'] = 'R2_url'
    new_row_R2['url'] = row['R2_url']
    new_rows.append(new_row_R2)

# Create a new DataFrame from the list of new rows
new_df = pd.DataFrame(new_rows)
new_df = new_df.drop(columns=['I1_url', 'R1_url', 'R2_url'])
new_df['filename'] = new_df['url'].str.split('/').str[-1]
new_df


Unnamed: 0,sample_accession,experiment_accession,run_accession,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp,bam_ftp,url_type,url,filename
0,SAMEA7249749,ERX4319127,ERR6449746,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449746/H...,,,I1_url,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449746/H...,HCAHeart7664653_S1_L001_I1_001.fastq.gz
1,SAMEA7249749,ERX4319127,ERR6449746,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449746/H...,,,R1_url,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449746/H...,HCAHeart7664653_S1_L001_R1_001.fastq.gz
2,SAMEA7249749,ERX4319127,ERR6449746,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449746/H...,,,R2_url,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449746/H...,HCAHeart7664653_S1_L001_R2_001.fastq.gz
3,SAMEA7249695,ERX4319157,ERR6449748,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449748/H...,,,I1_url,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449748/H...,HCAHeart7664652_S1_L001_I1_001.fastq.gz
4,SAMEA7249695,ERX4319157,ERR6449748,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449748/H...,,,R1_url,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449748/H...,HCAHeart7664652_S1_L001_R1_001.fastq.gz
...,...,...,...,...,...,...,...,...,...,...,...,...
1531,SAMEA7249680,ERX4319154,ERR7423477,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423477/H...,,,R1_url,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423477/H...,H0035_RV_S2_L004_R1_001.fastq.gz
1532,SAMEA7249680,ERX4319154,ERR7423477,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423477/H...,,,R2_url,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423477/H...,H0035_RV_S2_L004_R2_001.fastq.gz
1533,SAMEA7762101,ERX4733977,ERR7423479,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423479/H...,,,I1_url,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423479/H...,HCAHeart7702882_S1_L001_I1_001.fastq.gz
1534,SAMEA7762101,ERX4733977,ERR7423479,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423479/H...,,,R1_url,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423479/H...,HCAHeart7702882_S1_L001_R1_001.fastq.gz


Remove rows containing I1 files

In [10]:
new_df = new_df[new_df['url_type'] != 'I1_url']

In [6]:
new_df.to_csv('.data/meta_data/downloads_table.csv')

## Inspect table in object

In [1]:
import experiment
import os


GEX_Experiment = experiment.TenX_GEX_Experiment(
    parent_dir=os.getcwd(),
    meta_data_path='meta_data/downloads_table.csv',
    sample_col='run_accession',
    file_name_col='filename',
    url_col='url',
    mapping_output='mapping_py'
)

GEX_Experiment.meta_data

Unnamed: 0_level_0,Unnamed: 0,sample_accession,experiment_accession,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp,bam_ftp,url_type,url,filename
run_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ERR6449746,1,SAMEA7249749,ERX4319127,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449746/H...,,,R1_url,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449746/H...,HCAHeart7664653_S1_L001_R1_001.fastq.gz
ERR6449746,2,SAMEA7249749,ERX4319127,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449746/H...,,,R2_url,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449746/H...,HCAHeart7664653_S1_L001_R2_001.fastq.gz
ERR6449748,4,SAMEA7249695,ERX4319157,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449748/H...,,,R1_url,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449748/H...,HCAHeart7664652_S1_L001_R1_001.fastq.gz
ERR6449748,5,SAMEA7249695,ERX4319157,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449748/H...,,,R2_url,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449748/H...,HCAHeart7664652_S1_L001_R2_001.fastq.gz
ERR6449751,7,SAMEA7249707,ERX4319184,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449751/H...,,,R1_url,ftp.sra.ebi.ac.uk/vol1/run/ERR644/ERR6449751/H...,HCAHeart7698016_S1_L001_R1_001.fastq.gz
...,...,...,...,...,...,...,...,...,...,...,...,...
ERR7423476,1529,SAMEA7249680,ERX4319154,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423476/H...,,,R2_url,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423476/H...,H0035_RV_S2_L003_R2_001.fastq.gz
ERR7423477,1531,SAMEA7249680,ERX4319154,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423477/H...,,,R1_url,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423477/H...,H0035_RV_S2_L004_R1_001.fastq.gz
ERR7423477,1532,SAMEA7249680,ERX4319154,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423477/H...,,,R2_url,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423477/H...,H0035_RV_S2_L004_R2_001.fastq.gz
ERR7423479,1534,SAMEA7762101,ERX4733977,9606,Homo sapiens,,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423479/H...,,,R1_url,ftp.sra.ebi.ac.uk/vol1/run/ERR742/ERR7423479/H...,HCAHeart7702882_S1_L001_R1_001.fastq.gz


In [3]:
len(GEX_Experiment.meta_data.index.unique())

512