# Create sample map tsv files

- generate four sex-specific TSVs that point from each sample to where the following files (generated by Step 3) exist:
    - `chrX_PAR1_sample_map.tsv`
        - For XY samples, map `sample_id` to `XY_X_PAR_hcVCF_gz` output from Step 3
        - For XX samples, map `sample_id` to `XX_X_hcVCF_gz` output from Step 3
    - `chrX_PAR2_sample_map.tsv`
        - Same as above
    - `chrX_non_PAR_sample_map.tsv`
        - For XY samples, map `sample_id` to `XY_X_non_PAR_hcVCF_gz` output from Step 3
        - For XX samples, map `sample_id` to `XX_X_hcVCF_gz` output from Step 3
    - `chrY_sample_map.tsv`
        - For XY samples, map `sample_id` to `XY_Y_nonPAR_hcVCF_gz` output from Step 3
        - Do not include XX samples
- generate remainder mapping files for autosomomal chromosomes
    - `chr<#>_sample_map.tsv`

In [1]:
import subprocess
import pandas as pd

In [2]:
def map_sample_autosomal_chr(submission_tsv, autosomal_chr_map_list):
    chrms = list(range(22))
    
    submission_id = submission_tsv.strip('.tsv').split('/')[-1]
    submission_tsv_df = pd.read_csv(submission_tsv, sep='\t')
    all_samples = list(submission_tsv_df['asText'])

    for sample in all_samples:
        sample_split = sample.strip().replace('"','').split(' ')

        sample_name = sample_split[3]
        workflow_id = sample_split[4]

        # get gsutil urls
        bashCommand = "gsutil ls gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/submissions/" + submission_id + "/haplotype_calling_chrom/" + workflow_id + "/**/*vcf*"
        process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
        outputs, error = process.communicate()
        outputs = outputs.decode("utf-8").replace('\n',' ').strip().split()

        # assign autosomal chromosome maps
        candidate_chr_idx_sample_map_urls = [[] for _ in chrms]   # make empty list of lists

        for chrm_idx in chrms:            
            for output in outputs:
                if output.endswith( '.chr' + str(chrm_idx+1) + '.hc.vcf.gz'):
                    candidate_chr_idx_sample_map_urls[chrm_idx].append(output)

            # select url from last attempt and map sample_name to file
            autosomal_chr_map_list[chrm_idx][sample_name] = sorted(candidate_chr_idx_sample_map_urls[chrm_idx])[-1]

    return autosomal_chr_map_list


In [3]:
def map_sample_sex_chr(submission_tsv,
        chrX_PAR1_sample_map,
        chrX_PAR2_sample_map,
        chrX_non_PAR_sample_map,
        chrY_sample_map):

    submission_id = submission_tsv.strip('.tsv').split('/')[-1]

    submission_tsv_df = pd.read_csv(submission_tsv, sep='\t')
    all_samples = list(submission_tsv_df['asText'])

    for sample in all_samples:
        sample_split = sample.strip().replace('"','').split(' ')

        sex = sample_split[-1]
        sample_name = sample_split[3]
        workflow_id = sample_split[4]  #  d6795d02-f59e-48c0-89bf-51abc4d9160b

        # get gsutil urls
        bashCommand = "gsutil ls gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/submissions/" + submission_id + "/haplotype_calling_chrom/" + workflow_id + "/**/*vcf*"
        process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
        outputs, error = process.communicate()
        outputs = outputs.decode("utf-8").replace('\n',' ').strip().split()

        # assign to proper files
        if sex == 'male':
            candidate_chrX_PAR_sample_map_urls = []
            candidate_chrX_non_PAR_sample_map_urls = []
            candidate_chrY_sample_map_urls = []

            for output in outputs:
                # add to: chrX_PAR1_sample_map.tsv and chrX_PAR2_sample_map.tsv
                #   For XY samples, map sample_id to XY_X_PAR_hcVCF_gz output from Step 3
                if output.endswith('.chrX_PAR.hc.vcf.gz'):
                    candidate_chrX_PAR_sample_map_urls.append(output)

                # add to: chrX_non_PAR_sample_map.tsv
                #   For XY samples, map sample_id to XY_X_non_PAR_hcVCF_gz output from Step 3
                if output.endswith('.chrX_non_PAR.hc.vcf.gz'):
                    candidate_chrX_non_PAR_sample_map_urls.append(output)

                # add to: chrY_sample_map.tsv
                #   For XY samples, map sample_id to XY_Y_nonPAR_hcVCF_gz output from Step 3
                if output.endswith('.chrY.hc.vcf.gz'):
                    candidate_chrY_sample_map_urls.append(output)

            # select url from last attempt
            chrX_PAR_sample_map_url = sorted(candidate_chrX_PAR_sample_map_urls)[-1]
            chrX_non_PAR_sample_map_url = sorted(candidate_chrX_non_PAR_sample_map_urls)[-1]
            chrY_sample_map_url = sorted(candidate_chrY_sample_map_urls)[-1]

            # map sample_name to file
            chrX_PAR1_sample_map[sample_name] = chrX_PAR_sample_map_url
            chrX_PAR2_sample_map[sample_name] = chrX_PAR_sample_map_url
            chrX_non_PAR_sample_map[sample_name] = chrX_non_PAR_sample_map_url
            chrY_sample_map[sample_name] = chrY_sample_map_url

        elif sex == 'female':
            candidate_chrX_PAR_sample_map_urls = []
            for output in outputs:
                if output.endswith('.chrX.hc.vcf.gz'):
                    candidate_chrX_PAR_sample_map_urls.append(output)

            # select url from last attempt
            chrX_PAR_sample_map_url = sorted(candidate_chrX_PAR_sample_map_urls)[-1]

            # map sample_name to file
            chrX_PAR1_sample_map[sample_name] = chrX_PAR_sample_map_url
            chrX_PAR2_sample_map[sample_name] = chrX_PAR_sample_map_url
            chrX_non_PAR_sample_map[sample_name] = chrX_PAR_sample_map_url
        else:
            raise Exception('Unlabeled sex')

    return chrX_PAR1_sample_map, chrX_PAR2_sample_map, chrX_non_PAR_sample_map, chrY_sample_map


In [4]:
submission_tsv_male = '/home/stephen/Documents/projects/schatz_lab/rotation/t2t-sgdp/data/25a197b7-7803-4ebb-9550-ec8187d04cbd.tsv'
submission_tsv_female = '/home/stephen/Documents/projects/schatz_lab/rotation/t2t-sgdp/data/d8c61c51-1023-4024-89e2-5c3856a5237c.tsv'

In [5]:
# map autosomal chromosomes
chrms = list(range(22))
autosomal_chr_map_list = [{} for _ in chrms]

autosomal_chr_map_list = map_sample_autosomal_chr(submission_tsv_male, autosomal_chr_map_list)
autosomal_chr_map_list = map_sample_autosomal_chr(submission_tsv_female, autosomal_chr_map_list)

chr1_sample_map, chr2_sample_map, chr3_sample_map, chr4_sample_map, chr5_sample_map, chr6_sample_map, chr7_sample_map, chr8_sample_map, chr9_sample_map, chr10_sample_map, chr11_sample_map, chr12_sample_map, chr13_sample_map, chr14_sample_map, chr15_sample_map, chr16_sample_map, chr17_sample_map, chr18_sample_map, chr19_sample_map, chr20_sample_map, chr21_sample_map, chr22_sample_map = autosomal_chr_map_list

In [6]:
# files that we want
chr1_sample_map_df  = pd.DataFrame.from_dict(chr1_sample_map, orient='index')
chr2_sample_map_df  = pd.DataFrame.from_dict(chr2_sample_map, orient='index')
chr3_sample_map_df  = pd.DataFrame.from_dict(chr3_sample_map, orient='index')
chr4_sample_map_df  = pd.DataFrame.from_dict(chr4_sample_map, orient='index')
chr5_sample_map_df  = pd.DataFrame.from_dict(chr5_sample_map, orient='index')
chr6_sample_map_df  = pd.DataFrame.from_dict(chr6_sample_map, orient='index')
chr7_sample_map_df  = pd.DataFrame.from_dict(chr7_sample_map, orient='index')
chr8_sample_map_df  = pd.DataFrame.from_dict(chr8_sample_map, orient='index')
chr9_sample_map_df  = pd.DataFrame.from_dict(chr9_sample_map, orient='index')
chr10_sample_map_df = pd.DataFrame.from_dict(chr10_sample_map, orient='index')
chr11_sample_map_df = pd.DataFrame.from_dict(chr11_sample_map, orient='index')
chr12_sample_map_df = pd.DataFrame.from_dict(chr12_sample_map, orient='index')
chr13_sample_map_df = pd.DataFrame.from_dict(chr13_sample_map, orient='index')
chr14_sample_map_df = pd.DataFrame.from_dict(chr14_sample_map, orient='index')
chr15_sample_map_df = pd.DataFrame.from_dict(chr15_sample_map, orient='index')
chr16_sample_map_df = pd.DataFrame.from_dict(chr16_sample_map, orient='index')
chr17_sample_map_df = pd.DataFrame.from_dict(chr17_sample_map, orient='index')
chr18_sample_map_df = pd.DataFrame.from_dict(chr18_sample_map, orient='index')
chr19_sample_map_df = pd.DataFrame.from_dict(chr19_sample_map, orient='index')
chr20_sample_map_df = pd.DataFrame.from_dict(chr20_sample_map, orient='index')
chr21_sample_map_df = pd.DataFrame.from_dict(chr21_sample_map, orient='index')
chr22_sample_map_df = pd.DataFrame.from_dict(chr22_sample_map, orient='index')

In [7]:
# write to tsv
base_dir = '/home/stephen/Documents/projects/schatz_lab/rotation/t2t-sgdp/data/sample_maps/'

chr1_sample_map_df.to_csv(base_dir  + 'chr1_sample_map.tsv',  sep="\t", header=False)
chr2_sample_map_df.to_csv(base_dir  + 'chr2_sample_map.tsv',  sep="\t", header=False)
chr3_sample_map_df.to_csv(base_dir  + 'chr3_sample_map.tsv',  sep="\t", header=False)
chr4_sample_map_df.to_csv(base_dir  + 'chr4_sample_map.tsv',  sep="\t", header=False)
chr5_sample_map_df.to_csv(base_dir  + 'chr5_sample_map.tsv',  sep="\t", header=False)
chr6_sample_map_df.to_csv(base_dir  + 'chr6_sample_map.tsv',  sep="\t", header=False)
chr7_sample_map_df.to_csv(base_dir  + 'chr7_sample_map.tsv',  sep="\t", header=False)
chr8_sample_map_df.to_csv(base_dir  + 'chr8_sample_map.tsv',  sep="\t", header=False)
chr9_sample_map_df.to_csv(base_dir  + 'chr9_sample_map.tsv',  sep="\t", header=False)
chr10_sample_map_df.to_csv(base_dir + 'chr10_sample_map.tsv', sep="\t", header=False)
chr11_sample_map_df.to_csv(base_dir + 'chr11_sample_map.tsv', sep="\t", header=False)
chr12_sample_map_df.to_csv(base_dir + 'chr12_sample_map.tsv', sep="\t", header=False)
chr13_sample_map_df.to_csv(base_dir + 'chr13_sample_map.tsv', sep="\t", header=False)
chr14_sample_map_df.to_csv(base_dir + 'chr14_sample_map.tsv', sep="\t", header=False)
chr15_sample_map_df.to_csv(base_dir + 'chr15_sample_map.tsv', sep="\t", header=False)
chr16_sample_map_df.to_csv(base_dir + 'chr16_sample_map.tsv', sep="\t", header=False)
chr17_sample_map_df.to_csv(base_dir + 'chr17_sample_map.tsv', sep="\t", header=False)
chr18_sample_map_df.to_csv(base_dir + 'chr18_sample_map.tsv', sep="\t", header=False)
chr19_sample_map_df.to_csv(base_dir + 'chr19_sample_map.tsv', sep="\t", header=False)
chr20_sample_map_df.to_csv(base_dir + 'chr20_sample_map.tsv', sep="\t", header=False)
chr21_sample_map_df.to_csv(base_dir + 'chr21_sample_map.tsv', sep="\t", header=False)
chr22_sample_map_df.to_csv(base_dir + 'chr22_sample_map.tsv', sep="\t", header=False)

In [8]:
chrX_PAR1_sample_map = {}
chrX_PAR2_sample_map = {}
chrX_non_PAR_sample_map = {}
chrY_sample_map = {}

In [9]:
# map male samples
chrX_PAR1_sample_map, chrX_PAR2_sample_map, chrX_non_PAR_sample_map, chrY_sample_map = map_sample_sex_chr(submission_tsv_male,
                                                                                            chrX_PAR1_sample_map,
                                                                                            chrX_PAR2_sample_map,
                                                                                            chrX_non_PAR_sample_map,
                                                                                            chrY_sample_map)

# map female samples
chrX_PAR1_sample_map, chrX_PAR2_sample_map, chrX_non_PAR_sample_map, chrY_sample_map = map_sample_sex_chr(submission_tsv_female,
                                                                                            chrX_PAR1_sample_map,
                                                                                            chrX_PAR2_sample_map,
                                                                                            chrX_non_PAR_sample_map,
                                                                                            chrY_sample_map)

In [10]:
# files that we want
chrX_PAR1_sample_map_df = pd.DataFrame.from_dict(chrX_PAR1_sample_map, orient='index')
chrX_PAR2_sample_map_df = pd.DataFrame.from_dict(chrX_PAR2_sample_map, orient='index')
chrX_non_PAR_sample_map_df = pd.DataFrame.from_dict(chrX_non_PAR_sample_map, orient='index')
chrY_sample_map_df = pd.DataFrame.from_dict(chrY_sample_map, orient='index')

In [11]:
# write to tsv
base_dir = '/home/stephen/Documents/projects/schatz_lab/rotation/t2t-sgdp/data/sample_maps/'

chrX_PAR1_sample_map_df.to_csv(base_dir + 'chrX_PAR1_sample_map.tsv', sep="\t", header=False)
chrX_PAR2_sample_map_df.to_csv(base_dir + 'chrX_PAR2_sample_map.tsv', sep="\t", header=False)
chrX_non_PAR_sample_map_df.to_csv(base_dir + 'chrX_non_PAR_sample_map.tsv', sep="\t", header=False)
chrY_sample_map_df.to_csv(base_dir + 'chrY_sample_map.tsv', sep="\t", header=False)

In [12]:
# view files
chrX_PAR1_sample_map_df
#chrX_PAR2_sample_map_df
#chrX_non_PAR_sample_map_df
#chrY_sample_map_df

Unnamed: 0,0
simons_data_sample_12,gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/s...
simons_data_sample_139,gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/s...
simons_data_sample_207,gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/s...
simons_data_sample_22,gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/s...
simons_data_sample_306,gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/s...
simons_data_sample_11,gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/s...
simons_data_sample_132,gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/s...
simons_data_sample_14,gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/s...
simons_data_sample_186,gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/s...
simons_data_sample_45,gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/s...


In [12]:
%%bash

# upload tsv to gsutils bucket

# bucket
#https://console.cloud.google.com/storage/browser/fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70
# make new folder under 'uploads':  'sample_maps'

base_dir=/home/stephen/Documents/projects/schatz_lab/rotation/t2t-sgdp/data/sample_maps
bucket_id=fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70
gsutil cp $base_dir/* gs://$bucket_id/uploads/sample_maps/

Copying file:///home/stephen/Documents/projects/schatz_lab/rotation/t2t-sgdp/data/sample_maps/chr10_sample_map.tsv [Content-Type=text/tab-separated-values]...
Copying file:///home/stephen/Documents/projects/schatz_lab/rotation/t2t-sgdp/data/sample_maps/chr11_sample_map.tsv [Content-Type=text/tab-separated-values]...
Copying file:///home/stephen/Documents/projects/schatz_lab/rotation/t2t-sgdp/data/sample_maps/chr12_sample_map.tsv [Content-Type=text/tab-separated-values]...
Copying file:///home/stephen/Documents/projects/schatz_lab/rotation/t2t-sgdp/data/sample_maps/chr13_sample_map.tsv [Content-Type=text/tab-separated-values]...
- [4 files][  9.0 KiB/  9.0 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file:///home/stephen/Do