# Create sample map tsv files

- Generate four TSVs, each mapping every `sample_id` to the location of the corresponding file generated by Step 3:
	- `chrX_PAR1_sample_map.tsv`
		- For XY samples, map `sample_id` to `XY_X_PAR_hcVCF_gz` output from Step 3
		- For XX samples, map `sample_id` to `XX_X_hcVCF_gz` output from Step 3
	- `chrX_PAR2_sample_map.tsv`
		- Same mapping as `chrX_PAR1_sample_map.tsv` above
	- `chrX_non_PAR_sample_map.tsv`
		- For XY samples, map `sample_id` to `XY_X_non_PAR_hcVCF_gz` output from Step 3
		- For XX samples, map `sample_id` to `XX_X_hcVCF_gz` output from Step 3
	- `chrY_sample_map.tsv`
		- For XY samples, map `sample_id` to `XY_Y_nonPAR_hcVCF_gz` output from Step 3
		- Do not include XX samples


In [1]:
import subprocess
import pandas as pd

In [2]:
# One independent, initially empty dict per output sample-map TSV.
chrX_PAR1_sample_map, chrX_PAR2_sample_map = dict(), dict()
chrX_non_PAR_sample_map, chrY_sample_map = dict(), dict()

In [3]:
def _latest_url(urls, suffix, sample_name):
    """Return the lexicographically greatest URL in *urls* ending with *suffix*.

    Raises IndexError (with the suffix and sample name in the message) when
    no URL matches, instead of failing on an empty-list index.
    """
    matches = [url for url in urls if url.endswith(suffix)]
    if not matches:
        raise IndexError(
            'no gsutil output ending with {!r} found for sample {!r}'.format(
                suffix, sample_name))
    # NOTE(review): the greatest string is taken to be the last attempt;
    # whether that holds depends on the bucket's directory layout -- confirm.
    return max(matches)


def map_samples(submission_tsv,
        chrX_PAR1_sample_map,
        chrX_PAR2_sample_map,
        chrX_non_PAR_sample_map,
        chrY_sample_map):
    """Add every sample listed in a submission TSV to the four sample maps.

    The submission id is the TSV's file name without its ``.tsv`` suffix.
    Each value of the ``asText`` column is split on single spaces (after
    removing double quotes); field 3 is used as the sample name, field 4 as
    the workflow id and the last field as the sex (``male`` or ``female``).
    For each sample, ``gsutil ls`` lists the ``*vcf*`` objects of that
    workflow and the matching URLs are stored under the sample name:

    * male: ``.chrX_PAR.hc.vcf.gz`` -> PAR1 and PAR2 maps,
      ``.chrX_non_PAR.hc.vcf.gz`` -> non-PAR map,
      ``.chrY.hc.vcf.gz`` -> chrY map.
    * female: ``.chrX.hc.vcf.gz`` -> PAR1, PAR2 and non-PAR maps
      (no chrY entry).

    Parameters
    ----------
    submission_tsv : str
        Path to the submission TSV; must contain an ``asText`` column.
    chrX_PAR1_sample_map, chrX_PAR2_sample_map, chrX_non_PAR_sample_map,
    chrY_sample_map : dict
        Maps from sample name to gsutil URL; updated in place.

    Returns
    -------
    tuple of dict
        The same four dicts, in the order they were passed in.

    Raises
    ------
    ValueError
        If a sample's sex is neither ``male`` nor ``female``.
    IndexError
        If a row has too few fields, or an expected VCF is not listed.
    """
    # Remove the literal '.tsv' suffix.  str.strip('.tsv') would instead
    # strip any of the characters '.', 't', 's', 'v' from both ends and can
    # eat into the submission id itself.
    submission_id = submission_tsv.split('/')[-1]
    if submission_id.endswith('.tsv'):
        submission_id = submission_id[:-len('.tsv')]

    submission_tsv_df = pd.read_csv(submission_tsv, sep='\t')

    for sample in submission_tsv_df['asText']:
        sample_split = sample.strip().replace('"','').split(' ')

        sex = sample_split[-1]
        sample_name = sample_split[3]
        workflow_id = sample_split[4]  #  d6795d02-f59e-48c0-89bf-51abc4d9160b

        # Reject bad labels before spending a gsutil round trip on them.
        if sex not in ('male', 'female'):
            raise ValueError(
                'Unlabeled sex {!r} for sample {!r}'.format(sex, sample_name))

        # get gsutil urls (argument list, so no re-splitting of the command)
        url_pattern = ("gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/submissions/"
                       + submission_id + "/haplotype_calling_chrom/"
                       + workflow_id + "/**/*vcf*")
        process = subprocess.run(["gsutil", "ls", url_pattern],
                                 stdout=subprocess.PIPE)
        outputs = process.stdout.decode("utf-8").split()

        # assign to proper files
        if sex == 'male':
            # Resolve all three URLs first so a missing file leaves every
            # map untouched for this sample.
            chrX_PAR_url = _latest_url(
                outputs, '.chrX_PAR.hc.vcf.gz', sample_name)
            chrX_non_PAR_url = _latest_url(
                outputs, '.chrX_non_PAR.hc.vcf.gz', sample_name)
            chrY_url = _latest_url(
                outputs, '.chrY.hc.vcf.gz', sample_name)

            chrX_PAR1_sample_map[sample_name] = chrX_PAR_url
            chrX_PAR2_sample_map[sample_name] = chrX_PAR_url
            chrX_non_PAR_sample_map[sample_name] = chrX_non_PAR_url
            chrY_sample_map[sample_name] = chrY_url
        else:
            # female: one whole-chrX VCF serves all three chrX maps
            chrX_url = _latest_url(outputs, '.chrX.hc.vcf.gz', sample_name)

            chrX_PAR1_sample_map[sample_name] = chrX_url
            chrX_PAR2_sample_map[sample_name] = chrX_url
            chrX_non_PAR_sample_map[sample_name] = chrX_url

    return chrX_PAR1_sample_map, chrX_PAR2_sample_map, chrX_non_PAR_sample_map, chrY_sample_map


In [4]:
# Local copies of the two submission TSVs, one per sex;
# each file is named <submission id>.tsv.
submission_tsv_male = (
    "/home/stephen/Downloads/259d8a3c-848c-424f-bdaa-863a2d26aa5f.tsv"
)
submission_tsv_female = (
    "/home/stephen/Downloads/744be59c-07aa-4052-8c78-7105ae5e98da.tsv"
)

In [5]:
# Reset all four maps to fresh, independent empty dicts before mapping.
(chrX_PAR1_sample_map,
 chrX_PAR2_sample_map,
 chrX_non_PAR_sample_map,
 chrY_sample_map) = ({}, {}, {}, {})

In [6]:
# Add the samples from the male submission TSV to all four maps.
(chrX_PAR1_sample_map,
 chrX_PAR2_sample_map,
 chrX_non_PAR_sample_map,
 chrY_sample_map) = map_samples(
    submission_tsv_male,
    chrX_PAR1_sample_map,
    chrX_PAR2_sample_map,
    chrX_non_PAR_sample_map,
    chrY_sample_map,
)


In [7]:
# Add the samples from the female submission TSV to the same maps.
(chrX_PAR1_sample_map,
 chrX_PAR2_sample_map,
 chrX_non_PAR_sample_map,
 chrY_sample_map) = map_samples(
    submission_tsv_female,
    chrX_PAR1_sample_map,
    chrX_PAR2_sample_map,
    chrX_non_PAR_sample_map,
    chrY_sample_map,
)


In [8]:
# Turn each sample map into a DataFrame indexed by the dict keys.
(chrX_PAR1_sample_map_df,
 chrX_PAR2_sample_map_df,
 chrX_non_PAR_sample_map_df,
 chrY_sample_map_df) = (
    pd.DataFrame.from_dict(sample_map, orient='index')
    for sample_map in (
        chrX_PAR1_sample_map,
        chrX_PAR2_sample_map,
        chrX_non_PAR_sample_map,
        chrY_sample_map,
    )
)

# To inspect a frame, evaluate its name as the last line of a cell:
#   chrX_PAR1_sample_map_df
#   chrX_PAR2_sample_map_df
#   chrX_non_PAR_sample_map_df
#   chrY_sample_map_df

In [21]:
# Write each frame as a header-less, tab-separated file under base_dir
# (the index is written as the first column).
base_dir = '/home/stephen/Documents/projects/schatz_lab/rotation/t2t-sgdp/data/sample_maps/'

chrX_PAR1_sample_map_df.to_csv(
    path_or_buf=base_dir + 'chrX_PAR1_sample_map.tsv',
    sep="\t",
    header=False,
)
chrX_PAR2_sample_map_df.to_csv(
    path_or_buf=base_dir + 'chrX_PAR2_sample_map.tsv',
    sep="\t",
    header=False,
)
chrX_non_PAR_sample_map_df.to_csv(
    path_or_buf=base_dir + 'chrX_non_PAR_sample_map.tsv',
    sep="\t",
    header=False,
)
chrY_sample_map_df.to_csv(
    path_or_buf=base_dir + 'chrY_sample_map.tsv',
    sep="\t",
    header=False,
)

In [None]:
%%bash

# Upload every file in the local sample_maps directory to the bucket.

# bucket
#https://console.cloud.google.com/storage/browser/fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70
# make new folder under 'uploads':  'sample_maps'

base_dir=/home/stephen/Documents/projects/schatz_lab/rotation/t2t-sgdp/data/sample_maps
# Quote the variable so the path is never word-split or globbed; the
# trailing /* stays outside the quotes so it still expands to the files.
gsutil cp "$base_dir"/* gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/uploads/sample_maps/

Resulting gsutil URLs:

    gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/uploads/sample_maps/chrX_PAR1_sample_map.tsv
    gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/uploads/sample_maps/chrX_PAR2_sample_map.tsv
    gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/uploads/sample_maps/chrX_non_PAR_sample_map.tsv
    gs://fc-1b543b1d-8887-4a24-a4fd-37f06e9d9d70/uploads/sample_maps/chrY_sample_map.tsv
