Checks which BAMs were written with the wrong sample metadata in their headers.
See [this](https://github.com/broadinstitute/long-read-pipelines/issues/286).

# load library

In [18]:
# auto reloading of local scripts under dev
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
# relying on these stdlib anyway
import http
import os
import re
import subprocess
import sys

import pandas as pd

In [20]:
# Google Cloud and FISS
from firecloud import api as fapi

from google.cloud import storage
storage_client = storage.Client()

In [21]:
# load local lib
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.table_utils import *

In [22]:
import datetime
import dateutil
from dateutil import parser

# Filters to apply (still under development, so they may change over time)

In [23]:
# "%D" is a platform-specific strftime extension (rejected on Windows);
# "%m/%d/%y" produces the same text on every platform.
print("Current Time =", datetime.datetime.now().strftime("%m/%d/%y %H:%M:%S"))

Current Time = 12/10/21 15:21:54


In [24]:
# last_check_date = ''

In [25]:
def filter_pacbio_flowcells(terra_table_row) -> bool:
    """
    Decide whether one row of the flowcell table should be kept.

    A row is kept only when all of the following hold:
      * ``created_at`` is a timezone-aware timestamp (tz-naive values, such as
        the ``pd.Timestamp.min`` placeholder stored for unparsable dates, and
        missing values are rejected),
      * ``created_at`` is on or after 2021-01-01 00:00 UTC,
      * ``aligned_bam`` is a string starting with ``gs://``.

    :param terra_table_row: mapping-like row (e.g. a ``pd.Series``) providing
                            the ``created_at`` and ``aligned_bam`` entries
    :return: true if the row should be kept
    """

    cutoff_date = pd.to_datetime(datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc))
    sequencing_date = terra_table_row['created_at']
    # No tzinfo means the value is tz-naive or not a timestamp at all
    # (NaN/None); neither can be compared against the tz-aware cutoff.
    if getattr(sequencing_date, 'tzinfo', None) is None:
        return False

    keep = bool(sequencing_date >= cutoff_date)

    # keep &= sequencing_date < last_check_date

    aligned_bam = terra_table_row['aligned_bam']
    # A missing cell is not a str, so .startswith() cannot be called on it.
    keep &= isinstance(aligned_bam, str) and aligned_bam.startswith('gs://')

    return keep

# load data

In [26]:
# Namespace, workspace and root entity type handed to fetch_existing_root_table.
primary_namespace = 'production-long-reads'
primary_workspace = 'broad-gp-pacbio'
root_data_type = 'sample'

flowcell_table = fetch_existing_root_table(
    ns=primary_namespace,
    ws=primary_workspace,
    etype=root_data_type,
)

In [27]:
# Each group names a target dtype ('type') and the table columns ('columns')
# that the cells below cast to it.
categorical_columns = dict(
    type='category',
    columns=['application', 'experiment_type', 'instrument', 'workspace'],
)

# 'timezone' is the zone parsed timestamps are converted to.
date_time_columns = dict(
    type='datetime64',
    timezone=datetime.timezone.utc,
    columns=['created_at'],
)

boolean_columns = dict(
    type='bool',
    columns=['is_ccs', 'is_corrected', 'is_isoseq'],
)

int_type_columns = dict(
    type='int64',
    columns=['insert_size'],
)

float_type_columns = dict(
    type='float64',
    columns=['lod_expected_sample'],
)

string_type_columns = dict(
    type='str',
    columns=['flowcell_id', 'bio_sample', 'description', 'well_sample', 'movie_name', 'well_name', 'sample'],
)

In [28]:
# Only these three spellings count as true; every other value becomes False.
true_spellings = ['TRUE', 'True', 'true']
for column_name in boolean_columns['columns']:
    is_true = flowcell_table[column_name].isin(true_spellings)
    flowcell_table[column_name] = is_true.astype(boolean_columns['type'])

In [29]:
# Cast every column of the categorical group to the dtype named in its 'type' entry.
categorical_dtype = categorical_columns['type']
for column_name in categorical_columns['columns']:
    flowcell_table[column_name] = flowcell_table[column_name].astype(categorical_dtype)

In [30]:
# Cast every column of the integer group to the dtype named in its 'type' entry.
int_dtype = int_type_columns['type']
for column_name in int_type_columns['columns']:
    flowcell_table[column_name] = flowcell_table[column_name].astype(int_dtype)

In [31]:
# Cast every column of the string group to the dtype named in its 'type' entry.
string_dtype = string_type_columns['type']
for column_name in string_type_columns['columns']:
    flowcell_table[column_name] = flowcell_table[column_name].astype(string_dtype)

In [32]:
# Cast every column of the float group to the dtype named in its 'type' entry.
float_dtype = float_type_columns['type']
for column_name in float_type_columns['columns']:
    flowcell_table[column_name] = flowcell_table[column_name].astype(float_dtype)

In [33]:
def convert_date_time(s):
    """
    Parse an ISO-8601 string into a ``pd.Timestamp`` in the configured timezone.

    The parsed value is converted to ``date_time_columns['timezone']``.

    :param s: ISO-8601 date/time string; any other value (e.g. the NaN/None of
              a missing cell) is treated as unparsable
    :return: the converted ``pd.Timestamp``, or the tz-naive placeholder
             ``pd.Timestamp.min`` when the value cannot be parsed or represented
    """
    try:
        # NOTE(review): astimezone() interprets a string without a UTC offset
        # as local time of the machine running the notebook - confirm intended.
        t = parser.isoparse(s).astimezone(tz=date_time_columns['timezone'])
        return pd.to_datetime(t)
    except (TypeError, ValueError, OverflowError, pd.errors.OutOfBoundsDatetime):
        # TypeError: the cell is not a string (missing value).
        # OverflowError: the timezone shift leaves the datetime range.
        return pd.Timestamp.min
# convert_date_time already returns a pd.Timestamp, so it can be applied directly.
for column_name in date_time_columns['columns']:
    flowcell_table[column_name] = flowcell_table[column_name].apply(convert_date_time)

In [34]:
# Evaluate the row filter once, then keep the passing rows under a fresh 0..n-1 index.
rows_to_keep = flowcell_table.apply(filter_pacbio_flowcells, axis=1)
usable_flowcell_table = flowcell_table.loc[rows_to_keep, :].reset_index(drop=True)
usable_flowcell_table.shape

(510, 70)

# Check for inconsistency

In [41]:
def _parse_read_group_line(rg_line: str) -> dict:
    """Extract the LB and SM fields of one @RG header line as 'library' / 'sample'."""
    meta = dict()
    # Drop the line terminator first; otherwise it stays attached to the last field.
    for field in rg_line.rstrip('\r\n').split('\t'):
        if field.startswith('LB:'):
            meta['library'] = field[len('LB:'):]
        elif field.startswith('SM:'):
            meta['sample'] = field[len('SM:'):]
    return meta


def get_lb_and_header(bam_gs_path: str) -> dict:
    """
    Read the library (LB) and sample (SM) of a BAM from its first @RG header line.

    Runs ``gcloud auth application-default print-access-token`` and passes the
    token as ``GCS_OAUTH_TOKEN`` to ``samtools view -H``. Both commands are run
    without a shell, so the path is never interpreted as shell syntax, and the
    header is read from the pipe instead of a temporary file.

    :param bam_gs_path: path of the BAM, passed verbatim to ``samtools view -H``
    :return: dict holding ``library`` and/or ``sample``; a key is present only
             when the matching field exists on the @RG line
    :raises subprocess.CalledProcessError: if gcloud or samtools exits non-zero
    :raises ValueError: if the header contains no @RG line
    """
    token = subprocess.run(
        ['gcloud', 'auth', 'application-default', 'print-access-token'],
        check=True, stdout=subprocess.PIPE, text=True,
    ).stdout.strip()

    header = subprocess.run(
        ['samtools', 'view', '-H', bam_gs_path],
        check=True, stdout=subprocess.PIPE, text=True,
        env=dict(os.environ, GCS_OAUTH_TOKEN=token),
    ).stdout

    # Only the first read-group line is inspected.
    for line in header.splitlines():
        if line.startswith('@RG'):
            return _parse_read_group_line(line)

    raise ValueError(f"no @RG line found in the header of {bam_gs_path}")


In [44]:
# Resolve the Desktop folder of whoever runs the notebook instead of one hard-coded home directory.
usable_flowcell_table[['flowcell_id','aligned_bam']].to_csv(os.path.expanduser("~/Desktop/bams.txt"), index=False, sep=',', header=False)

In [45]:
# NOTE(review): sample_metadata.tsv is not written by any cell shown here;
# presumably it is produced outside the notebook from bams.txt - TODO document.
# dtype=str keeps numeric-looking IDs as text so they compare correctly with
# the string-typed columns of the flowcell table.
metadata_from_bam_headers = pd.read_csv(os.path.expanduser("~/Desktop/sample_metadata.tsv"), sep='\t', dtype=str)

In [46]:
metadata_from_bam_headers.head()

Unnamed: 0,flowcell_id,LB,SM
0,DA103955,SM-K6JCH,1-06162
1,DA074276,SM-KTTFW,CDH141
2,DA074064,SM-V35Y,SM-V35Y
3,DA134109,SM-K6JD5,1-05888
4,DA134006,SM-K6JE6,1-05846


In [48]:
usable_flowcell_table[['flowcell_id', 'well_sample', 'bio_sample']].head()

Unnamed: 0,flowcell_id,well_sample,bio_sample
0,DA103955,SM-K6JCH,1-06162
1,DA074276,SM-KTTFW,CDH141
2,DA074064,SM-V35Y,SM-V35Y
3,DA134109,SM-K6JD5,1-05888
4,DA134006,SM-K6JE6,1-05846


In [49]:
len(metadata_from_bam_headers) == len(usable_flowcell_table)

True

In [53]:
bams_with_wrong_samples = list()

# Rows are paired by position, so both tables must have the same length.
if len(metadata_from_bam_headers) != len(usable_flowcell_table):
    raise ValueError(
        f"Row counts differ: {len(metadata_from_bam_headers)} BAM header rows "
        f"vs {len(usable_flowcell_table)} flowcell table rows")

# Columns are selected by name rather than by integer position; integer keys
# on a label-indexed Series are deprecated positional access in pandas.
bam_rows = metadata_from_bam_headers[['flowcell_id', 'LB', 'SM']].itertuples(index=False, name=None)
terra_rows = usable_flowcell_table[['flowcell_id', 'well_sample', 'bio_sample']].itertuples(index=False, name=None)

for i, (bam_row, terra_row) in enumerate(zip(bam_rows, terra_rows)):
    bam_flowcell_id, bam_library, bam_sample = bam_row
    terra_flowcell_id, terra_well_sample, terra_bio_sample = terra_row
    if bam_flowcell_id != terra_flowcell_id:
        raise ValueError(f"Flowcell ID don't match!\t{i}")
    # LB is compared with well_sample, SM with bio_sample.
    if bam_library != terra_well_sample or bam_sample != terra_bio_sample:
        bams_with_wrong_samples.append(bam_flowcell_id)

In [54]:
len(bams_with_wrong_samples)

6

In [60]:
# Build the mask from the frame being indexed, so the selection does not
# depend on the two tables sharing the same row index.
usable_flowcell_table.loc[usable_flowcell_table['flowcell_id'].isin(bams_with_wrong_samples),['flowcell_id', 'well_sample', 'bio_sample', 'workspace']]

Unnamed: 0,flowcell_id,well_sample,bio_sample,workspace
60,DA073869,SM-KTTFJ,Jan-77,Gabriel_GMKFLRP_Chung_PacBio_FY20
80,DA074036,SM-KTSY3,Feb-12,Gabriel_GMKFLRP_Chung_PacBio_FY20
116,DA106896,SM-KTT1N,Jul-07,Gabriel_GMKFLRP_Chung_PacBio_FY20
292,DA074315,SM-KTTFJ,Jan-77,Gabriel_GMKFLRP_Chung_PacBio_FY20
329,DA074300,SM-KTSZV,Jun-07,Gabriel_GMKFLRP_Chung_PacBio_FY20
467,DA089806,64271e_CLR_SAT_20210615_A01,64271e_CLR_SAT_20210615_A01,Long Reads Delivery - Garimella


In [61]:
# BAM-header rows (all columns) for the flowcells flagged above.
flagged_bam_rows = metadata_from_bam_headers['flowcell_id'].isin(bams_with_wrong_samples)
metadata_from_bam_headers.loc[flagged_bam_rows, :]

Unnamed: 0,flowcell_id,LB,SM
60,DA073869,SM-KTTFJ,01-1977
80,DA074036,SM-KTSY3,02-2012
116,DA106896,SM-KTT1N,07-2007
292,DA074315,SM-KTTFJ,01-1977
329,DA074300,SM-KTSZV,06-2007
467,DA089806,64271e_CLR_SAT_20210615_A01,UnnamedSample
