# Yale data processing

- Data from Hedberg et al., 2016 (doi: 10.1172/JCI82066)

- Annotation requires https://github.com/mskcc/vcf2maf.
    - Version used: v1.6.13.

In [1]:
# IMPORTS
from io import StringIO
import gzip

import pandas as pd

## Somatic mutations from supplementary data

- DNPs/DNVs merged in parse_hnc_contexts.ipynb in hnc_apobec/

In [2]:
# Supplementary workbook; sheet "Table S10" is read with a two-row header.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
supp_tables = '/Users/sgg/Dropbox/Townsend/head_neck_data/JCI82066.sdt1-10.xlsx'
df = pd.read_excel(supp_tables, sheet_name="Table S10", header=[0, 1])

# Keep only the second header row as the column labels.
df.columns = df.columns.droplevel(0)

# Name the row index 'patient_id', clear the leftover column-axis name,
# then turn the index into an ordinary column.
df.index.name = 'patient_id'
df.columns.name = None
df = df.reset_index()

In [3]:
# Preview the first rows of the table loaded from "Table S10".
df.head()

Unnamed: 0,patient_id,Group,Variant Status,Status,Gene,AA change,Fisher,Chr,Position,Base Change,...,Adj Nonref Read Ratio,LOH in Primary,Ref Cov,Nonref Cov,Nonref Read Ratio,Adj Nonref Read Ratio.1,LOH in SLNM/MR,Ref cov,Nonref cov,Nonref allele
0,PY-1,Metachronous Recurrence,Transmitted,Coding-missense,NPC1,R711I,1e-06,chr18,21123532,C>A,...,0.204082,LOH,33,13,0.282609,1.34576,na,72,0,-
1,PY-1,Metachronous Recurrence,Transmitted,Coding-silent,SSH1,P199P,3.7e-05,chr12,109201543,G>A,...,0.285714,LOH,31,14,0.311111,1.48148,LOH,45,0,-
2,PY-1,Metachronous Recurrence,Primary Only,Coding-missense,C11orf16,H437Y,0.000456,chr11,8942958,G>A,...,0.201207,LOH,63,0,0.0,0.0,LOH,109,0,-
3,PY-1,Metachronous Recurrence,Transmitted,Coding-missense,SLCO2B1,I211S,0.0004,chr11,74880401,T>G,...,0.248016,LOH,79,5,0.059524,0.283447,LOH,80,0,-
4,PY-1,Metachronous Recurrence,Transmitted,Coding-missense,ATP8B4,A618V,7.7e-05,chr15,50212513,G>A,...,0.200437,LOH,125,11,0.080882,0.385154,LOH,127,0,-


In [4]:
# Tally how many rows carry each 'Variant Status' label.
df['Variant Status'].value_counts()

Transmitted                          1593
Primary Only                          368
Metachronous Recurrence Only          235
Synchronous Nodal Metastasis Only     208
Name: Variant Status, dtype: int64

## Extend annotation df to have row for each sample (not just each mutation)

- for every patient:
    - for each row (mutation), duplicate for samples specified by 'Variant Status'
    - combine new mutations into single dataframe

In [5]:
# Sample suffixes that each 'Variant Status' label expands to.
# NOTE(review): presumably 'T' = primary tumor sample and 'R' = recurrence /
# nodal-metastasis sample — confirm against the paper's sample naming.
status_dict = {
    'Transmitted': ['T', 'R'],
    'Primary Only': ['T'],
    'Metachronous Recurrence Only': ['R'],
    'Synchronous Nodal Metastasis Only': ['R'],
}


def mut_series_from_status(s):
    """Expand one mutation row into one annotated copy per sample.

    `s` is a pd.Series with at least 'patient_id' and 'Variant Status'.
    For every suffix that `status_dict` lists for the row's status, a copy
    of `s` is returned with an added 'sample_id' entry equal to
    patient_id + suffix. The input Series is not modified.

    Raises KeyError when the status is not a key of `status_dict`.
    """
    def _with_sample_id(suffix):
        tagged = s.copy()
        tagged['sample_id'] = ''.join((s.patient_id, suffix))
        return tagged

    return [_with_sample_id(suffix) for suffix in status_dict[s['Variant Status']]]


In [6]:
# Expand every mutation row into one Series per sample, visiting patients in
# groupby order and rows in their order within each patient.
s_list = [
    sample_row
    for _, patient_rows in df.groupby('patient_id')
    for _, mutation in patient_rows.iterrows()
    for sample_row in mut_series_from_status(mutation)
]

# 'sample_id' leads, followed by the original columns in their existing order.
cols = ['sample_id'] + list(df.columns)

# Each Series becomes one column of the concatenated frame; transposing
# yields one row per (sample, mutation).
df = pd.concat(s_list, axis=1, ignore_index=True).transpose()[cols]

## PY-9T has no mutations

In [7]:
# Per patient, count sample IDs by their final character (the sample suffix);
# patients lacking a suffix get 0 for it.
suffix_counts = df.groupby('patient_id')['sample_id'].apply(
    lambda ids: ids.str[-1].value_counts())
suffix_counts.unstack().fillna(0).astype(int)

Unnamed: 0_level_0,R,T
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
PY-1,86,87
PY-10,4,3
PY-11,31,33
PY-12,85,84
PY-13,6,47
PY-14,68,70
PY-15,436,413
PY-16,199,150
PY-17,205,256
PY-19,7,86


## Merge DNVs

In [8]:
# Keep only the fields needed downstream and give them short working names.
# reset_index() carries the old row labels along as an 'index' column.
keep_cols = ['sample_id', 'patient_id', 'Chr', 'Position', 'Base Change', 'Gene']
short_names = {'Chr': 'chrom', 'Position': 'pos',
               'Base Change': 'change', 'Gene': 'symbol'}
df = df[keep_cols].copy().reset_index().rename(columns=short_names)

# 'change' is written as "<ref>><alt>"; split it into separate allele columns.
allele_pairs = df.change.apply(lambda v: v.split('>'))
df['ref'], df['alt'] = zip(*allele_pairs)
df = df.drop('change', axis=1)

# Relabel 'chr24' as 'chrX'.
# NOTE(review): confirm that 'chr24' denotes X in the source table.
df['chrom'] = df['chrom'].map(lambda name: 'chrX' if name == 'chr24' else name)

# REMOVE DUPLICATE MUTATIONS: one row per (sample, chrom, pos), keeping the
# first non-null value of every other column, in the original column order.
cols = df.columns
site_keys = ['sample_id', 'chrom', 'pos']
df = df.groupby(site_keys).first().reset_index()[cols]
df = df.drop('index', axis=1)

# IDENTIFY ADJACENT PAIRS: a row whose position is exactly one past another
# row of the same sample and chromosome is the second half of a DNV.
df['pos_next'] = df.pos + 1
df['categ'] = 'SNV'

pair_kwargs = dict(left_on=['sample_id', 'chrom', 'pos_next'],
                   right_on=['sample_id', 'chrom', 'pos'],
                   suffixes=['', '_right'])

# Indexed by the row label of the SECOND mutation in each pair (the right side).
dnv_drop = df.merge(df.reset_index(), **pair_kwargs).set_index('index')
drop_inds = dnv_drop.index.values

# Indexed by the row label of the FIRST mutation in each pair (the left side).
dnv = df.reset_index().merge(df.reset_index(), **pair_kwargs).set_index('index')
dnv_inds = dnv.index.values

# Drop the second member of every pair and label the first member as a DNV.
df = df.drop(drop_inds)
df.loc[dnv_inds, 'categ'] = 'DNV'

def drop_lower(v):
    """Return the upper-case character from the first two characters of `v`.

    Exactly one of v[0] / v[1] must be upper-case and the other lower-case;
    anything else raises Exception('Bad value: ...'). Characters beyond the
    second are ignored. Indexing errors for inputs shorter than two
    characters propagate unchanged.
    """
    first, second = v[0], v[1]
    if first.isupper() and second.islower():
        return first
    if second.isupper() and first.islower():
        return second
    raise Exception('Bad value: {}'.format(v))

# Rows still labelled 'SNV' whose ref allele is longer than one character were
# not merged into a DNV; reduce each of their alleles to its single upper-case
# base via drop_lower (which raises on alleles that are not mixed-case).
# The mask is built from df itself: the `dnv` merge result only holds rows that
# are labelled 'DNV' in df, so a mask derived from it can never select an SNV.
unmerged_multibase = (df.categ == 'SNV') & (df.ref.apply(len) > 1)
for i in df[unmerged_multibase].itertuples():
    ind = i.Index
    df.loc[ind, 'ref'] = drop_lower(i.ref)
    df.loc[ind, 'alt'] = drop_lower(i.alt)

# Normalize every remaining allele (including merged DNVs) to upper case.
df.ref = df.ref.str.upper()
df.alt = df.alt.str.upper()

# The helper column used for adjacency matching is no longer needed.
df = df.drop('pos_next', axis=1)

In [9]:
# Tally rows by mutation category ('SNV' vs 'DNV').
df.categ.value_counts()

SNV    3949
DNV      23
Name: categ, dtype: int64

## Modify mutations table for use with maf2maf

In [10]:
# Force every gene symbol to a plain string before export.
df['symbol'] = df['symbol'].astype(str)

In [11]:
# Rename the working columns to MAF field names for maf2maf.
rename_dict = {'chrom': 'Chromosome',
'pos': 'Start_Position',
'ref': 'Reference_Allele',
'alt': 'Tumor_Seq_Allele2',
'sample_id': 'Tumor_Sample_Barcode'}
df = df.rename(columns=rename_dict)

# Drop the first three characters (the 'chr' prefix) from each chromosome label.
df['Chromosome'] = df['Chromosome'].str[3:]

# An ordered categorical makes sort_values order chromosomes 1..22, X, Y
# instead of lexically; labels outside this list become NaN.
# Built with range() and CategoricalDtype: `pd.np` is not available in current
# pandas, and `astype` expects a dtype rather than a Categorical instance.
chroms = [str(i) for i in range(1, 23)] + ['X', 'Y']
chrom_dtype = pd.api.types.CategoricalDtype(categories=chroms, ordered=True)
df['Chromosome'] = df['Chromosome'].astype(chrom_dtype)

# Sort by genomic position, then sample, and renumber rows from 0.
df = df.sort_values(['Chromosome', 'Start_Position', 'Tumor_Sample_Barcode']).reset_index(drop=True)

# Has no effect unless a 'Transcript_ID' column is present.
df = df.rename(columns={'Transcript_ID': 'Refseq_NM'})

# All categories are SNV or DNV, so set end position accordingly:
# SNV ends where it starts, DNV ends one base later.
df['End_Position'] = df.Start_Position.where(df.categ == 'SNV', df.Start_Position + 1)

In [12]:
# Preview the table in its MAF-style column layout.
df.head()

Unnamed: 0,Tumor_Sample_Barcode,patient_id,Chromosome,Start_Position,symbol,Reference_Allele,Tumor_Seq_Allele2,categ,End_Position
0,PY-1R,PY-1,1,1325617,CCNL2,G,A,SNV,1325617
1,PY-1T,PY-1,1,1325617,CCNL2,G,A,SNV,1325617
2,PY-8R,PY-8,1,1887253,KIAA1751,C,T,SNV,1887253
3,PY-6R,PY-6,1,2103775,PRKCZ,C,T,SNV,2103775
4,PY-17T,PY-17,1,6100684,KCNAB2,G,A,SNV,6100684


## Export and reannotate
```python
df.to_csv('yale_for_anno.maf', sep='\t', index=False)
```

```bash
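# Run maf2maf.pl (part of vcf2maf), then gzip the annotated output.
# The import cell in the next section reads output_data/yale_annotated.maf.gz,
# so place the gzipped file in output_data/ before continuing.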
maf2maf.pl --input-maf yale_for_anno.maf --output-maf yale_annotated.maf
gzip yale_annotated.maf
```

## Import annotated and filter

In [13]:
# Read the annotated MAF. The context manager closes the gzip handle once
# parsing is done; text from '#' to the end of a line is treated as a comment.
with gzip.open('output_data/yale_annotated.maf.gz') as maf_handle:
    df = pd.read_table(maf_handle, comment='#')

In [14]:
# Remove recurrent tumors: keep only barcodes that end in 'T'.
ends_with_t = df['Tumor_Sample_Barcode'].str.endswith('T')
df = df.loc[ends_with_t].reset_index(drop=True)

In [15]:
# Tally the values of the FILTER column.
df.FILTER.value_counts()

.                 1891
common_variant      56
Name: FILTER, dtype: int64

In [16]:
# Strip common variants: drop rows whose FILTER is exactly 'common_variant'.
not_common = df['FILTER'] != 'common_variant'
df = df.loc[not_common].reset_index(drop=True)

In [17]:
# Tally mutations by Variant_Classification.
df.Variant_Classification.value_counts()

Missense_Mutation         1313
Silent                     428
Nonsense_Mutation           72
Splice_Site                 29
Splice_Region               15
Intron                      14
RNA                          8
3'Flank                      4
Translation_Start_Site       2
5'Flank                      2
Nonstop_Mutation             2
5'UTR                        1
3'UTR                        1
Name: Variant_Classification, dtype: int64

## Filter intronic and flanking mutations

In [18]:
# Drop intronic and flanking classes; every other classification is kept.
excluded_classes = ['Intron', "3'Flank", "5'Flank"]
keep_mask = ~df['Variant_Classification'].isin(excluded_classes)
df = df.loc[keep_mask].reset_index(drop=True)

In [19]:
# Re-tally Variant_Classification to confirm which classes remain.
df.Variant_Classification.value_counts()

Missense_Mutation         1313
Silent                     428
Nonsense_Mutation           72
Splice_Site                 29
Splice_Region               15
RNA                          8
Translation_Start_Site       2
Nonstop_Mutation             2
5'UTR                        1
3'UTR                        1
Name: Variant_Classification, dtype: int64

In [20]:
# Preview the first rows of the filtered, annotated table.
df.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,...,ExAC_AC_AN_Adj,ExAC_AC_AN,ExAC_AC_AN_AFR,ExAC_AC_AN_AMR,ExAC_AC_AN_EAS,ExAC_AC_AN_FIN,ExAC_AC_AN_NFE,ExAC_AC_AN_OTH,ExAC_AC_AN_SAS,ExAC_FILTER
0,DOCK7,85440,,GRCh37,1,63119695,63119695,+,Missense_Mutation,SNP,...,,,,,,,,,,
1,BCL10,8915,,GRCh37,1,85736573,85736573,+,Missense_Mutation,SNP,...,,,,,,,,,,
2,NR5A2,2494,,GRCh37,1,200012919,200012919,+,Missense_Mutation,SNP,...,,,,,,,,,,
3,EFCAB14,9813,,GRCh37,1,47182102,47182102,+,Missense_Mutation,SNP,...,,,,,,,,,,PASS
4,ITGB3BP,23421,,GRCh37,1,63912521,63912521,+,Missense_Mutation,SNP,...,2/106020,2/106200,0/9054,0/11192,0/7850,1/6596,1/54242,0/692,0/16394,PASS


In [21]:
# Write the filtered table as a tab-separated file, without the row index.
df.to_csv('output_data/yale_filtered.maf', sep='\t', index=False)

# Reference aside: clinical data

Site abbreviations used in the tables below:

- OC: oral cavity
- P: pharynx
- L: larynx
- LN: lymph node (cervical)

In [22]:
# Clinical data. Text extracted from pdf.

metachronous = """Patient ID	Sex	Age at Diagnosis (yr)	Location of Primary Site	Path. stage	AJCC stage	Adjuvant treatment	Time to recurrence (mo)	Metastasis site	patient_type
PY-1	M	78	OC	T3 N1 M0	III	None	2.3	P	metachronous_recurrent_tumor
PY-3	F	67	OC	T2 N2B M0	IVA	RT	6.3	OC	metachronous_recurrent_tumor
PY-4	M	44	?	T4A N2B M0	IVA	RT, Cisplatin, Vectibix	6.8	P	metachronous_recurrent_tumor
PY-5	M	52	OC	T3 N0 M0	III	None	4.3	L	metachronous_recurrent_tumor
PY-6	M	52	P	TX NX MX	NA	IMRT, cetuximab, pemetrexed	10.5	LN	metachronous_recurrent_tumor
PY-7	M	49	L	T4A N1 M0	IVA	None	2.9	LN	metachronous_recurrent_tumor
PY-8	M	76	OC	TX NX MX	NA	RT	33.9	P	metachronous_recurrent_tumor
PY-9	M	60	OC	T2 N0 MX	II	None	4.7	LN	metachronous_recurrent_tumor
PY-10	M	58	OC	TX NX MX	NA	None	2.2	OC	metachronous_recurrent_tumor
PY-11	M	74	OC	T3 N2C M0	IVA	RT, Cisplatin	9.7	OC	metachronous_recurrent_tumor"""

synchronous = """Patient ID	Sex	Age at Diagnosis (yr)	Location of Primary Site	Path. stage	AJCC stage	patient_type
PY-12	F	46	OC	T3 N2C M0	IVA	synchronous_nodal_metastasis
PY-13	M	53	OC	T3 N2B M0	IVA	synchronous_nodal_metastasis
PY-14	M	71	L	T3 N1 MX	III	synchronous_nodal_metastasis
PY-15	M	63	L	T3 N2C M0	IVA	synchronous_nodal_metastasis
PY-16	M	70	P	T1 N2B MX	IVA	synchronous_nodal_metastasis
PY-17	F	68	L	T4A N2A MX	IVA	synchronous_nodal_metastasis
PY-19	M	73	P	T2 N1 MX	III	synchronous_nodal_metastasis
PY-20	F	79	OC	T1 N1 MX	III	synchronous_nodal_metastasis
PY-21	M	49	L	T4A N2C MO	IVA	synchronous_nodal_metastasis
PY-22	M	49	OC	T4A N2C M0	IVA	synchronous_nodal_metastasis
PY-23	M	56	P	T4A N2B M0	IVA	synchronous_nodal_metastasis
PY-24	M	73	OC	T4A N2C M0	IVA	synchronous_nodal_metastasis
PY-25	M	64	OC	T1 N2 M0	IVA	synchronous_nodal_metastasis"""

# Parse the two embedded tab-separated tables and stack them; columns present
# in only one table are left missing for the other table's rows.
s_met, s_syn = (pd.read_table(StringIO(text)) for text in (metachronous, synchronous))
clinical = pd.concat([s_met, s_syn], ignore_index=True)

# Abbreviation -> full anatomical site name.
site_dict = {'OC': 'oral cavity',
'P': 'pharynx',
'L': 'larynx',
'LN': 'lymph node (cervical)'}


def _expand_metastasis_site(code):
    """Translate a site abbreviation; non-string values pass through unchanged."""
    return site_dict[code] if type(code) is str else code


def _expand_primary_site(code):
    """Translate a site abbreviation; the unknown-site marker '?' is kept as is."""
    return code if code == '?' else site_dict[code]


clinical['Metastasis site'] = clinical['Metastasis site'].apply(_expand_metastasis_site)
clinical['Location of Primary Site'] = clinical['Location of Primary Site'].apply(_expand_primary_site)

# Show the first rows using the column set of the metachronous table.
clinical[s_met.columns].head()

Unnamed: 0,Patient ID,Sex,Age at Diagnosis (yr),Location of Primary Site,Path. stage,AJCC stage,Adjuvant treatment,Time to recurrence (mo),Metastasis site,patient_type
0,PY-1,M,78,oral cavity,T3 N1 M0,III,,2.3,pharynx,metachronous_recurrent_tumor
1,PY-3,F,67,oral cavity,T2 N2B M0,IVA,RT,6.3,oral cavity,metachronous_recurrent_tumor
2,PY-4,M,44,?,T4A N2B M0,IVA,"RT, Cisplatin, Vectibix",6.8,pharynx,metachronous_recurrent_tumor
3,PY-5,M,52,oral cavity,T3 N0 M0,III,,4.3,larynx,metachronous_recurrent_tumor
4,PY-6,M,52,pharynx,TX NX MX,,"IMRT, cetuximab, pemetrexed",10.5,lymph node (cervical),metachronous_recurrent_tumor
