In [3]:
import pandas as pd
import numpy as np
import os

In [None]:
# Directory that holds the input tables and receives the exported CSV files.
# Set the TNGS_PATH_DATA environment variable to run the notebook on another
# machine; the original author's location remains the default.
# os.path.join(..., "") guarantees a trailing separator, so both
# os.path.join(PATH_DATA, name) and PATH_DATA + name resolve to the same file.
PATH_DATA = os.path.join(
    os.environ.get(
        "TNGS_PATH_DATA",
        "/Users/taewoojung/Documents/PhD/Papers/GenomeMedicine/Data/",
    ),
    "",
)

In [None]:
def process_tNGS(targetedDNA, patID):
    """Normalise one patient's variant table into the shared tNGS column layout.

    Derives the short sample label, the full sample ID and the ``ref``/``alt``
    alleles from the ``sampleID_y`` and ``VC`` columns, tags every row with the
    patient, and returns only the standard columns.

    Parameters
    ----------
    targetedDNA : pandas.DataFrame
        Calls for a single patient. Must contain ``sampleID_y``, ``VC`` and every
        entry of ``columns`` below that is not derived here. The frame passed in
        is left unmodified.
    patID : str
        Patient identifier of the form ``"ID<n>"`` (e.g. ``"ID4"``).

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to ``columns``, in that order.
    """
    columns = ["Sample ID", "Patient ID", "patient_id", "sample_id", "chrom", "pos", "genes", "ref", "alt", "N_DP", "N_VAF", "N_ALT", "T_DP", "T_VAF", "T_ALT",
           "somatic_status", "somatic_p_value", "type"]

    # Work on a copy so the caller's DataFrame does not silently gain columns.
    processed = targetedDNA.copy()
    processed["Patient ID"] = "HR-{}".format(patID)

    # Raw sample IDs contain "HR<n>"; whatever follows that prefix is the label.
    sample_prefix = "HR" + patID.split("ID")[1]

    sample_ids = []
    sample_ids_full = []
    refs = []
    alts = []

    for raw_sample_id, variant_code in zip(processed["sampleID_y"], processed["VC"]):
        sample_id = raw_sample_id.split(sample_prefix)[1]
        # IDs without "MLN" carry only a suffix, so they get the "RP" prefix.
        if "MLN" not in raw_sample_id:
            sample_id = "RP" + sample_id

        sample_ids.append(sample_id)
        sample_ids_full.append("HR_{}_{}".format(patID, sample_id))

        # The last two "_"-separated fields of VC are the reference allele and
        # the variant allele; a plain substitution uses them as they are.
        fields = variant_code.split("_")
        ref = fields[-2]
        alt = fields[-1]

        if "+" in alt:
            # Insertion ("+SEQ"): alt is the reference base(s) followed by SEQ.
            alt = ref + alt[1:]
        elif "-" in alt:
            # Deletion ("-SEQ"): ref is the reference base(s) followed by SEQ,
            # and alt is the original reference base(s).
            ref, alt = ref + alt[1:], ref

        refs.append(ref)
        alts.append(alt)

    processed["sample_id"] = sample_ids
    processed["Sample ID"] = sample_ids_full
    processed["ref"] = refs
    processed["alt"] = alts

    return processed[columns]

In [182]:
# Locally advanced cohort
LN_pat = ["ID4", "ID5", "ID6", "ID9", "ID10"]
columns = ["patient_id", "sample_id", "chrom", "pos", "ref", "alt", "N_DP", "N_VAF", "N_ALT", "T_DP", "T_VAF", "T_ALT",
           "somatic_status", "somatic_p_value", "type", "genes"]

# Read and normalise every patient's VarScan2 result table the same way, then
# stack them with a single concat instead of growing the frame inside the loop.
LN_frames = []
for patID in LN_pat:
    raw_calls = pd.read_csv(os.path.join(PATH_DATA, "tNGS_LN_VarScan2_results/{}_res.csv".format(patID)))
    raw_calls["patient_id"] = patID
    LN_frames.append(process_tNGS(raw_calls, patID))

# ignore_index=True gives the combined table a fresh 0..n-1 row index.
targetedDNA = pd.concat(LN_frames, ignore_index=True)

In [183]:
# For every patient, count the positions shared by each RP sample and each MLN
# sample. The printed matrix has one row per RP sample and one column per MLN
# sample (the header line lists the MLN samples first).
for patID in LN_pat:
    tmp = targetedDNA[targetedDNA["patient_id"] == patID]

    sample_ids = sorted(set(tmp["sample_id"].values))
    MLN_samples = [label for label in sample_ids if "MLN" in label]
    RP_samples = [label for label in sample_ids if "RP" in label]

    # Set of called positions per sample, built once and reused for every pair.
    # NOTE(review): samples are matched on "pos" alone, not on chromosome or alleles.
    positions_by_sample = {
        label: set(tmp[tmp["sample_id"] == label]["pos"].values)
        for label in RP_samples + MLN_samples
    }

    overlapping_muts = [
        [len(positions_by_sample[rp] & positions_by_sample[mln]) for mln in MLN_samples]
        for rp in RP_samples
    ]

    print("{}:\n\t{}\n\t{}".format(patID, '{} x {}'.format(MLN_samples, RP_samples), overlapping_muts))

ID4:
	['MLN1', 'MLN2', 'MLN3'] x ['RP1', 'RP2', 'RP3', 'RP4', 'RP5']
	[[6, 1, 2], [4, 4, 3], [7, 2, 2], [3, 3, 2], [3, 1, 1]]
ID5:
	['MLN1', 'MLN2', 'MLN3'] x ['RP1', 'RP2', 'RP3', 'RP4']
	[[15, 12, 17], [14, 15, 15], [6, 4, 4], [8, 4, 9]]
ID6:
	['MLN1', 'MLN2', 'MLN3'] x ['RP1', 'RP2', 'RP3', 'RP4']
	[[1, 0, 2], [5, 1, 3], [3, 2, 3], [3, 1, 3]]
ID9:
	['MLN1', 'MLN2'] x ['RP1', 'RP3', 'RP4', 'RP5']
	[[3, 1], [1, 0], [1, 1], [3, 0]]
ID10:
	['MLN1', 'MLN2'] x ['RP1', 'RP2', 'RP3', 'RP4', 'RP5', 'RP6']
	[[1, 1], [1, 1], [2, 1], [5, 1], [2, 1], [3, 2]]


In [189]:
# Map each position to the gene label of the first row that reports it.
# NOTE(review): keyed by position only, so identical coordinates on different
# chromosomes would share a single entry.
pos_to_genes = {}

for pos, genes in zip(targetedDNA["pos"], targetedDNA["genes"]):
    # setdefault keeps the first gene seen for a position and ignores later rows.
    pos_to_genes.setdefault(pos, genes)

In [190]:
# For every patient, list the gene labels of the positions shared by each RP
# sample and each MLN sample. The printed nested list has one entry per RP
# sample, each holding one gene list per MLN sample.
for patID in LN_pat:
    tmp = targetedDNA[targetedDNA["patient_id"] == patID]

    sample_ids = sorted(set(tmp["sample_id"].values))
    MLN_samples = [label for label in sample_ids if "MLN" in label]
    RP_samples = [label for label in sample_ids if "RP" in label]

    # Set of called positions per sample, built once and reused for every pair.
    positions_by_sample = {
        label: set(tmp[tmp["sample_id"] == label]["pos"].values)
        for label in RP_samples + MLN_samples
    }

    overlapping_muts = [
        [
            # Translate shared positions to genes; positions without a mapping are skipped.
            [
                pos_to_genes[pos]
                for pos in positions_by_sample[rp] & positions_by_sample[mln]
                if pos in pos_to_genes
            ]
            for mln in MLN_samples
        ]
        for rp in RP_samples
    ]

    print("{}:\n\t{}\n\t{}".format(patID, '{} x {}'.format(MLN_samples, RP_samples), overlapping_muts))

ID4:
	['MLN1', 'MLN2', 'MLN3'] x ['RP1', 'RP2', 'RP3', 'RP4', 'RP5']
	[[['PTEN', nan, 'ERG', 'BRCA2', 'GPC6', 'AC100802.3'], ['GPC6'], ['MTMR8', 'GPC6']], [['GPC6', 'GPC6', 'AC100802.3', 'BRCA2'], ['RP11-705C15.5', 'GPC6', 'CDK12', nan], ['GPC6', 'SUSD1', nan]], [['BRCA2', 'PTEN', 'ERG', 'CDK12', 'RB1', 'GPC6', 'BRCA2'], ['GPC6', 'BRCA1'], ['GPC6', 'SUSD1']], [['GPC6', 'PTEN', 'AR'], [nan, 'GPC6', 'RP11-705C15.5'], ['GPC6', 'SUSD1']], [['GPC6', 'BRCA2', 'BRCA2'], ['GPC6'], ['GPC6']]]
ID5:
	['MLN1', 'MLN2', 'MLN3'] x ['RP1', 'RP2', 'RP3', 'RP4']
	[[['ATM', 'BRCA2', 'ATM', 'ERG', 'KMT2C', 'GOLPH3L', nan, 'ATM', 'FOXA1', 'ERG', 'SETD2', 'AR', 'BRAF', nan, 'RB1'], ['ATM', 'CDK12', 'ATM', 'ERG', 'KMT2C', 'GOLPH3L', 'FOXA1', 'AR', 'RB1', 'ABCC5', nan, 'RB1'], ['ATM', 'ATM', 'ATM', 'FOXA1', 'KMT2C', 'GOLPH3L', nan, 'FOXA1', 'MSH2', 'ATP10A', 'AR', 'ERG', 'RB1', nan, 'BRAF', 'RB1', 'BRCA1']], [['BRCA2', 'ATM', 'RB1', 'ERG', 'KMT2C', 'GOLPH3L', 'FOXA1', 'SETD2', 'AR', 'RB1', 'BRAF', nan, 'RB1',

In [None]:
# Save the processed data
# targetedDNA.loc[:,~targetedDNA.columns.isin(["patient_id", "sample_id"])].to_csv(os.path.join(PATH_DATA, "tNGS_locally_advanced_variant_calling.csv"), index=False)
# pd.Series(list(set(targetedDNA["genes"]))).to_csv(PATH_DATA + "tNGS_gene_panel_locally_advanced.csv")

In [None]:
# De novo metastatic cohort
# Build the path with os.path.join so it does not depend on PATH_DATA ending in a separator.
targetedDNA = pd.read_csv(os.path.join(PATH_DATA, "M1RP_metastasis_all.tsv"), sep="\t")

# Index every variant by "GENE_CHROM_POSITION_REF/ALT"; this key is split back
# into its parts when the filtered tables are merged further down.
targetedDNA.index = (
    targetedDNA["GENE"]
    + "_" + targetedDNA["CHROM"]
    + "_" + targetedDNA["POSITION"].astype(str)
    + "_" + targetedDNA["REF"]
    + "/" + targetedDNA["ALT"]
)

# Keep only columns whose name contains none of "cfDNA", "WBC" or "PB".
cols = [x for x in targetedDNA if "cfDNA" not in x and "WBC" not in x and "PB" not in x]
targetedDNA = targetedDNA[cols]

In [None]:
# pd.Series(list(set(targetedDNA["GENE"]))).to_csv(PATH_DATA + "tNGS_gene_panel_de_novo.csv")

In [136]:
# Patients for which we have multiple RP samples and a matching MLN sample
DN_pat = ["ID{}".format(number) for number in (3, 4, 8, 23, 26, 33)]

In [137]:
# Quality thresholds applied to every call; each is a strict lower bound.
MIN_MQ = 20
MIN_VAF = 10
MIN_DP = 50


def _parse_patient_calls(patDNA):
    """Turn one patient's wide per-sample table into a long table of parsed calls.

    Each column of ``patDNA`` is one sample and each cell is a call string such
    as ``"18.9% (665) [mq:42,bq:0,sd:31] *"``. The result has one row per
    (variant, sample) pair with the columns ``info``, ``sampleID``, ``VAF``
    (number before ``%``), ``DP`` (number in parentheses), ``MQ`` (number after
    ``mq:``), ``sign`` (whether the string contains ``*``) and ``ALT``
    (``VAF / 100 * DP`` rounded to a whole number). Rows with a missing value in
    any column and rows whose index label is missing are dropped.
    """
    # Stack the sample columns once instead of concatenating inside a loop.
    per_sample = [
        patDNA[sample_col].to_frame(name="info").assign(sampleID=sample_col)
        for sample_col in patDNA.columns
    ]
    patinfo = pd.concat(per_sample, axis=0)

    info = patinfo["info"]
    patinfo["VAF"] = info.str.split("%", expand=True)[0].astype(float)
    # DP and MQ stay as text until incomplete rows are removed; casting them to
    # int first would raise as soon as one call string is missing.
    patinfo["DP"] = info.str.split('(', expand=True)[1].str.split(")", expand=True)[0]
    patinfo["MQ"] = info.str.split('mq:', expand=True)[1].str.split(",", expand=True)[0]
    # na=False keeps the column boolean; rows without a call are dropped below anyway.
    patinfo["sign"] = info.str.contains("*", regex=False, na=False)

    patinfo = patinfo.dropna()
    patinfo["DP"] = patinfo["DP"].astype(int)
    patinfo["MQ"] = patinfo["MQ"].astype(int)
    patinfo["ALT"] = (patinfo["VAF"]/100*patinfo["DP"]).round(0)
    return patinfo[~patinfo.index.isna()]


patinfo_filt_by_pat = {}
for pat in DN_pat:
    # Sample columns of this patient are the ones whose name contains "<pat>_".
    patDNA = targetedDNA[[x for x in targetedDNA if pat+"_" in x]]
    patinfo = _parse_patient_calls(patDNA)
    # Stricter alternative that was considered: additionally require ALT > 5.
    patinfo_filt_by_pat[pat] = patinfo[
        (patinfo["MQ"] > MIN_MQ) & (patinfo["VAF"] > MIN_VAF) & (patinfo["DP"] > MIN_DP)
    ]

# The cells below refer to the filtered tables by these individual names.
patinfo_filt_ID3 = patinfo_filt_by_pat["ID3"]
patinfo_filt_ID4 = patinfo_filt_by_pat["ID4"]
patinfo_filt_ID8 = patinfo_filt_by_pat["ID8"]
patinfo_filt_ID23 = patinfo_filt_by_pat["ID23"]
patinfo_filt_ID26 = patinfo_filt_by_pat["ID26"]
patinfo_filt_ID33 = patinfo_filt_by_pat["ID33"]

In [138]:
# Print every variant of patient ID3 with the samples that carry it,
# starting with the variants reported in the most samples.
variants_by_frequency = patinfo_filt_ID3.index.value_counts().index
for variant_key in variants_by_frequency:
    print(variant_key)
    print(patinfo_filt_ID3.loc[variant_key])

TP53_chr17_7675075_A/G
                                                    info       sampleID   VAF  \
TP53_chr17_7675075_A/G  18.9% (665) [mq:42,bq:0,sd:31] *  M1RP_ID3_MLN1  18.9   
TP53_chr17_7675075_A/G  78.6% (425) [mq:42,bq:0,sd:36] *  M1RP_ID3_MLN2  78.6   
TP53_chr17_7675075_A/G  79.8% (501) [mq:42,bq:0,sd:34] *  M1RP_ID3_MLN5  79.8   

                         DP  MQ  sign    ALT  
TP53_chr17_7675075_A/G  665  42  True  126.0  
TP53_chr17_7675075_A/G  425  42  True  334.0  
TP53_chr17_7675075_A/G  501  42  True  400.0  
KDM6A_chrX_45053940_G/A
info        54.7% (329) [mq:37,bq:0,sd:36] *
sampleID                        M1RP_ID3_RP4
VAF                                     54.7
DP                                       329
MQ                                        37
sign                                    True
ALT                                    180.0
Name: KDM6A_chrX_45053940_G/A, dtype: object


In [14]:
# Restrict patient ID4 to the samples of interest, then print every variant
# with the samples that carry it, most widely shared variants first.
ID4_samples = ['M1RP_ID4_MLN2', 'M1RP_ID4_RP2', 'M1RP_ID4_RP9']
patinfo_filt_ID4 = patinfo_filt_ID4.loc[patinfo_filt_ID4["sampleID"].isin(ID4_samples)]

variants_by_frequency = patinfo_filt_ID4.index.value_counts().index
for variant_key in variants_by_frequency:
    print(variant_key)
    print(patinfo_filt_ID4.loc[variant_key])

FANCE_chr6_35466866_CAA/C
                                                       info       sampleID  \
FANCE_chr6_35466866_CAA/C  49.1% (116) [mq:34,bq:0,sd:32] *  M1RP_ID4_MLN2   
FANCE_chr6_35466866_CAA/C  36.5% (386) [mq:37,bq:0,sd:34] *   M1RP_ID4_RP2   
FANCE_chr6_35466866_CAA/C  44.9% (376) [mq:34,bq:0,sd:31] *   M1RP_ID4_RP9   

                            VAF   DP  MQ  sign    ALT  
FANCE_chr6_35466866_CAA/C  49.1  116  34  True   57.0  
FANCE_chr6_35466866_CAA/C  36.5  386  37  True  141.0  
FANCE_chr6_35466866_CAA/C  44.9  376  34  True  169.0  
KMT2D_chr12_49049781_GTCAT/G
                                                           info  \
KMT2D_chr12_49049781_GTCAT/G   44.5% (694) [mq:39,bq:0,sd:32] *   
KMT2D_chr12_49049781_GTCAT/G   33.5% (713) [mq:39,bq:0,sd:33] *   
KMT2D_chr12_49049781_GTCAT/G  35.9% (1473) [mq:37,bq:0,sd:31] *   

                                   sampleID   VAF    DP  MQ  sign    ALT  
KMT2D_chr12_49049781_GTCAT/G  M1RP_ID4_MLN2  44.5   694  39  Tru

In [15]:
# Number of variants each RP sample of patient ID4 shares with MLN2,
# printed on one line in the order RP2, RP9.
ID4_MLN_variants = set(patinfo_filt_ID4.index[patinfo_filt_ID4["sampleID"] == "M1RP_ID4_MLN2"])
ID4_shared_counts = [
    len(set(patinfo_filt_ID4.index[patinfo_filt_ID4["sampleID"] == rp_sample]) & ID4_MLN_variants)
    for rp_sample in ("M1RP_ID4_RP2", "M1RP_ID4_RP9")
]
print(*ID4_shared_counts)

4 4


In [16]:
# Restrict patient ID8 to the samples of interest, then print every variant
# with the samples that carry it, most widely shared variants first.
ID8_samples = [
    'M1RP_ID8_MLN2', 'M1RP_ID8_MLN3', 'M1RP_ID8_MLN4', 'M1RP_ID8_MLN8', 'M1RP_ID8_MLN9',
    'M1RP_ID8_RP2', 'M1RP_ID8_RP1', 'M1RP_ID8_RP4', 'M1RP_ID8_RP7',
]
patinfo_filt_ID8 = patinfo_filt_ID8.loc[patinfo_filt_ID8['sampleID'].isin(ID8_samples)]

variants_by_frequency = patinfo_filt_ID8.index.value_counts().index
for variant_key in variants_by_frequency:
    print(variant_key)
    print(patinfo_filt_ID8.loc[variant_key])

ATM_chr11_108227318_T/G
                                                     info       sampleID  \
ATM_chr11_108227318_T/G  42.0% (317) [mq:39,bq:0,sd:35] *  M1RP_ID8_MLN2   
ATM_chr11_108227318_T/G  36.9% (331) [mq:41,bq:0,sd:31] *  M1RP_ID8_MLN3   
ATM_chr11_108227318_T/G    40.9% (416) [mq:41,bq:0,sd:29]  M1RP_ID8_MLN4   
ATM_chr11_108227318_T/G    24.3% (493) [mq:38,bq:0,sd:29]  M1RP_ID8_MLN8   
ATM_chr11_108227318_T/G  46.4% (420) [mq:40,bq:0,sd:33] *  M1RP_ID8_MLN9   
ATM_chr11_108227318_T/G  33.0% (333) [mq:38,bq:0,sd:31] *   M1RP_ID8_RP1   
ATM_chr11_108227318_T/G    20.0% (360) [mq:38,bq:0,sd:26]   M1RP_ID8_RP2   
ATM_chr11_108227318_T/G  21.5% (427) [mq:39,bq:0,sd:32] *   M1RP_ID8_RP4   
ATM_chr11_108227318_T/G  39.7% (287) [mq:36,bq:0,sd:31] *   M1RP_ID8_RP7   

                          VAF   DP  MQ   sign    ALT  
ATM_chr11_108227318_T/G  42.0  317  39   True  133.0  
ATM_chr11_108227318_T/G  36.9  331  41   True  122.0  
ATM_chr11_108227318_T/G  40.9  416  41  False  170

In [17]:
# Number of variants shared between every RP sample and every MLN sample of
# patient ID8. One output line per MLN sample (MLN2, MLN3, MLN4, MLN8, MLN9),
# with the counts ordered RP1, RP2, RP4, RP7 - the same layout as before,
# produced by a loop instead of twenty copy-pasted expressions.
ID8_RP_samples = ["M1RP_ID8_RP1", "M1RP_ID8_RP2", "M1RP_ID8_RP4", "M1RP_ID8_RP7"]
ID8_MLN_samples = ["M1RP_ID8_MLN2", "M1RP_ID8_MLN3", "M1RP_ID8_MLN4", "M1RP_ID8_MLN8", "M1RP_ID8_MLN9"]

# Variant keys per sample, computed once and reused for every pair.
ID8_variants_by_sample = {
    sample: set(patinfo_filt_ID8.index[patinfo_filt_ID8["sampleID"] == sample])
    for sample in ID8_RP_samples + ID8_MLN_samples
}

for mln_sample in ID8_MLN_samples:
    print(*[
        len(ID8_variants_by_sample[rp_sample] & ID8_variants_by_sample[mln_sample])
        for rp_sample in ID8_RP_samples
    ])

1 1 1 1
1 13 1 13
1 13 1 13
3 1 3 5
1 13 1 13


In [18]:
# Print every variant of patient ID23 with the samples that carry it,
# starting with the variants reported in the most samples.
variants_by_frequency = patinfo_filt_ID23.index.value_counts().index
for variant_key in variants_by_frequency:
    print(variant_key)
    print(patinfo_filt_ID23.loc[variant_key])

RUNX1_chr21_34892926_G/A
                                                      info        sampleID  \
RUNX1_chr21_34892926_G/A    25.4% (138) [mq:28,bq:0,sd:34]  M1RP_ID23_RP14   
RUNX1_chr21_34892926_G/A    36.2% (489) [mq:40,bq:0,sd:26]   M1RP_ID23_RP1   
RUNX1_chr21_34892926_G/A  31.2% (349) [mq:38,bq:0,sd:34] *   M1RP_ID23_RP6   
RUNX1_chr21_34892926_G/A  26.3% (433) [mq:38,bq:0,sd:31] *   M1RP_ID23_RP7   
RUNX1_chr21_34892926_G/A  32.9% (277) [mq:37,bq:0,sd:32] *   M1RP_ID23_RP8   

                           VAF   DP  MQ   sign    ALT  
RUNX1_chr21_34892926_G/A  25.4  138  28  False   35.0  
RUNX1_chr21_34892926_G/A  36.2  489  40  False  177.0  
RUNX1_chr21_34892926_G/A  31.2  349  38   True  109.0  
RUNX1_chr21_34892926_G/A  26.3  433  38   True  114.0  
RUNX1_chr21_34892926_G/A  32.9  277  37   True   91.0  
PIK3CA_chr3_179210192_T/C
                                                       info        sampleID  \
PIK3CA_chr3_179210192_T/C  17.8% (404) [mq:40,bq:0,sd:30] *  M1RP

In [19]:
# Number of variants each RP sample of patient ID23 shares with MLN1,
# printed on one line in the order RP4, RP9.
# NOTE(review): a sampleID that is absent from the frame gives an empty set and
# therefore a count of 0 - confirm that all three IDs occur in patinfo_filt_ID23.
ID23_MLN_variants = set(patinfo_filt_ID23.index[patinfo_filt_ID23["sampleID"] == "M1RP_ID23_MLN1"])
ID23_shared_counts = [
    len(set(patinfo_filt_ID23.index[patinfo_filt_ID23["sampleID"] == rp_sample]) & ID23_MLN_variants)
    for rp_sample in ("M1RP_ID23_RP4", "M1RP_ID23_RP9")
]
print(*ID23_shared_counts)

0 0


In [20]:
# Restrict patient ID26 to the samples of interest, then print every variant
# with the samples that carry it, most widely shared variants first.
ID26_samples = ['M1RP_ID26_MLN1', 'M1RP_ID26_RP3', 'M1RP_ID26_RP1']
patinfo_filt_ID26 = patinfo_filt_ID26.loc[patinfo_filt_ID26["sampleID"].isin(ID26_samples)]

variants_by_frequency = patinfo_filt_ID26.index.value_counts().index
for variant_key in variants_by_frequency:
    print(variant_key)
    print(patinfo_filt_ID26.loc[variant_key])

APC_chr5_112840254_G/GA
                                                     info        sampleID  \
APC_chr5_112840254_G/GA   13.2% (76) [mq:26,bq:0,sd:23] *  M1RP_ID26_MLN1   
APC_chr5_112840254_G/GA  37.5% (309) [mq:22,bq:0,sd:29] *   M1RP_ID26_RP3   

                          VAF   DP  MQ  sign    ALT  
APC_chr5_112840254_G/GA  13.2   76  26  True   10.0  
APC_chr5_112840254_G/GA  37.5  309  22  True  116.0  
HSD3B1_chr1_119507798_T/G
                                                       info       sampleID  \
HSD3B1_chr1_119507798_T/G  29.1% (141) [mq:38,bq:0,sd:32] *  M1RP_ID26_RP1   
HSD3B1_chr1_119507798_T/G    42.0% (150) [mq:26,bq:0,sd:25]  M1RP_ID26_RP3   

                            VAF   DP  MQ   sign   ALT  
HSD3B1_chr1_119507798_T/G  29.1  141  38   True  41.0  
HSD3B1_chr1_119507798_T/G  42.0  150  26  False  63.0  
ATM;C11orf65_chr11_108326004_G/T
info        13.8% (58) [mq:42,bq:0,sd:43]
sampleID                    M1RP_ID26_RP1
VAF                                 

In [21]:
# Number of variants each RP sample of patient ID26 shares with MLN1,
# printed on one line in the order RP1, RP3.
ID26_MLN_variants = set(patinfo_filt_ID26.index[patinfo_filt_ID26["sampleID"] == "M1RP_ID26_MLN1"])
ID26_shared_counts = [
    len(set(patinfo_filt_ID26.index[patinfo_filt_ID26["sampleID"] == rp_sample]) & ID26_MLN_variants)
    for rp_sample in ("M1RP_ID26_RP1", "M1RP_ID26_RP3")
]
print(*ID26_shared_counts)

0 1


In [22]:
# Restrict patient ID33 to the samples of interest, then print the rows of
# every variant, most widely shared variants first. Unlike the cells for the
# other patients, the variant key itself is not printed on its own line here.
ID33_samples = ['M1RP_ID33_MLN5', 'M1RP_ID33_RP5', 'M1RP_ID33_RP6', 'M1RP_ID33_RP9']
patinfo_filt_ID33 = patinfo_filt_ID33.loc[patinfo_filt_ID33["sampleID"].isin(ID33_samples)]

variants_by_frequency = patinfo_filt_ID33.index.value_counts().index
for variant_key in variants_by_frequency:
    print(patinfo_filt_ID33.loc[variant_key])

                                                     info        sampleID  \
MSH6_chr2_47783976_G/A    34.5% (1717) [mq:41,bq:0,sd:29]  M1RP_ID33_MLN5   
MSH6_chr2_47783976_G/A  20.0% (1621) [mq:41,bq:0,sd:30] *   M1RP_ID33_RP5   
MSH6_chr2_47783976_G/A    39.8% (1620) [mq:40,bq:0,sd:27]   M1RP_ID33_RP6   
MSH6_chr2_47783976_G/A  10.1% (1458) [mq:40,bq:0,sd:32] *   M1RP_ID33_RP9   

                         VAF    DP  MQ   sign    ALT  
MSH6_chr2_47783976_G/A  34.5  1717  41  False  592.0  
MSH6_chr2_47783976_G/A  20.0  1621  41   True  324.0  
MSH6_chr2_47783976_G/A  39.8  1620  40  False  645.0  
MSH6_chr2_47783976_G/A  10.1  1458  40   True  147.0  
                                                      info        sampleID  \
BRCA2_chr13_32332719_T/A  37.6% (433) [mq:34,bq:0,sd:28] *  M1RP_ID33_MLN5   
BRCA2_chr13_32332719_T/A  24.3% (424) [mq:39,bq:0,sd:30] *   M1RP_ID33_RP5   
BRCA2_chr13_32332719_T/A  39.6% (359) [mq:38,bq:0,sd:24] *   M1RP_ID33_RP6   
BRCA2_chr13_32332719_T/A  2

In [23]:
# Number of variants each RP sample of patient ID33 shares with MLN5,
# printed on one line in the order RP5, RP6, RP9.
ID33_MLN_variants = set(patinfo_filt_ID33.index[patinfo_filt_ID33["sampleID"] == "M1RP_ID33_MLN5"])
ID33_shared_counts = [
    len(set(patinfo_filt_ID33.index[patinfo_filt_ID33["sampleID"] == rp_sample]) & ID33_MLN_variants)
    for rp_sample in ("M1RP_ID33_RP5", "M1RP_ID33_RP6", "M1RP_ID33_RP9")
]
print(*ID33_shared_counts)

2 3 2


In [None]:
# Merge all filtered patient information into a single DataFrame
targetedDNA = pd.concat([patinfo_filt_ID3, patinfo_filt_ID4, patinfo_filt_ID8, patinfo_filt_ID23, patinfo_filt_ID26, patinfo_filt_ID33])


def _split_variant_key(info):
    """Split a "GENE_CHROM_POSITION_REF/ALT" index label into its parts.

    Returns the tuple ``(chrom, pos, genes, ref, alt)``. ``chrom`` has its first
    three characters removed (the "chr" prefix in labels such as "chr17") and
    ``pos`` stays a string. A missing label yields five NaN values.
    """
    if pd.isna(info):
        return (float('nan'),) * 5

    # Split from the right so a gene label that itself contains "_" stays intact
    # instead of breaking the four-way unpacking.
    gene, chrom, pos, alleles = info.rsplit("_", 3)
    ref, alt = alleles.split("/")
    return chrom[3:], pos, gene, ref, alt


variant_fields = [_split_variant_key(info) for info in targetedDNA.index]

# Add the parsed parts as columns, in the same order as before.
for field_position, column_name in enumerate(("chrom", "pos", "genes", "ref", "alt")):
    targetedDNA[column_name] = [fields[field_position] for fields in variant_fields]

# The patient ID is the second "_"-separated part of the sample ID, e.g.
# "M1RP_ID3_MLN1" -> "M1RP-ID3".
targetedDNA["Patient ID"] = ["M1RP-"+sample_id.split("_")[1] for sample_id in targetedDNA["sampleID"]]
targetedDNA["Sample ID"] = targetedDNA["sampleID"]

In [180]:
# Export the merged de novo calls, keeping only the listed columns in this order.
columns = [
    "Patient ID", "Sample ID",
    "chrom", "pos", "genes", "ref", "alt",
    "VAF", "DP", "MQ", "ALT",
]
de_novo_csv_path = os.path.join(PATH_DATA, "tNGS_de_novo_variant_calling.csv")
# index=False leaves the variant-key index out of the file.
targetedDNA[columns].to_csv(de_novo_csv_path, index=False)