# Mutation Data of Most Similar Samples

Generates the following outputs: 

1. "multiplyMutatedGenesPerMSSOf\_**sample id**.tsv"
2. "multiplyAppearingMutationsPerMSSOf\_**sample id**.tsv"
3. "basicClinAndMutationsPerMSSOf\_**sample id**.tsv"
    

1. Table that lists genes that are mutated more than once across the most similar samples (two mutations of a gene within a single sample also count).
2. Table that lists mutations that are identical in more than one sample (a given mutation can't occur twice in one sample).

Format is:


```
MSS_ID  gene1 gene2 gene3
mss1    0     0     0
mss2    0     1     2
...
mss6    0     1     0
```

The count for each mss/gene or mutation pair is how many times that gene or mutation appears in that sample.
For mutations, this will always be 0 or 1.

If a table is completely empty (no samples and no gene / mutation columns), its header will read "none found" instead of "MSS_ID".
Not all samples will necessarily appear in these files; if a sample does appear, all its entries might be 0.


&#8203;3. This TSV table contains the columns:
Sample ID | Disease Type | Age at Dx (Years) | Mutations in Cancer Genes

Mutations column is taken word-for-word from TCGA_NonSilentMutationsInCancerGenesBySample.txt .

All MSS of the sample will appear whether there is any data for them or not.

JSON Output format:
j["mss_multi_genes"] :
For each gene, lists each sample and the count of mutations for the gene in that sample.
Omits samples that were not listed in the mutation reference file.

`{ Gene: { sample1 : 0, sample2: 2, sample3: 1}, Gene: ... }`

j["mss_multi_mutations"]:
For each mutation, lists each sample, with 1 if the sample had that mutation and 0 if it did not. Omits samples that were not listed in the mutation reference file.

`{ Mutation: { sample1 : 0, sample2: 1, sample3: 1}, Mutation: ... }`

#### Inputs
Depends on steps:
* 2.0
    - json tumormap_results

In [None]:
import json
import logging
import os
import csv
import pandas as pd
import collections # for Counter


# Setup: read the run configuration, pick out the sample ID, configure logging
with open("conf.json", "r") as conf:
    c = json.load(conf)

sample_id = c["sample_id"]
print("Running on sample: {}".format(sample_id))

# Logging options are taken verbatim from the configuration file
logging.basicConfig(**c["info"]["logging_config"])
logging.info("\n2.6: Most Similar Sample Mutation Data")
def and_log(s):
    """Write s to the log at INFO level and return it unchanged.

    Handing the message back lets a caller log it and print it in a
    single expression.
    """
    logging.info(s)
    return s

# Collects this step's JSON output; filled in and written to disk in the final cells.
j = {}

In [None]:
# Load the most similar samples (mss)
# The sample ids are the keys of "tumormap_results" in the step 2.0 JSON output.
with open(c["json"]["2.0"], "r") as jf:
    mss = set(json.load(jf)["tumormap_results"].keys())

# Drop any self-sample appearing in neighbors
mss.discard(c["info"]["id_for_tumormap"])

# Sorted so the listing is stable from run to run (set iteration order is arbitrary).
# Single-argument print(...) behaves the same whether print is a statement or a function.
print("Most similar samples for {}:".format(sample_id))
print("\n".join(sorted(mss)))

In [None]:
# A mutation is one of the following:
# "noNonSilentMutations"
# GENE_p.LOCATION
# GENE_OTHER_MUTATION_STRING
# A gene may have multiple mutations, except noNonSilentMutations may only appear alone

def split_mutation(mut):
    """Parse one mutation string into its gene and mutation parts.

    mut is either the placeholder "noNonSilentMutations" or a string of the
    form GENE_REST. Only the first underscore separates the gene from the
    rest, so the mutation part may itself contain underscores.

    Returns {} for the placeholder, otherwise
    {"gene": GENE, "mutation": REST}.

    Raises ValueError when mut is not the placeholder and contains no
    underscore.
    """
    if mut == "noNonSilentMutations":
        return {}
    gene, separator, mutation = mut.partition("_")
    if not separator:
        # Name the offending entry instead of failing on tuple unpacking.
        raise ValueError(
            "Malformed mutation entry (expected GENE_MUTATION): {!r}".format(mut)
        )
    return {"gene": gene, "mutation": mutation}

def join_mutation(mut_dict):
    """Turn a parsed mutation dict back into its string form.

    An empty dict becomes "noNonSilentMutations"; any other dict becomes
    its "gene" and "mutation" values joined by an underscore.
    """
    if mut_dict != {}:
        return mut_dict["gene"] + "_" + mut_dict["mutation"]
    return "noNonSilentMutations"

In [None]:
# Use this to test that the mutation list parses. Will not run in normal execution of the script. Prints much output!
# Parses every row of the reference file (not just the MSS) and prints each
# sample followed by its mutations converted back to text.
if False:
    sample_mutations = {}
    mutation_file = c["ref_file"]["TCGA_non_silent_cancer_mutations_by_sample"]
    with open(mutation_file, "r") as f:
        reader = csv.DictReader(f, dialect='excel-tab')
        for row in reader:
            muts = row["geneAndProtein_Change"].split(", ")
            sample_mutations[row["sample"]] = [split_mutation(m) for m in muts]

    for s, parsed in sample_mutations.items():
        print(s)
        print(", ".join(join_mutation(m) for m in parsed))

In [None]:
# Load the reference file
# The path of the per-sample mutation table comes from the configuration.
mutation_file = c["ref_file"]["TCGA_non_silent_cancer_mutations_by_sample"]
print("Using the following mutation list:")
# IPython shell escape: prints the file's MD5 checksum right after the message above.
!md5sum $mutation_file

sample_mutations = {}

# Add all MSS samples' mutations to the mutations list
# format: { sample_id : [{gene:GENE, mutation:MUTATION}]}

# V3 only -- add ckcc_reference prefix to match v3 sample id format.
# The check does not depend on the row, so evaluate it once up front.
add_v3_prefix = c["tumormap"]["info"]["name"] == "v3"

with open(mutation_file, "r") as f:
    reader = csv.DictReader(f, dialect='excel-tab')
    for row in reader:
        sample = row["sample"]
        if add_v3_prefix:
            sample = "ckcc_reference/{}".format(sample)
        # Only rows belonging to a most similar sample are parsed and kept.
        if sample not in mss:
            continue
        print("Found mutation entry for {}".format(sample))
        muts = row["geneAndProtein_Change"].split(", ")
        # A real list (not a lazy map object), since later cells compare and re-iterate it.
        sample_mutations[sample] = [split_mutation(m) for m in muts]


In [None]:
# Report, on screen and in the log, when no MSS has an entry in the mutation list.
if not sample_mutations:
    print(and_log("None of the MSS for this sample appeared in the mutation list."))

In [None]:
# generate 3. "basicClinAndMutationsPerMSSOf\_**sample id**.tsv"
# This TSV table contains the columns:
# Sample ID  | Disease Type | Age at Dx (Years) | Mutations in Cancer Genes

# open clinical essential file & get our samples & desired columns
# since this is for the MSS, use the tumormap version instead of outlier
clin_essential = pd.read_csv(
    c["tumormap"]["essential_clinical"],
    sep="\t",
    index_col="th_sampleid",
    dtype="str",
)
# Add a blank age_at_dx column if not present
if "age_at_dx" not in clin_essential.columns:
    clin_essential["age_at_dx"] = ""

is_mss = clin_essential.index.isin(mss)
selected_clin = clin_essential[is_mss][["disease", "age_at_dx"]].copy()
# Add rows for any that are missing from the clinical file
selected_clin = selected_clin.reindex(pd.Index(mss).union(selected_clin.index))

# Add the mutation column: each sample's mutations converted back to text and
# joined with ", ". Samples without a mutation entry get a missing value.
mutations_for_printing = {
    sample: ", ".join(join_mutation(m) for m in parsed)
    for sample, parsed in sample_mutations.items()
}
# dtype is given explicitly so an empty dict yields an object Series on every pandas version.
selected_clin["Mutations in Cancer Genes"] = pd.Series(mutations_for_printing, dtype="object")

# Rename columns per spec
selected_clin = selected_clin.rename(columns={
    "disease": "Disease Type",
    "age_at_dx": "Age at Dx (Years)",
})

print("Basic Clinical Data and Mutations")
# Bare expression kept last so the notebook displays the table.
selected_clin

In [None]:
# Then, calculate the multi-gene and multi-mutation files

# Check for genes and mutations that appear more than once
all_genes = collections.Counter()  # { gene : overall count of that gene }
all_mutations = collections.Counter()  # { GENE_MUTATION : overall count of that mutation }

for parsed in sample_mutations.values():
    # An empty dict is the parsed form of "noNonSilentMutations". It names no
    # gene, so it is left out wherever it occurs rather than only when it is
    # the sole entry of the list.
    real_mutations = [m for m in parsed if m]
    all_genes.update(m["gene"] for m in real_mutations)
    all_mutations.update(join_mutation(m) for m in real_mutations)

# Fetch genes & mutations that appear more than once and
# set their count to 0 to use as a template for counting per-sample appearances
multiply_appearing_genes = {gene: 0 for gene, n in all_genes.items() if n >= 2}
multiply_appearing_mutations = {mut: 0 for mut, n in all_mutations.items() if n >= 2}

In [None]:
# List the names of the genes and mutations that were counted at least twice overall.
print("Genes appearing more than once:\n")
print("\n".join(multiply_appearing_genes.keys()))
print("\nMutations appearing more than once:\n")
print("\n".join(multiply_appearing_mutations.keys()))

In [None]:
# Then, construct the table of samples to multigenes, getting for each sample
# the count of appearances of multigenes in that sample
multiplyMutatedGenesPerMSS = {}  # sample : { gene1: 1, gene2: 0, gene3:0, gene4: 2}
multiplyAppearingMutationsPerMSS = {}  # sample : { mutation1: 0, mutation2: 1 }

for s, mutations in sample_mutations.items():
    # Every sample starts from the all-zero template, so all rows share the same keys.
    gene_counts = multiply_appearing_genes.copy()
    mutation_counts = multiply_appearing_mutations.copy()
    for mutation in mutations:
        if not mutation:  # parsed form of noNonSilentMutations; nothing to count
            continue
        gene = mutation["gene"]
        if gene in gene_counts:
            gene_counts[gene] += 1
        full_name = join_mutation(mutation)
        if full_name in mutation_counts:
            mutation_counts[full_name] += 1
    multiplyMutatedGenesPerMSS[s] = gene_counts
    multiplyAppearingMutationsPerMSS[s] = mutation_counts


In [None]:
# Translate genes and mutations dicts to data frames (one row per sample)
genes_df = pd.DataFrame.from_dict(multiplyMutatedGenesPerMSS, orient="index")
mutations_df = pd.DataFrame.from_dict(multiplyAppearingMutationsPerMSS, orient="index")

# The index is named "MSS_ID", unless the frame is completely empty: then it is
# named "none found" so the file will contain that instead of just "MSS_ID"
genes_df.index.name = "none found" if genes_df.shape == (0, 0) else "MSS_ID"
mutations_df.index.name = "none found" if mutations_df.shape == (0, 0) else "MSS_ID"

In [None]:
# Display the gene table; the bare expression stays last so the notebook shows it.
print("Multiply Mutated Genes per Most Similar Sample")
genes_df

In [None]:
# Display the mutation table; the bare expression stays last so the notebook shows it.
print("Mutations Appearing More than Once in Most Similar Samples")
mutations_df

In [None]:
# Report, on screen and in the log, when both tables have no rows.
if len(genes_df.index) == 0 and len(mutations_df.index) == 0:
    print(and_log("No multiply-appearing mutations or genes were found!"))

In [None]:
# Save json
# Each table is serialised by pandas (column-oriented) and parsed back, so j
# holds plain nested dicts that json.dump can write.
for key, frame in (
    ("mss_clin_and_mutations", selected_clin),
    ("mss_multi_genes", genes_df),
    ("mss_multi_mutations", mutations_df),
):
    j[key] = json.loads(frame.to_json(orient="columns"))

with open(c["json"]["2.6"], "w") as jf:
    json.dump(j, jf, indent=2)

In [None]:
# Last cell: printed once every cell above has run.
print("Done!")