# Mutation Differentials
This notebook analyzes how mutations change in prevalence within phenotype space.

In [1]:
%matplotlib inline

import microbepy_init
import microbepy
import microbepy.common
from microbepy.common import constants as cn
from microbepy.common import util
from microbepy.common import isolate
from microbepy.statistics.mutation_differential import MutationDifferential
from microbepy.common.range_constraint import RangeConstraint
from microbepy.common.study_context import nextStudyContext
from microbepy.plot.util_plot import PlotParms
from microbepy.correlation.mutation_collection import MutationCollection
from microbepy.plot.mutation_plot import MutationIsolatePlot, MutationLinePlot

import copy
import numpy as np
import pandas as pd

## Basic Mutation Analysis

### Isolate Mutations

In [None]:
# Mutations for isolates: one plot per species mix, DVH first and then MMP,
# both drawn from the same MutationIsolatePlot keyed on cn.GGENE_ID.

mutation_plot = MutationIsolatePlot(mutation_column=cn.GGENE_ID)
for species in (cn.SPECIES_MIX_DVH, cn.SPECIES_MIX_MMP):
    mutation_plot.plot(species)

### Line Mutations

In [None]:
# Mutations for lines, plotted across transfers with an enlarged figure.
mutation_plot = MutationLinePlot(mutation_column=cn.GGENE_ID)
parms = PlotParms()
parms[cn.PLT_FIGSIZE] = [20, 16]  # figure size passed through PlotParms
mutation_plot.plotTransfers(parms=parms)

Observations
1. There are several lines that have identical initial conditions, suggesting that comparisons can be made in terms of evolutionary outcomes.
1. Can try to manually correlate MMP and DVH mutations. For example, is MMP0419 a response to the first DVH intergenic mutation?
1. Plot number of multi-line mutations over time for DVH and MMP. Consider thresholds of at least 2 lines and at least 3 lines.
1. Why are there more lines for MMP than DVH?
1. Can the presence of a mutation in later generations be predicted based on earlier generations?

In [None]:
# TODO: Mutation fractions by line (placeholder cell; no code here yet)

In [None]:
# Compare mutations of isolates with standardized rate < 0 with the mutations in isolates with rate > 0 in line
def makeDFForLine(line, depvar=cn.RATE, separation=0.0, is_median=False):
    """
    Builds a MutationDifferential on cn.GGENE_ID and returns its makeDF() result.

    The differential contrasts a "low" range of the dependent variable,
    [-10, -separation/2], with a "high" range, [separation/2, 10], and is
    always created with is_standardize_by_line=True.

    Parameters
    ----------
    line: line to analyze. cn.LINE_ALL applies no row constraint; any other
        value keeps only rows whose cn.LINE entry equals it.
    depvar: dependent variable handed to MutationDifferential
        (defaults to cn.RATE).
    separation: total width of the gap, centered on zero, that lies between
        the low and high ranges.
    is_median: forwarded unchanged to MutationDifferential.

    Returns
    -------
    Whatever MutationDifferential.makeDF() returns (presumably a
    pandas DataFrame -- confirm against microbepy).
    """
    low_range = RangeConstraint(lower=-10, upper=-separation/2.0)
    high_range = RangeConstraint(lower=separation/2.0, upper=10)
    # No row filter when every line is requested; otherwise match on the line.
    row_filters = [] if line == cn.LINE_ALL else [lambda r: r[cn.LINE] == line]
    return MutationDifferential(
        depvar,
        cn.GGENE_ID,
        constraints=row_filters,
        is_median=is_median,
        is_standardize_by_line=True,
        rc_low=low_range,
        rc_high=high_range,
    ).makeDF()

In [None]:
# Differentials for the default dependent variable, one table per line in
# cn.LINE_CIS, using the median and no separation between low and high ranges.
# Columns of the printed table:
#   count1: count of occurrences in values of low phenotype
#   count2: count in high phenotype
#   value: cumulative significance level
for line in cn.LINE_CIS:
    print("\n\n%s\n" % line)
    line_df = makeDFForLine(line, separation=0, is_median=True)
    print(line_df)

In [None]:
# Same per-line tables, but for the yield phenotype and without the median.
for line in cn.LINE_CIS:
    print("\n\n%s\n" % line)
    line_df = makeDFForLine(line, depvar=cn.YIELD, separation=0, is_median=False)
    print(line_df)

In [None]:
# One table per dependent variable, computed across all lines (cn.LINE_ALL)
# using the median and no separation between low and high ranges.
for depvar in cn.DEPVARS:
    line = cn.LINE_ALL
    print("\n\n%s\n" % depvar)
    depvar_df = makeDFForLine(line, depvar=depvar, separation=0, is_median=True)
    print(depvar_df)

Observations
1. Note that larger values of rate phenotype seem to be correlated with EPD (from database queries) for UE3

## To Do
1. Summarize significant results in tables with protein descriptions
1. Plot results in phenotype space
   1. UE3 rate - 3 significant
   1. HR2 yield - 2 significant
1. Research the proteins

# Knob Analysis
1. Can see all non-universal mutations by using max_sl = 0.99.
1. Do plots at 0.05 and 0.99 for rate, yield, average & median. Show the contrast between the separation provided by small and large significance levels.
1. Some mutations seem as though they should be significant based on their counts.

In [None]:
def plotKnobSL(max_sl, is_legend=True):
    """
    Draws one knob scatter plot per study context.

    Contexts come from nextStudyContext with cn.DEPVAR ranging over
    cn.DEPVARS and cn.LINE ranging over cn.LINE_CIS. For each context a
    median-based, line-standardized MutationDifferential on cn.GGENE_ID is
    built, restricted to rows of that context's line, and plotted with
    scatterKnob under the title "<line>, <depvar>".

    Parameters
    ----------
    max_sl: forwarded to scatterKnob as max_sl (presumably the maximum
        significance level of mutations to show -- confirm against microbepy).
    is_legend: when False, cn.PLT_LEGEND is set to "" in the plot
        parameters; when True, the legend setting is left untouched.
    """
    for study_context in nextStudyContext(
            specification={cn.DEPVAR: cn.DEPVARS, cn.LINE: cn.LINE_CIS}):
        # The filter is consumed inside this iteration, so closing over the
        # loop variable is safe here.
        knob_differential = MutationDifferential(
            study_context.depvar,
            cn.GGENE_ID,
            constraints=[lambda r: r[cn.LINE] == study_context.line],
            is_median=True,
            is_standardize_by_line=True,
        )
        plot_parms = PlotParms()
        if not is_legend:
            plot_parms[cn.PLT_LEGEND] = ""
        plot_parms[cn.PLT_TITLE] = "%s, %s" % (
            study_context.line, study_context.depvar)
        knob_differential.scatterKnob(
            parms=plot_parms, is_plot=True, max_sl=max_sl)

In [None]:
# Knob plots at the small threshold; the legend is left at its default.
plotKnobSL(max_sl=0.05)

In [None]:
# Knob plots at the large threshold, with the legend suppressed.
# NOTE(review): the Knob Analysis notes mention 0.99 while this call uses
# 0.95 -- confirm which threshold is intended.
plotKnobSL(max_sl=0.95, is_legend=False)

# Mutation Groups
Find mutations that only occur in an isolate if the entire group is present

In [None]:
# Plot mutation groups
parms = PlotParms()
parms[cn.PLT_FIGSIZE] = (12, 8)
for species in [cn.SPECIES_MIX_DVH, cn.SPECIES_MIX_MMP]:
    collection = MutationCollection.makeMutationCollectionForLine(species=species)
    collection.plot(parms=parms)