# Using mpathic to compute footprints

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Put '/home/tom/mpathic' first on the module search path so that the
# `mpathic` import in the following cell can (presumably) be resolved from it.
# NOTE(review): this is an absolute, machine-specific path — confirm it exists
# on the machine running the notebook, or adjust it.
import sys
sys.path.insert(0, '/home/tom/mpathic')


In [3]:
from mpathic import learn_model

First, we load a test data set.

In [4]:
# Read the araCp count table (LB_by_promoter folder) into `df`.
df = pd.read_csv(
    filepath_or_buffer="../../../data/processed_barcodes/20221114_barcode/LB_by_promoter/araCp_counts.csv"
)
# Show the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,cDNA_count,gDNA_count,barcode,name,mapping_count,promoter
0,228337,38677,GGCCATACTTTTTGAAGTAT,araCp,39,GTCATGTAGCATCCGCTAATCTTATGGATAAAAATACTATGGCTTA...
1,79171,2461,CATACCTCATGGTGTGCGTA,araCp,3,GTCAGGTAGGATGCGCTAATCTTATGGATAAAAATGCTATGGCATA...
2,62180,23612,GTATATGTACGTCCCTGCGA,araCp,178,GTCAGCTAGGATCCGGTAATCTTATGGATAAAACTGCTCTGGCATA...
3,56594,38141,TAATTCATACTTCGGCGCAG,araCp,56,GTCTGGTAGGATCCGCTAATCTTATGGATATAAATGCTAGGGCATG...
4,55044,36067,CTTGAAAGGCTCTGTGCGTA,araCp,62,GTCATGTAGGAGCCCCTAATCTTAGGGATAAAAATGCTATGGCATA...


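Prepare the dataframe for mpathic: rename the count and sequence columns (`cDNA_count` → `ct_1`, `gDNA_count` → `ct_0`, `promoter` → `seq`), add the total count `ct = ct_0 + ct_1`, and keep only the columns `ct`, `ct_0`, `ct_1`, and `seq`, in that order.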

In [5]:
# Reshape the count table into the four columns used from here on.
df = (
    # cDNA counts become `ct_1`, gDNA counts `ct_0`, the promoter sequence `seq`.
    df.rename(columns={"cDNA_count": "ct_1", "gDNA_count": "ct_0", "promoter": "seq"})
    # `ct` is the per-row total of both count columns.
    .assign(ct=lambda counts: counts["ct_0"] + counts["ct_1"])
    # Keep exactly these columns, in this order.
    .loc[:, ["ct", "ct_0", "ct_1", "seq"]]
)
df.head(n=5)

Unnamed: 0,ct,ct_0,ct_1,seq
0,267014,38677,228337,GTCATGTAGCATCCGCTAATCTTATGGATAAAAATACTATGGCTTA...
1,81632,2461,79171,GTCAGGTAGGATGCGCTAATCTTATGGATAAAAATGCTATGGCATA...
2,85792,23612,62180,GTCAGCTAGGATCCGGTAATCTTATGGATAAAACTGCTCTGGCATA...
3,94735,38141,56594,GTCTGGTAGGATCCGCTAATCTTATGGATATAAATGCTAGGGCATG...
4,91111,36067,55044,GTCATGTAGGAGCCCCTAATCTTAGGGATAAAAATGCTATGGCATA...


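Run mpathic's `learn_model` on the prepared dataframe. This is a long MCMC run (300,000 iterations): the progress output below shows roughly 10% completed after about 80 minutes.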

In [None]:
# Path handed to mpathic as `db`; a later cell in this notebook reuses this variable.
db = "../../../data/mpathic_footprints/20221114_barcode/araCp_LB_dataset_db"

# Fit the model on the prepared counts. The keyword arguments are grouped by
# apparent purpose only; their values are unchanged.
mcmc_df = learn_model.main(
    # --- input data and sequence window ---
    df=df,
    start=0,
    end=None,
    foreground=1,
    background=0,
    pseudocounts=1,
    drop_library=False,
    # --- model choice ---
    lm="IM",
    modeltype="MAT",
    LS_means_std=None,
    alpha=0,
    # --- sampler settings (presumably MCMC iterations / burn-in / thinning) ---
    db=db,
    iteration=300_000,
    burnin=100_000,
    thin=10,
    runnum=0,
    initialize="rand",
    # --- run mode ---
    test=False,
    verbose=True,
)



 [----             10%                  ] 32498 of 300000 complete in 4756.5 sec

In [None]:
# Write the fitted result to CSV in the working directory, without the index column.
mcmc_df.to_csv(path_or_buf="araC_mcmc_mpathic.csv", index=False)

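Transform the inferred matrix values into probabilities: at each position, exponentiate the four values (A, C, G, T) and normalize them so that they sum to one, $p_{ij} = e^{v_{ij}} / \sum_k e^{v_{ik}}$.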

In [None]:
# Turn the fitted matrix values into per-row probabilities over A, C, G, T:
# each row is exponentiated and divided by its own sum, so every row sums to 1.
inf_arr = mcmc_df[['val_A', 'val_C', 'val_G', 'val_T']].to_numpy()

# Vectorized over all rows of inf_arr, so no row count is hard-coded.
# Subtracting the row maximum first does not change the normalized result,
# but keeps np.exp from overflowing when the values are large.
prob_mat = np.exp(inf_arr - inf_arr.max(axis=1, keepdims=True))
prob_mat /= prob_mat.sum(axis=1, keepdims=True)

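Compute the relative entropy (in bits) of each position's base distribution with respect to a uniform background of 0.25 per base: $D_i = \sum_j p_{ij} \log_2 \left( p_{ij} / 0.25 \right)$.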

In [None]:
# Relative entropy (in bits) of every row of prob_mat against a uniform
# background of 0.25 per base: sum_j p_ij * log2(p_ij / 0.25).
# Computed for all rows at once; entries with p == 0 contribute 0
# (the usual 0 * log 0 = 0 convention) instead of producing nan.
with np.errstate(divide='ignore', invalid='ignore'):
    relative_ent = np.where(
        prob_mat > 0,
        prob_mat * np.log2(prob_mat / 0.25),
        0.0,
    ).sum(axis=1)

Plot relative entropy footprint.

In [None]:
# Bar chart of relative_ent, one bar per integer x position from -115 to 44.
fig, ax = plt.subplots(figsize=(15, 3))
ax.set_xlabel(xlabel='position')
ax.set_ylabel(ylabel='relative entropy [bits]')
ax.bar(x=np.arange(-115, 45), height=relative_ent)
# Kept as the last statement so the cell output is the title's Text object.
ax.set_title(label='araC in LB')

In [None]:
# NOTE(review): this cell draws the same figure as the preceding plotting cell
# (same data, labels and title) — confirm whether the repeat is needed.
fig, ax = plt.subplots(figsize=(15, 3))
ax.set_ylabel(ylabel='relative entropy [bits]')
ax.set_xlabel(xlabel='position')
ax.bar(x=np.arange(-115, 45), height=relative_ent)
ax.set_title(label='araC in LB')

## RegSeq dataset

In [None]:
# Load the RegSeq LB data, keep only the araC rows, and bring them into the
# same four-column layout as the first data set (`ct`, `ct_0`, `ct_1`, `seq`).
# Every step returns a new frame. The previous version edited a filtered frame
# in place (column assignment, dropna(inplace=True)), which is the pattern that
# triggers pandas' SettingWithCopyWarning.
df = (
    pd.read_csv("../../../../RegSeq/data/sequencing_data/LB_data.csv", index_col=0)
    .loc[lambda d: d['gene'] == 'araC', :]
    # RNA counts become `ct_1`, DNA counts become `ct_0`.
    .rename(columns={'counts_RNA': 'ct_1', 'counts_DNA': 'ct_0'})
    # Total count per row.
    .assign(ct=lambda d: d['ct_0'] + d['ct_1'])
    .loc[:, ['ct', 'ct_0', 'ct_1', 'seq']]
    # Drop rows with a missing value in any of the kept columns.
    .dropna()
)
df.head(5)

In [None]:
# Fit the same kind of model on the RegSeq araC data; this overwrites `mcmc_df`.
# NOTE(review): `db` still holds the path assigned in the first mpathic cell
# (araCp_LB_dataset_db) — confirm that reusing it for this data set is intended.
# NOTE(review): burnin is 10000 here but 100000 in the first run — confirm this
# difference is deliberate.
mcmc_df = learn_model.main(
    # --- input data and sequence window ---
    df=df,
    start=0,
    end=None,
    foreground=1,
    background=0,
    pseudocounts=1,
    drop_library=False,
    # --- model choice ---
    lm="IM",
    modeltype="MAT",
    LS_means_std=None,
    alpha=0,
    # --- sampler settings (presumably MCMC iterations / burn-in / thinning) ---
    db=db,
    iteration=300_000,
    burnin=10_000,
    thin=10,
    runnum=0,
    initialize="rand",
    # --- run mode ---
    test=False,
    verbose=False,
)



In [None]:
# Write the RegSeq fit to CSV in the working directory.
# NOTE(review): the file name says "ykgE", but the data above was filtered to
# gene == 'araC' — confirm the intended file name.
# Unlike the earlier export, no index=False is passed here, so the index is
# written as an extra column.
mcmc_df.to_csv("ykgE_mcmc_mpathic_regseq.csv")

In [None]:
# Turn the fitted matrix values into per-row probabilities over A, C, G, T:
# each row is exponentiated and divided by its own sum, so every row sums to 1.
inf_arr = mcmc_df[['val_A', 'val_C', 'val_G', 'val_T']].to_numpy()

# Vectorized over all rows of inf_arr, so no row count is hard-coded.
# Subtracting the row maximum first does not change the normalized result,
# but keeps np.exp from overflowing when the values are large.
prob_mat = np.exp(inf_arr - inf_arr.max(axis=1, keepdims=True))
prob_mat /= prob_mat.sum(axis=1, keepdims=True)

In [None]:
# Relative entropy (in bits) of every row of prob_mat against a uniform
# background of 0.25 per base: sum_j p_ij * log2(p_ij / 0.25).
# Computed for all rows at once; entries with p == 0 contribute 0
# (the usual 0 * log 0 = 0 convention) instead of producing nan.
with np.errstate(divide='ignore', invalid='ignore'):
    relative_ent = np.where(
        prob_mat > 0,
        prob_mat * np.log2(prob_mat / 0.25),
        0.0,
    ).sum(axis=1)

In [None]:
# Bar chart of relative_ent for the RegSeq data, one bar per integer x position
# from -115 to 44.
fig, ax = plt.subplots(figsize=(15, 3))
ax.bar(np.arange(-115, 45), relative_ent)
ax.set_xlabel('position')
ax.set_ylabel('relative entropy [bits]')
# The data in this section was filtered to gene == 'araC', so the title names
# araC (it previously said 'ykgE'); the suffix tells it apart from the first figure.
ax.set_title('araC in LB (RegSeq)')

In [None]:
# Read the combined ykgE data set; the frame is only shown as the cell output
# and is not assigned to a variable.
pd.read_csv(
    filepath_or_buffer="../../../../RegSeq/data/sequencing_data/ykgE_dataset_combined.csv"
)

In [None]:
# Shell escape: run the `mpathic` command-line tool and show the help text of
# its `learn-model` sub-command.
!mpathic learn-model --help

In [None]:
# (Stray keystrokes removed: the previous content of this cell was not valid
# Python and raised a SyntaxError when the notebook was run top to bottom.)