# EDA

In [1]:
# Get packages
using CSV, DataFrames, wgregseq, CairoMakie, Statistics, DelimitedFiles, Dates, Colors, FASTX, BioSequences, StatsBase, ProgressMeter

# Apply the project's Makie defaults, then switch body and title fonts to Lato
wgregseq.plotting_style.default_makie!()
update_theme!(Theme(; font="Lato", titlefont="Lato"))

# Pull the three palette entries used in this notebook out of the project color table
colors_dict = wgregseq.plotting_style.my_color_dict
colors = [colors_dict[key] for key in ("blue1", "orange1", "green1")]

3-element Vector{String}:
 "#324fa2"
 "#f47c20"
 "#a8cf38"

## Import mapping data

Import the table mapping barcodes to promoter variants and filter out anything that has fewer than 3 reads, as well as anything with more than 30 mutations.

In [2]:
# Load the barcode mapping table; the trailing `;` suppresses the cell output
df_map = CSV.read("../../../data/barcodes/20220514_mapping/mapped_barcodes.csv", DataFrame);

In [3]:
# Filter out unannotated sequences (rows whose `name` is "*")
df_map = df_map[df_map.name .!= "*", :]

# Filter out non-unique barcodes: among the rows with more than 2 mapping
# reads, keep only those whose barcode occurs exactly once. Counting barcodes
# once and indexing with a mask avoids one `append!` per barcode group, keeps
# the original row order, and preserves the column schema even when no row
# passes the filter.
df_reads = df_map[df_map.map_count .> 2, :]
barcode_counts = countmap(df_reads.barcode)
df_map = df_reads[Bool[barcode_counts[bc] == 1 for bc in df_reads.barcode], :];

In [4]:
# Load the Twist order and take every 1501st row (rows 1, 1502, ...) up to
# row 119*1501 as the wild type entries
df_seqs = wgregseq.utils.import_twist_order("../../../data/twist_orders/2022-02-15_twist_order.csv")
df_wt = df_seqs[1:1501:119*1501, :];

# Store characters 27 to 186 of each ordered sequence as a plain string
insertcols!(
    df_wt,
    4,
    :promoter_seq => [string(full_seq[27:186]) for full_seq in df_wt.sequence],
)

# Number of distinct wild type promoter sequences (value is not displayed)
length(unique(df_wt.promoter_seq))

# Attach the wild type sequence to every mapped barcode via the promoter name
df_map = leftjoin(
    df_map,
    select(df_wt, :promoter => :name, :promoter_seq);
    on=:name,
)
rename!(df_map, :promoter_seq => :wt_seq);

In [7]:
"""
    expression_shifts(df)

Compute a position-by-base expression shift matrix from the rows of `df`.

For every row, the deviation of `relative_counts` from the mean over all rows
is kept only at positions flagged as mutated, multiplied with the one-hot
encoding of the row's `promoter` sequence, and accumulated over all rows. The
accumulated matrix is then divided element-wise by the summed one-hot
encodings.

Mutated positions are determined as follows:
- if `df` has a `wt_seq` column, `int_promoter` is compared element-wise with
  the `int_wt` column of the same row;
- otherwise each row is compared with a reference built from the column index
  of the row-wise maximum of the first value returned by
  `wgregseq.footprints.frequency_matrix`.

# Arguments
- `df`: table with the columns `relative_counts`, `promoter` and
  `int_promoter` (plus `int_wt` when `wt_seq` is present). It needs at least
  three columns because `is_mutated` is inserted at position 4. `df` itself is
  not modified; all work is done on a copy.

# Returns
A `160 × 4` array. NOTE(review): this assumes `wgregseq.utils.onehot_encoder`
returns a `160 × 4` matrix per sequence; position/base combinations that never
occur would then give `0 / 0 = NaN`. Confirm against the helper.
"""
function expression_shifts(df)
    _df = copy(df)

    # `names` returns the column names as Strings, so membership has to be
    # tested with a String; a Symbol never matches and would always select
    # the inferred-reference branch.
    if "wt_seq" ∉ names(_df)
        freq_mat = wgregseq.footprints.frequency_matrix(_df)[1]
        # Reference sequence: column index of the maximum in every row
        wt_seq = map(x -> x[2], vec(argmax(freq_mat, dims=2)))
        insertcols!(_df, 4, :is_mutated => map(x -> x .!= wt_seq, _df.int_promoter))
    else
        # Compare every variant with the wild type stored in the same row
        insertcols!(
            _df,
            4,
            :is_mutated => map((x, y) -> x .!= y, _df.int_promoter, _df.int_wt),
        )
    end

    # Deviation from the mean, zeroed at non-mutated positions
    mean_rel_counts = mean(_df.relative_counts)
    a = (_df.relative_counts .- mean_rel_counts) .* _df.is_mutated
    b = wgregseq.utils.onehot_encoder.(_df.promoter)

    ex_shift_arr = zeros(160, 4)
    for i in 1:nrow(_df)
        ex_shift_arr .+= a[i] .* b[i]
    end

    # `sum(b, dims=1)[1]` is the element-wise sum of all one-hot matrices
    return ex_shift_arr ./ sum(b, dims=1)[1]
end

expression_shifts (generic function with 1 method)

In [8]:
# Growth condition labels keyed by the sample number used in the file names
gc_dict = Dict{Int, String}(
    1  => "M9 + 0.5% Glucose",
    2  => "M9 + 0.5% Xylose",
    3  => "M9 + 0.5% Arabinose",
    5  => "M9 + 0.5% Glucose + 2.5mM Sodium Salicytate",
    6  => "M9 + 0.5% Glucose + 5% Ethanol",
    7  => "M9 + 0.5% Glucose + Stationary Phase (1d)",
    10 => "M9 + 0.5% Glucose + 2mg/L Ampicillin",
    12 => "M9 + 0.5% Glucose + H2O2 (0.1 mM)",
    13 => "LB",
)

Dict{Int64, String} with 9 entries:
  5  => "M9 + 0.5% Glucose + 2.5mM Sodium Salicytate"
  13 => "LB"
  6  => "M9 + 0.5% Glucose + 5% Ethanol"
  7  => "M9 + 0.5% Glucose + Stationary Phase (1d)"
  2  => "M9 + 0.5% Xylose"
  10 => "M9 + 0.5% Glucose + 2mg/L Ampicillin"
  12 => "M9 + 0.5% Glucose + H2O2 (0.1 mM)"
  3  => "M9 + 0.5% Arabinose"
  1  => "M9 + 0.5% Glucose"

In [11]:
# Result tables, filled by the loop below
df_footprints = DataFrame()
df_exshift = DataFrame()

# Constant metadata written into every result row
date = "2023_09_07"
source = "genome"
rep = "1"

# Every (base index, position) pair, with the base index varying fastest
ind = vec(collect(Iterators.product(1:4, -115:44)))

# Process one growth condition at a time
for cond_id in [1, 2, 3, 5, 6, 7, 10, 12, 13]
    cond_label = gc_dict[cond_id]

    # Barcode counts from the DNA library (count first, barcode second)
    dna_counts = CSV.read(
        "../../../data/extracted_barcodes/20230907_barcode/D$(cond_id)_collapsed.txt",
        DataFrame;
        header=["ct_0", "barcode"],
        delim=" ",
        ignorerepeated=true,
    )
    # Barcode counts from the RNA library
    rna_counts = CSV.read(
        "../../../data/extracted_barcodes/20230907_barcode/R$(cond_id)_collapsed.txt",
        DataFrame;
        header=["ct_1", "barcode"],
        delim=" ",
        ignorerepeated=true,
    )

    # Keep barcodes seen in either library; a barcode absent from one gets 0 there
    counts = outerjoin(dna_counts, rna_counts; on=:barcode)
    replace!(counts.ct_0, missing => 0)
    replace!(counts.ct_1, missing => 0)

    # Keep only barcodes that are present in the mapping table
    counts = innerjoin(counts, df_map; on=:barcode)

    # Total counts and RNA/DNA ratio with a pseudo count of 1 on both sides
    insertcols!(counts, 1, :ct => counts.ct_0 .+ counts.ct_1)
    insertcols!(counts, 1, :relative_counts => (counts.ct_1 .+ 1) ./ (counts.ct_0 .+ 1))

    # Integer encoded variant and wild type sequences
    insertcols!(counts, 3, :int_promoter => wgregseq.footprints.make_int.(counts.promoter))
    insertcols!(counts, 3, :int_wt => wgregseq.footprints.make_int.(counts.wt_seq))

    for prom_name in unique(counts.name)
        wt_string = first(df_wt[df_wt.promoter .== prom_name, :promoter_seq])
        println(wt_string)
        # (base index, wild type character) pairs in the same order as `ind`
        wt_pairs = vec(collect(Iterators.product(1:4, collect(wt_string))))

        # Rows belonging to this promoter; each consumer gets its own copy
        is_prom = counts.name .== prom_name
        mut_info = wgregseq.footprints.mutual_information_mutation(counts[is_prom, :])
        shifts = expression_shifts(counts[is_prom, :])
        # Scale by the largest absolute entry
        shifts ./= maximum(abs, shifts)

        # Unsmoothed footprint (d = 0)
        append!(
            df_footprints,
            DataFrame(;
                date=date,
                promoter=prom_name,
                source=source,
                footprint=mut_info,
                growth_condition=cond_label,
                replicate=rep,
                pos=collect(-115:44),
                d="0",
            ),
        )

        # Expression shifts, flattened so the base index varies fastest
        append!(
            df_exshift,
            DataFrame(;
                date=date,
                promoter=prom_name,
                source=source,
                expression_shift=vec(shifts'),
                growth_condition=cond_label,
                replicate=rep,
                pos=[pair[2] for pair in ind],
                base=[pair[1] for pair in ind],
                wt_base=[pair[2] for pair in wt_pairs],
            ),
        )

        # Footprints averaged over a window of 2d + 1 positions
        for d in 1:2
            smoothed = [mean(mut_info[(k - d):(k + d)]) for k in (1 + d):(160 - d)]
            append!(
                df_footprints,
                DataFrame(;
                    date=date,
                    promoter=prom_name,
                    source=source,
                    footprint=smoothed,
                    growth_condition=cond_label,
                    replicate=rep,
                    pos=collect((-115 + d):(44 - d)),
                    d="$d",
                ),
            )
        end
    end
end

GATGACGACAAACTCATTCATCTCATGGCTGCCGTACGAGACTGTGAGTGGTCAGACGATAACGCACTCACCATAAATGTGCAGTTTAATGATTTCCCTGGATTTTATGACTGGATGGATTACCCTGATCACCCGGTTAAGTTTGTTTTTCACATACTTG
TGCTGGACTCATTCGGCATCGGCGCTACAGAAGATGCAGAACGCTTTGGTGACGTCGGGGCTGACACCCTGGGTCATATCGCAGAAGCTTGTGCCAAAGGCGAAGCTGATAACGGTCGTAAAGGCCCGCTCAATCTGCCAAATCTGACCCGTCTGGGGCT
GAACGTGAACATAAAACTGGTTCATTAAGTTTCCTTGCTTGCTGGCTGTGTGAGAACTCCAGCATACCACCGAGCCTGAAGTGGTGAAAAGACAGGCACATAACAGCTAAGTATTTTCAACCAGAGAGAATCCTTAGCGTTGTGGTGAATGCGGCTCAGC
GGTCTGTTCGGCTGTGTCTTCAAATAGACGAACGTGAACATAAAACTGGTTCATTAAGTTTCCTTGCTTGCTGGCTGTGTGAGAACTCCAGCATACCACCGAGCCTGAAGTGGTGAAAAGACAGGCACATAACAGCTAAGTATTTTCAACCAGAGAGAAT
TTCATCGTCAATACGCAGGTTAATGCTACCCATGTCTTACACCTCTTGTAATTACAAATGTCATTACAAGTATCGCACTACAACATGCTTAGGGCAAGTCACGAAGGAAGTCAGAAAGTAGTCGTAAGAACGGTGATCACTGTCCGCTTTGTGCCAGGAG
GTACCTCAGCAGGTGAATAAATTTTATTCATATTGTTATCAACAAGTTATCAAGTATTTTTAATTAAAATGGAAATTGTTTTTGATTTTGCATTTTAAATGAGTAGTCTTAGTTGTGCTGAACGAAAAGAGCACAACGATCCTTCGTTCACAGTGGGGAA
CGAGGATGTGTTGGCGCGTTTCTTGCGCTTCTTG

LoadError: InterruptException:

In [37]:
# The raw footprint values are mutual information; free the `footprint` name
rename!(df_footprints, "footprint" => "mut_info")

Row,date,promoter,source,mut_info,growth_condition,replicate,pos,d
Unnamed: 0_level_1,String,String,String,Float64,String,String,Int64,String
1,2023_09_07,TSS_1414_storz_regulondb,genome,0.00015366,M9 + 0.5% Glucose,1,-115,0
2,2023_09_07,TSS_1414_storz_regulondb,genome,0.000311656,M9 + 0.5% Glucose,1,-114,0
3,2023_09_07,TSS_1414_storz_regulondb,genome,0.000240944,M9 + 0.5% Glucose,1,-113,0
4,2023_09_07,TSS_1414_storz_regulondb,genome,1.37081e-5,M9 + 0.5% Glucose,1,-112,0
5,2023_09_07,TSS_1414_storz_regulondb,genome,1.37036e-6,M9 + 0.5% Glucose,1,-111,0
6,2023_09_07,TSS_1414_storz_regulondb,genome,0.000729594,M9 + 0.5% Glucose,1,-110,0
7,2023_09_07,TSS_1414_storz_regulondb,genome,5.42769e-8,M9 + 0.5% Glucose,1,-109,0
8,2023_09_07,TSS_1414_storz_regulondb,genome,0.000187899,M9 + 0.5% Glucose,1,-108,0
9,2023_09_07,TSS_1414_storz_regulondb,genome,0.000202271,M9 + 0.5% Glucose,1,-107,0
10,2023_09_07,TSS_1414_storz_regulondb,genome,3.84158e-5,M9 + 0.5% Glucose,1,-106,0


In [38]:
# Add two zero-filled columns right after `mut_info`: `footprint_test`, then `footprint`
insertcols!(
    df_footprints,
    5,
    :footprint_test => zeros(nrow(df_footprints)),
    :footprint => zeros(nrow(df_footprints)),
)

Row,date,promoter,source,mut_info,footprint_test,footprint,growth_condition,replicate,pos,d
Unnamed: 0_level_1,String,String,String,Float64,Float64,Float64,String,String,Int64,String
1,2023_09_07,TSS_1414_storz_regulondb,genome,0.00015366,0.0,0.0,M9 + 0.5% Glucose,1,-115,0
2,2023_09_07,TSS_1414_storz_regulondb,genome,0.000311656,0.0,0.0,M9 + 0.5% Glucose,1,-114,0
3,2023_09_07,TSS_1414_storz_regulondb,genome,0.000240944,0.0,0.0,M9 + 0.5% Glucose,1,-113,0
4,2023_09_07,TSS_1414_storz_regulondb,genome,1.37081e-5,0.0,0.0,M9 + 0.5% Glucose,1,-112,0
5,2023_09_07,TSS_1414_storz_regulondb,genome,1.37036e-6,0.0,0.0,M9 + 0.5% Glucose,1,-111,0
6,2023_09_07,TSS_1414_storz_regulondb,genome,0.000729594,0.0,0.0,M9 + 0.5% Glucose,1,-110,0
7,2023_09_07,TSS_1414_storz_regulondb,genome,5.42769e-8,0.0,0.0,M9 + 0.5% Glucose,1,-109,0
8,2023_09_07,TSS_1414_storz_regulondb,genome,0.000187899,0.0,0.0,M9 + 0.5% Glucose,1,-108,0
9,2023_09_07,TSS_1414_storz_regulondb,genome,0.000202271,0.0,0.0,M9 + 0.5% Glucose,1,-107,0
10,2023_09_07,TSS_1414_storz_regulondb,genome,3.84158e-5,0.0,0.0,M9 + 0.5% Glucose,1,-106,0


In [14]:
# Inspect the expression shifts of a single promoter
filter(:promoter => ==("araCp"), df_exshift)

Row,date,promoter,source,expression_shift,growth_condition,replicate,pos,base,wt_base
Unnamed: 0_level_1,String,String,String,Float64,String,String,Int64,Int64,Char
1,2023_09_07,araCp,genome,-0.0502675,M9 + 0.5% Glucose,1,-115,1,G
2,2023_09_07,araCp,genome,0.278936,M9 + 0.5% Glucose,1,-115,2,G
3,2023_09_07,araCp,genome,0.0,M9 + 0.5% Glucose,1,-115,3,G
4,2023_09_07,araCp,genome,-0.04113,M9 + 0.5% Glucose,1,-115,4,G
5,2023_09_07,araCp,genome,-0.0782516,M9 + 0.5% Glucose,1,-114,1,T
6,2023_09_07,araCp,genome,-0.0421389,M9 + 0.5% Glucose,1,-114,2,T
7,2023_09_07,araCp,genome,0.212821,M9 + 0.5% Glucose,1,-114,3,T
8,2023_09_07,araCp,genome,0.0,M9 + 0.5% Glucose,1,-114,4,T
9,2023_09_07,araCp,genome,0.0568218,M9 + 0.5% Glucose,1,-113,1,C
10,2023_09_07,araCp,genome,0.0,M9 + 0.5% Glucose,1,-113,2,C


In [43]:
# Export both result tables for the interactive footprint figure
df_footprints |> CSV.write("../../figures/interactive_footprints/20230907_footprints.csv")
df_exshift |> CSV.write("../../figures/interactive_footprints/20230907_exshifts.csv")

"../../figures/interactive_footprints/20230907_exshifts.csv"

## Metadata

In [41]:
# Re-import the Twist order for the metadata table
df_twist = wgregseq.utils.import_twist_order("../../../data/twist_orders/2022-02-15_twist_order.csv")
unique(df_twist[!, [:genes, :promoter]])

# Every 1501st row of the order, starting at row 1
wt_seqs = df_twist[1:1501:end, :]

# Import genome: only the first FASTA record is used. The do-block closes the
# reader once the record is read instead of leaving the file handle open.
wt_sequence = open(FASTA.Reader, "../../../data/ecocyc/mg1655_genome.fasta") do reader
    sequence(first(reader))
end

"AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGTCACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTACACAACATCCATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAACGGTGCGGGCTGACGCGTACAGGAAAC" ⋯ 4641115 bytes ⋯ "GATGGTCTATTGCTATCAATTAGCAACATTAATACAACAACCGGCGAAAAGTGATGCAACGGCAGACCAACATCAACTGCAAGCTTTACGCGAACGAGCCATGACATTGCTGACGACTCTGGCAGTGGCAGATGACATAAAACTGGTCGACTGGTTACAACAACGCCTGGGGCTTTTAGAGCAACGAGACACGGCAATGTTGCACCGTTTGCTGCATGATATTGAAAAAAATATCACCAAATAAAAAACGCCTTAGTAAGTATTTTTC"

In [44]:
# Per-promoter metadata columns, filled row by row below
five_prime = []
three_prime = []
direction = []
promoter = []
genes = []
promoter_seq = []

for idx in 1:119
    entry = wt_seqs[idx, :]
    region = entry.sequence[27:186]

    # Search the genome for the promoter region on both strands
    hits_fwd = findall(string(region), string(wt_sequence))
    hits_rev = findall(reverse_complement(string(region)), string(wt_sequence))
    found_fwd = !isempty(hits_fwd)
    found_rev = !isempty(hits_rev)

    # The region has to be found on exactly one strand
    if !found_fwd && !found_rev
        error("Sequence not found for promoter $(entry.promoter)!")
    elseif found_fwd && found_rev
        error("Sequence found on both strands for promoter $(entry.promoter)!")
    end

    if found_fwd
        # Forward strand: the match starts at the 5' end
        hit = hits_fwd[1]
        push!(five_prime, first(hit))
        push!(three_prime, last(hit))
        push!(direction, '+')
    else
        # Reverse strand: the match starts at the 3' end
        hit = hits_rev[1]
        push!(three_prime, first(hit))
        push!(five_prime, last(hit))
        push!(direction, '-')
    end

    # Columns that do not depend on the strand
    push!(promoter, entry.promoter)
    push!(genes, join(entry.genes, ", "))
    push!(promoter_seq, region)
end

df_meta = DataFrame(;
    five_prime=five_prime,
    three_prime=three_prime,
    direction=direction,
    promoter=promoter,
    genes=genes,
    promoter_seq=promoter_seq,
)
CSV.write("../../figures/interactive_footprints/20230907_footprints_meta.csv", df_meta)

"../../figures/interactive_footprints/20230907_footprints_meta.csv"