In [179]:
using CSV, DataFrames, wgregseq, CairoMakie, Statistics, DelimitedFiles, Dates, Colors, FASTX, BioSequences, StatsBase

wgregseq.plotting_style.default_makie!()
update_theme!(Theme(font = "Lato", titlefont="Lato"))
colors_dict = wgregseq.plotting_style.my_color_dict
colors = [colors_dict["blue1"], colors_dict["orange1"], colors_dict["green1"]]



3-element Vector{String}:
 "#324fa2"
 "#f47c20"
 "#a8cf38"

## Import mapping data

Import the table mapping barcodes to promoter variants and filter out anything that has less than 3 reads, as well as anything with more than 30 mutations.

In [2]:
df_map = CSV.read(
    "../../../data/barcodes/20220514_mapping/mapped_barcodes.csv", 
    DataFrame, 
);


In [3]:

df_map = df_map[df_map.name .!= "*", :]

gdf = groupby(df_map[(df_map.map_count .> 2), :], :barcode)
_df = DataFrame()
for df in gdf
    if nrow(df) == 1
        append!(_df, df)
    end
end
df_map = copy(_df);

In [4]:
df_seqs = wgregseq.utils.import_twist_order("../../../data/twist_orders/2022-02-15_twist_order.csv")
df_wt = df_seqs[1:1501:119*1501, :];
insertcols!(df_wt, 4, :promoter_seq => [string(x[27:186]) for x in df_wt.sequence])

df_wt.promoter_seq |> unique |> length

112

In [35]:
gc_dict = Dict("LB" => "LB", "M9-Xy-Ar"=>"Minimal Media + 0.5% Xylose + 0.5% Arabinose")
df_store = DataFrame()
date = "2023_05_25"
source = "plasmid"

for gc in ["LB", "M9-Xy-Ar"]
    for rep in [1, 2]
        df_DNA = CSV.read("../../../data/extracted_barcodes/20230525_barcode/$(gc)-$(rep)-DNA_collapsed.txt", DataFrame, ignorerepeated=true, delim=" ", header=["ct_0", "barcode"])
        df_RNA = CSV.read("../../../data/extracted_barcodes/20230525_barcode/$(gc)-$(rep)-RNA_collapsed.txt", DataFrame, ignorerepeated=true, delim=" ", header=["ct_1", "barcode"])

        df = outerjoin(df_DNA, df_RNA, on=:barcode)
        replace!(df.ct_0, missing => 0)
        replace!(df.ct_1, missing => 0)

        df = innerjoin(df, df_map, on=:barcode)
        insertcols!(df, 1, :ct => df.ct_0 .+ df.ct_1)
        for prom in df.name |> unique
            x = wgregseq.footprints.mutual_information_mutation(df[df.name .== prom, :])
            append!(
                df_store,
                DataFrame(date=date, promoter=prom, source=source, footprint=x, growth_condition=gc_dict[gc], replicate=rep, pos=collect(-115:44), d="0")
            )
            for d in 1:2
                append!(
                    df_store,
                    DataFrame(
                        date=date, 
                        promoter=prom, 
                        source=source, 
                        footprint=[mean(x[i-d:i+d]) for i in 1+d:160-d], 
                        growth_condition=gc_dict[gc], 
                        replicate=rep, 
                        pos=collect(-115+d:44-d), 
                        d="$d"
                    )
                )
            end
        end
    end
end


In [36]:
rename!(df_store, :footprint => :mut_info)

Row,date,promoter,source,mut_info,growth_condition,replicate,pos,d
Unnamed: 0_level_1,String,String,String,Float64,String,Int64,Int64,String
1,2023_05_25,TSS_1414_storz_regulondb,plasmid,7.09873e-5,LB,1,-115,0
2,2023_05_25,TSS_1414_storz_regulondb,plasmid,1.40207e-5,LB,1,-114,0
3,2023_05_25,TSS_1414_storz_regulondb,plasmid,7.19716e-5,LB,1,-113,0
4,2023_05_25,TSS_1414_storz_regulondb,plasmid,2.1791e-5,LB,1,-112,0
5,2023_05_25,TSS_1414_storz_regulondb,plasmid,1.16899e-5,LB,1,-111,0
6,2023_05_25,TSS_1414_storz_regulondb,plasmid,4.85577e-5,LB,1,-110,0
7,2023_05_25,TSS_1414_storz_regulondb,plasmid,2.45007e-6,LB,1,-109,0
8,2023_05_25,TSS_1414_storz_regulondb,plasmid,0.000179566,LB,1,-108,0
9,2023_05_25,TSS_1414_storz_regulondb,plasmid,0.000186077,LB,1,-107,0
10,2023_05_25,TSS_1414_storz_regulondb,plasmid,2.99843e-5,LB,1,-106,0


In [145]:
gc_dict = Dict("LB" => "LB", "M9XyAr"=>"Minimal Media + 0.5% Xylose + 0.5% Arabinose")

df_store_2 = DataFrame()
df_store_2_test = DataFrame()
for file in readdir("footprints")
    if split(file, '.')[end] == "csv"
        mcmc = CSV.read("footprints/" * file, DataFrame)[!, ["val_A", "val_C", "val_G", "val_T"]] |> Matrix
        prob_mat = exp.(mcmc) ./ sum(exp.(mcmc), dims=2)
        x = sum(prob_mat .* log2.(prob_mat ./ 0.25), dims=2) |> vec
        date = "2023_05_25"
        promoter = join(split(file, '_')[3:end-2], '_')

        source = "plasmid"
        gc = gc_dict[join(split(split(file, '_')[1], '-')[1:end-1])]
        replicate = parse(Int64, split(split(file, '_')[1], '-')[end])

        mut_info = df_store[
            (df_store.source .== source) .&
            (df_store.date .== date) .&
            (df_store.growth_condition .== gc) .&
            (df_store.promoter .== promoter) .&
            (df_store.replicate .== replicate) .&
            (df_store.d .== "0"), :mut_info
            ]

        if occursin("test", file)
            #promoter = promoter  * "_test"
            append!(df_store_2_test,
            DataFrame(date=date, promoter=promoter, source=source, footprint_test=x, growth_condition=gc, replicate=replicate, pos=collect(-115:44), d="0")
        )
        else
            append!(df_store_2,
                DataFrame(date=date, promoter=promoter, source=source, footprint=x, growth_condition=gc, replicate=replicate, pos=collect(-115:44), d="0")
            )
        end
        for d in 1:2
            if occursin("test", file)
                append!(
                    df_store_2_test,
                    DataFrame(
                        date=date, 
                        promoter=promoter, 
                        source=source, 
                        footprint_test=[mean(x[i-d:i+d]) for i in 1+d:160-d], 
                        growth_condition=gc, 
                        replicate=replicate, 
                        pos=collect(-115+d:44-d), 
                        d="$d",
                    )
                )
            else
                append!(
                    df_store_2,
                    DataFrame(
                        date=date, 
                        promoter=promoter, 
                        source=source, 
                        footprint=[mean(x[i-d:i+d]) for i in 1+d:160-d], 
                        growth_condition=gc, 
                        replicate=replicate, 
                        pos=collect(-115+d:44-d), 
                        d="$d",
                    )
                )
            end
        end
    end
end

df_store_2_test

Row,date,promoter,source,footprint_test,growth_condition,replicate,pos,d
Unnamed: 0_level_1,String,String,String,Float64,String,Int64,Int64,String
1,2023_05_25,TSS_1414_storz_regulondb,plasmid,6.46353e-7,LB,1,-115,0
2,2023_05_25,TSS_1414_storz_regulondb,plasmid,1.45443e-6,LB,1,-114,0
3,2023_05_25,TSS_1414_storz_regulondb,plasmid,2.53195e-6,LB,1,-113,0
4,2023_05_25,TSS_1414_storz_regulondb,plasmid,4.05344e-6,LB,1,-112,0
5,2023_05_25,TSS_1414_storz_regulondb,plasmid,6.5103e-6,LB,1,-111,0
6,2023_05_25,TSS_1414_storz_regulondb,plasmid,4.59973e-6,LB,1,-110,0
7,2023_05_25,TSS_1414_storz_regulondb,plasmid,1.68526e-6,LB,1,-109,0
8,2023_05_25,TSS_1414_storz_regulondb,plasmid,3.97764e-6,LB,1,-108,0
9,2023_05_25,TSS_1414_storz_regulondb,plasmid,5.66175e-6,LB,1,-107,0
10,2023_05_25,TSS_1414_storz_regulondb,plasmid,5.97017e-7,LB,1,-106,0


In [146]:
df_fin = outerjoin(df_store, df_store_2, df_store_2_test, on=[:date, :promoter, :source, :growth_condition, :replicate, :pos, :d])
replace!(df_fin.mut_info, missing => 0)
replace!(df_fin.footprint, missing => 0)
replace!(df_fin.footprint_test, missing => 0)

213300-element Vector{Union{Missing, Float64}}:
 6.463526298829017e-7
 6.463526298829017e-7
 1.4544335836654616e-6
 1.4544335836654616e-6
 2.5319471740252336e-6
 2.5319471740252336e-6
 4.053444842960167e-6
 4.053444842960167e-6
 6.510296146148772e-6
 6.510296146148772e-6
 4.599729153591985e-6
 4.599729153591985e-6
 1.685257348487323e-6
 ⋮
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [149]:
CSV.write("../../figures/20230525_footprints.csv", df_fin)

"../../figures/20230525_footprints.csv"

In [147]:
df_fin

Row,date,promoter,source,mut_info,growth_condition,replicate,pos,d,footprint,footprint_test
Unnamed: 0_level_1,String,String,String,Float64?,String,Int64,Int64,String,Float64?,Float64?
1,2023_05_25,TSS_1414_storz_regulondb,plasmid,7.09873e-5,LB,1,-115,0,9.86126e-7,6.46353e-7
2,2023_05_25,TSS_1414_storz_regulondb,plasmid,7.09873e-5,LB,1,-115,0,1.52265e-5,6.46353e-7
3,2023_05_25,TSS_1414_storz_regulondb,plasmid,1.40207e-5,LB,1,-114,0,3.87399e-7,1.45443e-6
4,2023_05_25,TSS_1414_storz_regulondb,plasmid,1.40207e-5,LB,1,-114,0,0.000107145,1.45443e-6
5,2023_05_25,TSS_1414_storz_regulondb,plasmid,7.19716e-5,LB,1,-113,0,3.83811e-8,2.53195e-6
6,2023_05_25,TSS_1414_storz_regulondb,plasmid,7.19716e-5,LB,1,-113,0,7.93982e-6,2.53195e-6
7,2023_05_25,TSS_1414_storz_regulondb,plasmid,2.1791e-5,LB,1,-112,0,4.93489e-7,4.05344e-6
8,2023_05_25,TSS_1414_storz_regulondb,plasmid,2.1791e-5,LB,1,-112,0,6.05985e-6,4.05344e-6
9,2023_05_25,TSS_1414_storz_regulondb,plasmid,1.16899e-5,LB,1,-111,0,6.2729e-7,6.5103e-6
10,2023_05_25,TSS_1414_storz_regulondb,plasmid,1.16899e-5,LB,1,-111,0,3.69685e-6,6.5103e-6


In [148]:
df_fin

Row,date,promoter,source,mut_info,growth_condition,replicate,pos,d,footprint,footprint_test
Unnamed: 0_level_1,String,String,String,Float64?,String,Int64,Int64,String,Float64?,Float64?
1,2023_05_25,TSS_1414_storz_regulondb,plasmid,7.09873e-5,LB,1,-115,0,9.86126e-7,6.46353e-7
2,2023_05_25,TSS_1414_storz_regulondb,plasmid,7.09873e-5,LB,1,-115,0,1.52265e-5,6.46353e-7
3,2023_05_25,TSS_1414_storz_regulondb,plasmid,1.40207e-5,LB,1,-114,0,3.87399e-7,1.45443e-6
4,2023_05_25,TSS_1414_storz_regulondb,plasmid,1.40207e-5,LB,1,-114,0,0.000107145,1.45443e-6
5,2023_05_25,TSS_1414_storz_regulondb,plasmid,7.19716e-5,LB,1,-113,0,3.83811e-8,2.53195e-6
6,2023_05_25,TSS_1414_storz_regulondb,plasmid,7.19716e-5,LB,1,-113,0,7.93982e-6,2.53195e-6
7,2023_05_25,TSS_1414_storz_regulondb,plasmid,2.1791e-5,LB,1,-112,0,4.93489e-7,4.05344e-6
8,2023_05_25,TSS_1414_storz_regulondb,plasmid,2.1791e-5,LB,1,-112,0,6.05985e-6,4.05344e-6
9,2023_05_25,TSS_1414_storz_regulondb,plasmid,1.16899e-5,LB,1,-111,0,6.2729e-7,6.5103e-6
10,2023_05_25,TSS_1414_storz_regulondb,plasmid,1.16899e-5,LB,1,-111,0,3.69685e-6,6.5103e-6


In [151]:
df_fin[df_fin.promoter .== "ompRp3", :]

Row,date,promoter,source,mut_info,growth_condition,replicate,pos,d,footprint,footprint_test
Unnamed: 0_level_1,String,String,String,Float64?,String,Int64,Int64,String,Float64?,Float64?
1,2023_05_25,ompRp3,plasmid,3.20638e-5,LB,1,-115,0,3.08372e-7,0.0
2,2023_05_25,ompRp3,plasmid,9.64902e-5,LB,1,-114,0,1.67331e-7,0.0
3,2023_05_25,ompRp3,plasmid,0.000248037,LB,1,-113,0,7.05064e-8,0.0
4,2023_05_25,ompRp3,plasmid,9.40705e-5,LB,1,-112,0,2.19596e-7,0.0
5,2023_05_25,ompRp3,plasmid,4.18643e-5,LB,1,-111,0,1.51027e-7,0.0
6,2023_05_25,ompRp3,plasmid,0.000155394,LB,1,-110,0,2.59579e-7,0.0
7,2023_05_25,ompRp3,plasmid,2.60541e-5,LB,1,-109,0,9.62142e-8,0.0
8,2023_05_25,ompRp3,plasmid,8.28819e-5,LB,1,-108,0,2.24269e-7,0.0
9,2023_05_25,ompRp3,plasmid,8.7656e-6,LB,1,-107,0,1.30273e-7,0.0
10,2023_05_25,ompRp3,plasmid,2.52449e-6,LB,1,-106,0,1.22126e-7,0.0


In [157]:
df_twist = wgregseq.utils.import_twist_order("../../../data/twist_orders/2022-02-15_twist_order.csv")
unique(df_twist[!, [:genes, :promoter]])

wt_seqs = df_twist[1:1501:end, :]

# Import genome
re = open(FASTA.Reader, "../../../data/ecocyc/mg1655_genome.fasta")
wt_sequence = [sequence(record) for record in re][1]

"AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGTCACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTACACAACATCCATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAACGGTGCGGGCTGACGCGTACAGGAAAC" ⋯ 4641115 bytes ⋯ "GATGGTCTATTGCTATCAATTAGCAACATTAATACAACAACCGGCGAAAAGTGATGCAACGGCAGACCAACATCAACTGCAAGCTTTACGCGAACGAGCCATGACATTGCTGACGACTCTGGCAGTGGCAGATGACATAAAACTGGTCGACTGGTTACAACAACGCCTGGGGCTTTTAGAGCAACGAGACACGGCAATGTTGCACCGTTTGCTGCATGATATTGAAAAAAATATCACCAAATAAAAAACGCCTTAGTAAGTATTTTTC"

In [175]:
occursin(string(wt_sequence), string(wt_seqs[1, :sequence][27:186]))

false

In [182]:
occursin(reverse_complement(string(wt_seqs[1, :sequence][27:186])), string(wt_sequence))

true

In [185]:
findall(reverse_complement(string(wt_seqs[1, :sequence][27:186])), string(wt_sequence)) |> length

1

In [191]:
Start = []
End = []
direction = []
promoter = []
for i in 1:119
    fwd = findall(string(wt_seqs[i, :sequence][27:186]), string(wt_sequence))
    rev = findall(reverse_complement(string(wt_seqs[i, :sequence][27:186])), string(wt_sequence))
    if length(fwd) == 0 && length(rev) == 0
        throw(ErrorException("Sequence not found for promoter $(wt_seqs[i, :promoter])!"))
    elseif length(fwd) > 0 && length(rev) > 0
        throw(ErrorException("Sequence found on both strands for promoter $(wt_seqs[i, :promoter])!"))
    elseif length(fwd) > 0
        push!(Start, fwd[1][1])
        push!(End, fwd[1][end])
        push!(direction, '+')
        push!(promoter, wt_seqs[i, :promoter])
    else
        push!(Start, rev[1][1])
        push!(End, rev[1][end])
        push!(direction, '-')
        push!(promoter, wt_seqs[i, :promoter])
    end
end

df_meta = DataFrame(location=location, direction=direction, promoter=promoter)

Row,location,direction,promoter
Unnamed: 0_level_1,Any,Any,Any
1,1655126:1655285,-,rspAp
2,70030:70189,-,araBp
3,1942520:1942679,+,znuCp
4,3730762:3730921,-,xylAp
5,3730955:3731114,+,xylFp
6,1647831:1647990,-,dicCp
7,1645858:1646017,-,relBp
8,932815:932974,+,ftsKp2
9,933024:933183,+,ftsKp1
10,367493:367652,-,lacIp


In [174]:
[string(wt_seqs[i, :sequence][26:187]) for i in 1:20]

20-element Vector{String}:
 "TTTTCATCTTTTGTCAACCATTCACAGCGCA" ⋯ 101 bytes ⋯ "CAAGGAATCGAACATGAAGATCGTAAAGGG"
 "TTCCACATTGATTATTTGCACGGCGTCACAC" ⋯ 101 bytes ⋯ "GAGTGAAACGATGGCGATTGCAATTGGCCG"
 "TGTGTTGCACCTCCCCAGAGAGCGGCGGATA" ⋯ 101 bytes ⋯ "CTAAAATTAACATGACAAGTCTGGTTTCCG"
 "TCCAGTTTCATCATTCCATTTTATTTTGCGA" ⋯ 101 bytes ⋯ "GCATTACCTGATTATGGAGTTCAATATGCG"
 "TAATTGCGCAACAAAAGTAAGATCTCGGTCA" ⋯ 101 bytes ⋯ "TGTTACTTATTAAAACTGTCCTCTAACTAG"
 "TGAGGTTTTTCCGACGATACCTGATGCGTTC" ⋯ 101 bytes ⋯ "GTGTTTTAAATGCTTAAAACTGACGCTCTG"
 "TTGGACTTAGCAATGGCTGCTCCTGGCACAA" ⋯ 101 bytes ⋯ "AAGAGGTGTAAGACATGGGTAGCATTAACG"
 "TTTCTGGTATTCGTTGAGATTACTCTGAATC" ⋯ 101 bytes ⋯ "AAAACACGCGTGCCGGATATGTCAGCCTAG"
 "TACACGGACATACGTTGTTATGGAAGAAGTC" ⋯ 101 bytes ⋯ "TGGGGTAACCTGGTACTGTTGTCCGTTTTG"
 "TGAAGAAGGGGTTGAATCGCAGGCTATTCTG" ⋯ 101 bytes ⋯ "GGGTGGTGAATGTGAAACCAGTAACGTTAG"
 "TGCAGGCTGGATGTCAGGGTGTTGTATTGCC" ⋯ 101 bytes ⋯ "TATAGGAAGGGTGTTTTCGGCTACAATCAG"
 "TGGTTAACAGCAGGCTGGATGTCAGGGTGTT" ⋯ 101 bytes ⋯ "GCGCTCAGTATAGGAAGGGTG