In [12]:
using wgregseq, CSV, DataFrames, CairoMakie, SparseArrays

# Apply the plotting defaults shipped with wgregseq before any figure is drawn.
# NOTE(review): presumably this sets a global Makie theme — confirm in wgregseq.plotting_style.
wgregseq.plotting_style.default_makie!()

In [8]:
# Load the barcode mapping table.
df_map = CSV.read(
    "../../../data/barcodes/20220514_mapping/mapped_barcodes.csv",
    DataFrame,
);

# Filter out unannotated sequences (rows whose name is "*")
df_map = df_map[df_map.name .!= "*", :]

# Filter out non-unique barcodes: among rows with map_count > 2, keep only the
# barcodes that occur in exactly one row. Filtering the grouped frame (instead
# of appending group by group to an empty DataFrame) keeps the column schema
# even when no barcode survives, so the join below still finds `name`.
gdf = groupby(df_map[df_map.map_count .> 2, :], :barcode)
df_map = filter(g -> nrow(g) == 1, gdf; ungroup=true);

# Get twist order to get wild type sequences
df_seqs = wgregseq.utils.import_twist_order("../../../data/twist_orders/2022-02-15_twist_order.csv")
# Take every 1501st row, 119 rows in total.
# NOTE(review): presumably the first entry of each promoter's block is the wild type — confirm.
df_wt = df_seqs[1:1501:119*1501, :];
# Positions 27:186 of each ordered sequence are stored as the promoter sequence.
insertcols!(df_wt, 4, :promoter_seq => [string(x[27:186]) for x in df_wt.sequence])

# Sanity check: number of distinct wild-type promoter sequences (value is not stored).
df_wt.promoter_seq |> unique |> length

# Attach the wild-type sequence to every mapped barcode via the promoter name.
df_map = leftjoin(
    df_map,
    rename(df_wt[!, [:promoter, :promoter_seq]], :promoter => :name),
    on=:name,
)
rename!(df_map, :promoter_seq => :wt_seq);

In [9]:
"""
    get_dataset(i; data_dir="../../../data/extracted_barcodes/20230907_barcode", mapping=df_map)

Load the collapsed barcode counts of dataset `i` and annotate them with `mapping`.

Reads `D<i>_collapsed.txt` (counts stored as `ct_0`) and `R<i>_collapsed.txt`
(counts stored as `ct_1`) from `data_dir`, outer-joins them on `barcode`, sets
counts that are absent from one file to 0, and inner-joins the result with
`mapping` on `barcode`.

# Keywords
- `data_dir`: directory containing the `D<i>`/`R<i>` count files.
- `mapping`: table with a `barcode` column plus the `promoter` and `wt_seq`
  columns used below; defaults to the notebook-level `df_map`.

# Returns
A `DataFrame` with the joined columns plus `relative_counts`
(`(ct_1 + 1) / (ct_0 + 1)`), `ct` (`ct_0 + ct_1`), `int_wt` and `int_promoter`.
"""
function get_dataset(
    i;
    data_dir="../../../data/extracted_barcodes/20230907_barcode",
    mapping=df_map,
)
    # Both files share one layout: "<count> <barcode>" separated by (repeated) spaces.
    read_counts(prefix, count_col) = CSV.read(
        joinpath(data_dir, "$(prefix)$(i)_collapsed.txt"),
        DataFrame;
        ignorerepeated=true,
        delim=" ",
        header=[count_col, "barcode"],
    )
    df_DNA = read_counts("D", "ct_0")
    df_RNA = read_counts("R", "ct_1")

    # merge DNA and RNA reads; barcodes seen in only one file get `missing` in the other
    df = outerjoin(df_DNA, df_RNA, on=:barcode)

    # replace missing reads with 0 (coalesce also narrows the eltype to plain integers)
    df.ct_0 = coalesce.(df.ct_0, 0)
    df.ct_1 = coalesce.(df.ct_1, 0)

    # identify promoter sequences; barcodes absent from the mapping are dropped
    df = innerjoin(df, mapping, on=:barcode)

    # compute total counts and the pseudocount-corrected ct_1/ct_0 ratio
    insertcols!(df, 1, :ct => df.ct_0 .+ df.ct_1)
    insertcols!(df, 1, :relative_counts => (df.ct_1 .+ 1) ./ (df.ct_0 .+ 1))

    # Turn sequences into integer
    insertcols!(df, 3, :int_promoter => wgregseq.footprints.make_int.(df.promoter))
    insertcols!(df, 3, :int_wt => wgregseq.footprints.make_int.(df.wt_seq))
    return df
end

get_dataset (generic function with 1 method)

In [10]:
# Load dataset 3 and keep only the rows annotated with the araBp promoter.
df = let all_rows = get_dataset(3)
    is_target = all_rows.name .== "araBp"
    all_rows[is_target, :]
end;

In [14]:
# Expand the per-row counts into one entry per read: row index i is repeated
# ct_0[i] (resp. ct_1[i]) times. A flattened comprehension avoids splatting one
# argument per row into `vcat`.
seq_vec_0 = [i for i in 1:nrow(df) for _ in 1:df.ct_0[i]]
mu0 = fill(0, length(seq_vec_0))

seq_vec_1 = [i for i in 1:nrow(df) for _ in 1:df.ct_1[i]]
mu1 = fill(1, length(seq_vec_1))

seq_vec = vcat(seq_vec_0, seq_vec_1)

# Response used for the fit: log10 of the pseudocount-corrected ct_1/ct_0 ratio,
# one value per row of `df`. (The per-read 0/1 labels mu0/mu1 are kept above but
# are no longer concatenated into `mu`, since that value was overwritten here.)
mu = log10.((df.ct_1 .+ 1) ./ (df.ct_0 .+ 1))

# Encode every promoter sequence and stack the encodings into a sparse matrix.
# NOTE(review): presumably one row per row of `df` — confirm against onehot_encoder.
seq_mat = sparse(vcat([vcat(wgregseq.utils.onehot_encoder.(seq)'...)' for seq in df.promoter]...));

In [17]:
# Run the sampler on the encoded sequences and the per-sequence log10 ratios:
# 10000 warm-up steps followed by 10000 iteration steps.
wgregseq.footprints.run_mcmc(
    seq_mat,
    mu;
    warmup_steps=10000,
    iter_steps=10000,
)

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39m started timer at: 2023-09-19T09:33:40.242


1000 of 20000 done.
2         1         

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39m         5.735163291s: 5 seconds, 735 milliseconds
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39m        65.698656375s: 1 minute, 5 seconds, 698 milliseconds


2000 of 20000 done.
2         1         

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39m        11.403397083s: 11 seconds, 403 milliseconds
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39m        71.366890167s: 1 minute, 11 seconds, 366 milliseconds


3000 of 20000 done.
2         1         

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39m        16.871826958s: 16 seconds, 871 milliseconds
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39m        76.835320042s: 1 minute, 16 seconds, 835 milliseconds


4000 of 20000 done.
2         1         

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39m        22.331657041s: 22 seconds, 331 milliseconds
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39m        82.295150125s: 1 minute, 22 seconds, 295 milliseconds


LoadError: InterruptException: