In [19]:
using CSV
using DataFrames
#using DataFramesMeta
using Plots

# Read alignment tables of S58

In [20]:
#df = CSV.read("file.csv", DataFrame) ## a faster way
rd58_df = DataFrame(CSV.File("S94_RD58_stat_summary.xls"))
rd58_50bp_df = DataFrame(CSV.File("S94_RD58_50bp_stat_summary.xls"))
rd58_50bp_1m_df = DataFrame(CSV.File("S94_RD58_50bp_1M_stat_summary.xls"))
rd58_50bp_2m_df = DataFrame(CSV.File("S94_RD58_50bp_2M_stat_summary.xls"))
rd58_50bp_5m_df = DataFrame(CSV.File("S94_RD58_50bp_5M_stat_summary.xls"))

Unnamed: 0_level_0,sample,species,reads,ribo_reads,ercc_reads,ercc_reads(%)
Unnamed: 0_level_1,String,String,Int64,String,String,String
1,RD58-100pg_S7_50bp_5M_L002,Mus_musculus,5000000,,,
2,RD58-10ng_S10_50bp_5M_L002,Mus_musculus,5000000,,,
3,RD58-10pg_S6_50bp_5M_L002,Mus_musculus,2216467,,,
4,RD58-1ng_S9_50bp_5M_L002,Mus_musculus,5000000,,,
5,RD58-500pg_S8_50bp_5M_L002,Mus_musculus,5000000,,,


# A summary table of S58

In [21]:
#merge tables
rd58_df = vcat(rd58_df, rd58_50bp_df, rd58_50bp_1m_df, rd58_50bp_2m_df, rd58_50bp_5m_df)

#remove "_L002"
rd58_df[!, "sample"] = replace.(rd58_df[!, "sample"], r"_L002" => "")

#sort rows in predefined order: S6, S7,..., S10
myOrder = [
    "RD58-10pg_S6", 
    "RD58-100pg_S7", 
    "RD58-500pg_S8", 
    "RD58-1ng_S9", 
    "RD58-10ng_S10",
    
    "RD58-10pg_S6_50bp", 
    "RD58-100pg_S7_50bp", 
    "RD58-500pg_S8_50bp", 
    "RD58-1ng_S9_50bp",
    "RD58-10ng_S10_50bp",
    
    "RD58-10pg_S6_50bp_1M", 
    "RD58-100pg_S7_50bp_1M", 
    "RD58-500pg_S8_50bp_1M", 
    "RD58-1ng_S9_50bp_1M", 
    "RD58-10ng_S10_50bp_1M",
    
    "RD58-10pg_S6_50bp_2M", 
    "RD58-100pg_S7_50bp_2M", 
    "RD58-500pg_S8_50bp_2M", 
    "RD58-1ng_S9_50bp_2M", 
    "RD58-10ng_S10_50bp_2M", 
    
    "RD58-100pg_S7_50bp_5M", 
    "RD58-500pg_S8_50bp_5M", 
    "RD58-1ng_S9_50bp_5M", 
    "RD58-10ng_S10_50bp_5M"
]
rd58_df = rd58_df[indexin(myOrder, rd58_df.sample),:]

Unnamed: 0_level_0,sample,species,reads,ribo_reads,ercc_reads,ercc_reads(%)
Unnamed: 0_level_1,String,String,Int64,String,String,String
1,RD58-10pg_S6,Mus_musculus,2216467,,,
2,RD58-100pg_S7,Mus_musculus,6096257,,,
3,RD58-500pg_S8,Mus_musculus,6870582,,,
4,RD58-1ng_S9,Mus_musculus,5381029,,,
5,RD58-10ng_S10,Mus_musculus,12590390,,,
6,RD58-10pg_S6_50bp,Mus_musculus,2216467,,,
7,RD58-100pg_S7_50bp,Mus_musculus,6096257,,,
8,RD58-500pg_S8_50bp,Mus_musculus,6870582,,,
9,RD58-1ng_S9_50bp,Mus_musculus,5381029,,,
10,RD58-10ng_S10_50bp,Mus_musculus,12590390,,,


# Plots of S58

In [43]:
# An example of data plots
gr()
plot(rd58_df[!, "sample"], 
    rd58_df[!, "clean_reads"], 
    marker = :circle,
    legend = :topleft, 
    background_color_legend = :transparent,
    #foreground_color_legend = nothing,
    xlabel = "sample", 
    ylabel = "clean reads", 
    label = "clean",
    #label=false, 
    right_margin = 15Plots.mm,
    xrotation = 45, 
    xtickfont = font(6),
    xticks = :all)

newColumn = parse.(Float64, replace.(rd58_df[!, "discarded(%)"], r"%" => ""))
plot!(twinx(),
    newColumn,
    marker = :circle,
    legend = :topright,
    background_color_legend = :transparent,
    #foreground_color_legend = nothing,
    ylabel ="reads discarded(%)",
    #ylim = (0, 100),
    label = "discarded",
    xaxis = nothing,
    color = :red,
    right_margin = 15Plots.mm
)
#save figures
#savefig("filename.pdf")

In [46]:
plotlyjs()

Plots.PlotlyJSBackend()

In [51]:
# uniquely_mapped
newColumn = parse.(Float64, replace.(rd58_df[!, "uniquely_mapped"], r"%" => ""))
plot(rd58_df[!, "sample"], 
    newColumn, 
    marker = :circle,
    xlabel = "Input", 
    ylabel = "Uniquely mapped(%)",
    ylim = (0, 100),
    label = false, 
    xrotation = 45, 
    xticks = :all)
#savefig("filename2.pdf")

In [25]:
# duplicate_rate
newColumn = parse.(Float64, replace.(rd58_df[!, "duplicate_rate"], r"%" => ""))
plot(rd58_df[!, "sample"], 
    newColumn, 
    marker = :circle,
    xlabel = "Input", 
    ylabel = "Duplicate rate(%)",
    ylim = (0, 100),
    label = false, 
    xrotation = 45, 
    xticks = :all)

In [26]:
# Total genes detected
newColumn = rd58_df[!, "protein-coding_gene_mapped(21859)"]
plot(rd58_df[!, "sample"], 
    newColumn, 
    marker = :circle,
    xlabel = "Input", 
    ylabel = "Total genes detected",
    label = false, 
    xrotation = 45, 
    xticks = :all)

# Read alignment tables of S59

In [27]:
# Start the work for S59 As above
rd59_df = DataFrame(CSV.File("S94_RD59_stat_summary.xls"))
rd59_50bp_df = DataFrame(CSV.File("S94_RD59_50bp_stat_summary.xls"))
rd59_50bp_1m_df = DataFrame(CSV.File("S94_RD59_50bp_1M_stat_summary.xls"))
rd59_50bp_2m_df = DataFrame(CSV.File("S94_RD59_50bp_2M_stat_summary.xls"))
rd59_50bp_5m_df = DataFrame(CSV.File("S94_RD59_50bp_5M_stat_summary.xls"))

Unnamed: 0_level_0,sample,species,reads,ribo_reads,ercc_reads,ercc_reads(%)
Unnamed: 0_level_1,String,String,Int64,String,String,String
1,RD59-1-ng_S4_50bp_5M_L002,Homo_sapiens,5000000,,,
2,RD59-10-pg_S1_50bp_5M_L002,Homo_sapiens,5000000,,,
3,RD59-100-pg_S2_50bp_5M_L002,Homo_sapiens,5000000,,,
4,RD59-25-ng_S5_50bp_5M_L002,Homo_sapiens,5000000,,,
5,RD59-500-pg_S3_50bp_5M_L002,Homo_sapiens,5000000,,,


# A summary table of S59

In [28]:
rd59_df = vcat(rd59_df, rd59_50bp_df, rd59_50bp_1m_df, rd59_50bp_2m_df, rd59_50bp_5m_df)

rd59_df[!, "sample"]  = replace.(replace.(replace.(rd59_df[!, "sample"], r"_L002" => ""), r"-ng" => "ng"), r"-pg" => "pg")

myOrder2 = [
     "RD59-10pg_S1",
     "RD59-100pg_S2",
     "RD59-500pg_S3",
     "RD59-1ng_S4",
     "RD59-25ng_S5",
    
     "RD59-10pg_S1_50bp",
     "RD59-100pg_S2_50bp",
     "RD59-500pg_S3_50bp",
     "RD59-1ng_S4_50bp",
     "RD59-25ng_S5_50bp",
    
     "RD59-10pg_S1_50bp_1M",
     "RD59-100pg_S2_50bp_1M",
     "RD59-500pg_S3_50bp_1M",
     "RD59-1ng_S4_50bp_1M",
     "RD59-25ng_S5_50bp_1M",

     "RD59-10pg_S1_50bp_2M",
     "RD59-100pg_S2_50bp_2M",
     "RD59-500pg_S3_50bp_2M",
     "RD59-1ng_S4_50bp_2M",
     "RD59-25ng_S5_50bp_2M",

     "RD59-10pg_S1_50bp_5M",
     "RD59-100pg_S2_50bp_5M",
     "RD59-500pg_S3_50bp_5M",
     "RD59-1ng_S4_50bp_5M",
     "RD59-25ng_S5_50bp_5M",
]

rd59_df = rd59_df[indexin(myOrder2, rd59_df.sample),:]

Unnamed: 0_level_0,sample,species,reads,ribo_reads,ercc_reads,ercc_reads(%)
Unnamed: 0_level_1,String,String,Int64,String,String,String
1,RD59-10pg_S1,Homo_sapiens,11787482,,,
2,RD59-100pg_S2,Homo_sapiens,15932075,,,
3,RD59-500pg_S3,Homo_sapiens,7778651,,,
4,RD59-1ng_S4,Homo_sapiens,9324827,,,
5,RD59-25ng_S5,Homo_sapiens,16158726,,,
6,RD59-10pg_S1_50bp,Homo_sapiens,11787482,,,
7,RD59-100pg_S2_50bp,Homo_sapiens,15932075,,,
8,RD59-500pg_S3_50bp,Homo_sapiens,7778651,,,
9,RD59-1ng_S4_50bp,Homo_sapiens,9324827,,,
10,RD59-25ng_S5_50bp,Homo_sapiens,16158726,,,


# Plots of S59

In [29]:
#uniquely_mapped
newColumn = parse.(Float64, replace.(rd59_df[!, "uniquely_mapped"], r"%" => ""))
plot(rd59_df[!, "sample"], 
    newColumn, 
    marker = :circle,
    xlabel = "Input", 
    ylabel = "Uniquely mapped(%)",
    ylim = (0, 100),
    label = false, 
    xrotation = 45, 
    xticks = :all)

In [30]:
#duplicate_rate
newColumn = parse.(Float64, replace.(rd59_df[!, "duplicate_rate"], r"%" => ""))
plot(rd59_df[!, "sample"], 
    newColumn, 
    marker = :circle,
    xlabel = "Input", 
    ylabel = "Duplicate rate(%)",
    ylim = (0, 100),
    label = false, 
    xrotation = 45, 
    xticks = :all)

In [31]:
#Total genes detected
newColumn = rd59_df[!, "protein-coding_gene_mapped(19954)"]
plot(rd59_df[!, "sample"], 
    newColumn, 
    marker = :circle,
    xlabel = "Input", 
    ylabel = "Total genes detected",
    label = false, 
    xrotation = 45, 
    xticks = :all)

# Plots of the '150bp' groups in S58 and S59

In [32]:
rd58_150bp_unique = rd58_df[!, ["sample", "uniquely_mapped"]][1:5, :]
rd59_150bp_unique = rd59_df[!, ["sample", "uniquely_mapped"]][1:5, :]
rd_150bp_unique_merge = hcat(rd58_150bp_unique, rd59_150bp_unique, makeunique=true)

unique1 = parse.(Float64, replace.(rd_150bp_unique_merge[!, "uniquely_mapped"], r"%" => ""))
unique2 = parse.(Float64, replace.(rd_150bp_unique_merge[!, "uniquely_mapped_1"], r"%" => ""))

plot(
    [unique1, unique2], 
    marker = :circle,
    xlabel = "RNA input", 
    ylabel = "uniquely mapped (%)", 
    ylim = (0, 100),
    label = ["RD58" "RD59"],
    legend=:right,
    xrotation = 45,
    xticks = (1:5, ("10pg", "100pg", "500pg", "1ng", "10ng/25ng"))
    )

In [33]:
rd58_150bp_duplicate = rd58_df[!, ["sample", "duplicate_rate"]][1:5, :]
rd59_150bp_duplicate = rd59_df[!, ["sample", "duplicate_rate"]][1:5, :]
rd_150bp_duplicate_merge = hcat(rd58_150bp_duplicate, rd59_150bp_duplicate, makeunique=true)

duplicate1 = parse.(Float64, replace.(rd_150bp_duplicate_merge[!, "duplicate_rate"], r"%" => ""))
duplicate2 = parse.(Float64, replace.(rd_150bp_duplicate_merge[!, "duplicate_rate_1"], r"%" => ""))

plot(
    [duplicate1, duplicate2], 
    marker = :circle,
    xlabel = "RNA input", 
    ylabel = "duplicate rate (%)", 
    ylim = (0, 100),
    label = ["RD58" "RD59"],
    legend=:right,
    xrotation = 45,
    xticks = (1:5, ("10pg", "100pg", "500pg", "1ng", "10ng/25ng"))
    )

In [34]:
rd58_150bp_gene = rd58_df[!, ["sample", "protein-coding_gene_mapped(21859)"]][1:5, :]
rd59_150bp_gene = rd59_df[!, ["sample", "protein-coding_gene_mapped(19954)"]][1:5, :]
rd_150bp_gene_merge = hcat(rd58_150bp_gene, rd59_150bp_gene, makeunique=true)

gene1 = rd_150bp_gene_merge[!, "protein-coding_gene_mapped(21859)"]
gene2 = rd_150bp_gene_merge[!, "protein-coding_gene_mapped(19954)"]

plot(
    [gene1, gene2], 
    marker = :circle,
    xlabel = "RNA input", 
    ylabel = "protein-coding genes detected", 
    label = ["RD58" "RD59"],
    legend=:right,
    xrotation = 45,
    xticks = (1:5, ("10pg", "100pg", "500pg", "1ng", "10ng/25ng"))
    )

# Plots of the '50bp' groups in S58 and S59

In [35]:
# Total genes detected
rd58_50bp_gene = rd58_df[!, ["sample", "protein-coding_gene_mapped(21859)"]][6:end, :]
allowmissing!(rd58_50bp_gene)
insert!.(eachcol(rd58_50bp_gene), 16, ["RD58-10pg_S6_50bp_5M", missing])
#rd58_50bp_gene

rd59_50bp_gene = rd59_df[!, ["sample", "protein-coding_gene_mapped(19954)"]][6:end, :]

rd_50bp_gene_merge = hcat(rd58_50bp_gene, rd59_50bp_gene, makeunique=true)
#names(rd_50bp_gene_merge)

Unnamed: 0_level_0,sample,protein-coding_gene_mapped(21859),sample_1
Unnamed: 0_level_1,String?,Int64?,String
1,RD58-10pg_S6_50bp,10573,RD59-10pg_S1_50bp
2,RD58-100pg_S7_50bp,13988,RD59-100pg_S2_50bp
3,RD58-500pg_S8_50bp,15339,RD59-500pg_S3_50bp
4,RD58-1ng_S9_50bp,15466,RD59-1ng_S4_50bp
5,RD58-10ng_S10_50bp,16462,RD59-25ng_S5_50bp
6,RD58-10pg_S6_50bp_1M,10212,RD59-10pg_S1_50bp_1M
7,RD58-100pg_S7_50bp_1M,13210,RD59-100pg_S2_50bp_1M
8,RD58-500pg_S8_50bp_1M,13912,RD59-500pg_S3_50bp_1M
9,RD58-1ng_S9_50bp_1M,14072,RD59-1ng_S4_50bp_1M
10,RD58-10ng_S10_50bp_1M,14063,RD59-25ng_S5_50bp_1M


In [38]:
# Total genes detected
a = rd58_df[!, ["sample", "protein-coding_gene_mapped(21859)"]][6:end, :]
b = rd59_df[!, ["sample", "protein-coding_gene_mapped(19954)"]][6:end, :]

#ab = hcat(a, b, makeunique=true)

#outerjoin(grades_2020(), grades_2021(); on=:name)
outerjoin(a, b; on=:sample)

Unnamed: 0_level_0,sample,protein-coding_gene_mapped(21859),protein-coding_gene_mapped(19954)
Unnamed: 0_level_1,String,Int64?,Int64?
1,RD58-10pg_S6_50bp,10573,missing
2,RD58-100pg_S7_50bp,13988,missing
3,RD58-500pg_S8_50bp,15339,missing
4,RD58-1ng_S9_50bp,15466,missing
5,RD58-10ng_S10_50bp,16462,missing
6,RD58-10pg_S6_50bp_1M,10212,missing
7,RD58-100pg_S7_50bp_1M,13210,missing
8,RD58-500pg_S8_50bp_1M,13912,missing
9,RD58-1ng_S9_50bp_1M,14072,missing
10,RD58-10ng_S10_50bp_1M,14063,missing


In [36]:
gene1_50bp = rd_50bp_gene_merge[!, "protein-coding_gene_mapped(21859)"]
gene2_50bp = rd_50bp_gene_merge[!, "protein-coding_gene_mapped(19954)"]

input = ["10pg", "100pg", "500pg", "1ng", "10ng/25ng"]
subsample = ["all", "1M", "2M", "5M"]
groups = ["$i-$j" for i in subsample for j in input]

plot(
    [gene1_50bp, gene2_50bp], 
    marker = :circle,
    xlabel = "RNA input", 
    ylabel = "protein-coding genes detected", 
    label = ["RD58-50bp" "RD59-50bp"],
    legend=:top,
    xrotation = 45,
    xticks = (1:length(groups), groups)
    )