In [10]:
using CSV
using DataFrames
using Plots
using Statistics
using SurvivalAnalysis
using StatsPlots

In [None]:
# First we need to combine the clinical and modelling data.

# this function will combine clinical and modelling data using the patient_ID/sample ID to join them
function combineData(clinDataFile,modelDataFile,outputFile)
    # load two CSV files
    clinDF=DataFrame(CSV.File(clinDataFile))
    modelDF=DataFrame(CSV.File(modelDataFile))
    
    # get the name of all patients with clinical data
    # this assumes we've modelled more patients than we have data for
    clinicalPatients=clinDF[!,:patient_ID]
#     rename!(clinDF,:Sample => :patient_ID)
    
    # perform an inner join to create a single DF and save it
    newDF= innerjoin(clinDF, modelDF, on = :patient_ID)
    CSV.write(outputFile, newDF)
end

# Store `labeller(row)` in the :AAPPBreakdown column for every row of `df`.
function _labelRows!(labeller, df)
    for row in eachrow(df)
        row[:AAPPBreakdown] = labeller(row)
    end
    return df
end

# Return the first entry of `candidates` that is a column name of `df`, else `nothing`.
function _firstColumn(df, candidates)
    idx = findfirst(in(names(df)), candidates)
    return idx === nothing ? nothing : candidates[idx]
end

"""
    doBreakdown(thisDF, method, ppSpecies=nothing)

Fill the `:AAPPBreakdown` column of `thisDF` in place and return `thisDF`.

Each row is labelled from its `:AA` flag and its `<ppSpecies>Thres` flag (the
"PP" flag). `thisDF` must already contain the `:AAPPBreakdown` column and the
flag columns used by the chosen method.

# Arguments
- `thisDF`: data frame to label; it is modified in place.
- `method`: one of
  - `"AAPPBreakdown"`: `AAPP`, `AA`, `PP` or `other`.
  - `"AAonly"`: `AA` or `other`.
  - `"AAPPonly"`: `AAPP` or `other`.
  - `"AAandOr"`: `AAPP`, `AAorPP` or `other`.
  - `"AAppOther"`: `AA`, otherwise `PP`, otherwise `other`.
  - `"AAppCluster"`: `AAPP`, otherwise `235` / `MNB` for the listed clusters,
    otherwise `other`.
  - `"addAAPPToEachCluster"` / `"addAAToEachCluster"`: the cluster name, with
    `_AAPP` / `_AA` appended for flagged rows.
  - `"IPIAAPP"`: the merged IPI band, with `_AAPP` appended for AAPP rows. The
    IPI column itself is overwritten with the merged band.
- `ppSpecies`: name of the PP species. When omitted, the global `PPSpecies` is
  used, which keeps existing two-argument calls working.

The cluster methods use the `Cluster(number)` column if present, otherwise
`cluster_ICL`. The IPI method uses `IPI(number)` if present, otherwise `IPI`.
When the required column is missing or `method` is unknown, a message is
printed and `thisDF` is returned unchanged.
"""
function doBreakdown(thisDF, method, ppSpecies=nothing)
    # Resolved lazily so that methods which never read the PP flag still work
    # when neither `ppSpecies` nor the global `PPSpecies` is defined.
    ppColumn() = Symbol((ppSpecies === nothing ? PPSpecies : ppSpecies) * "Thres")
    isAA(row) = row[:AA] == true
    isPP(row) = row[ppColumn()] == true
    isAAPP(row) = isAA(row) && isPP(row)

    if method == "AAPPBreakdown"
        _labelRows!(thisDF) do row
            if isAA(row)
                isPP(row) ? "AAPP" : "AA"
            else
                isPP(row) ? "PP" : "other"
            end
        end
    elseif method == "AAonly"
        _labelRows!(thisDF) do row
            isAA(row) ? "AA" : "other"
        end
    elseif method == "AAPPonly"
        _labelRows!(thisDF) do row
            isAAPP(row) ? "AAPP" : "other"
        end
    elseif method == "AAandOr"
        _labelRows!(thisDF) do row
            if isAAPP(row)
                "AAPP"
            elseif isAA(row) || isPP(row)
                "AAorPP"
            else
                "other"
            end
        end
    elseif method == "AAppOther"
        _labelRows!(thisDF) do row
            if isAA(row)
                "AA"
            elseif isPP(row)
                "PP"
            else
                "other"
            end
        end
    elseif method in ("AAppCluster", "addAAPPToEachCluster", "addAAToEachCluster")
        clusterName = _firstColumn(thisDF, ["Cluster(number)", "cluster_ICL"])
        if clusterName === nothing
            println("can't find cluster info")
        elseif method == "AAppCluster"
            clusterCol = Symbol(clusterName)
            # Numbered and named cluster schemes each have their own set of
            # clusters that is reported as a single group.
            grouped, groupLabel = if clusterName == "Cluster(number)"
                ([2, 3, 5], "235")
            else
                (["MYD88", "NEC", "BCL2"], "MNB")
            end
            _labelRows!(thisDF) do row
                if isAAPP(row)
                    "AAPP"
                elseif row[clusterCol] in grouped
                    groupLabel
                else
                    "other"
                end
            end
        else
            clusterCol = Symbol(clusterName)
            wantsAAPP = method == "addAAPPToEachCluster"
            isFlagged = wantsAAPP ? isAAPP : isAA
            suffix = wantsAAPP ? "_AAPP" : "_AA"
            _labelRows!(thisDF) do row
                # `string` rather than `repr`: `repr` wraps string-valued
                # clusters in literal quote characters.
                base = string(row[clusterCol])
                isFlagged(row) ? base * suffix : base
            end
        end
    elseif method == "IPIAAPP"
        if "IPI(number)" in names(thisDF)
            ipiCol = Symbol("IPI(number)")
            # NOTE(review): the original comment listed the bands as
            # "0-1, 2, 3, 4-5", but the code merges 1 with 2 and 4 with 5.
            # Confirm which banding is intended.
            _labelRows!(thisDF) do row
                if row[ipiCol] in ("1", "2")
                    row[ipiCol] = String3("1/2")
                elseif row[ipiCol] in ("4", "5")
                    row[ipiCol] = String3("4/5")
                end
                # `string` rather than `repr`, so labels carry no quote characters.
                band = string(row[ipiCol])
                isAAPP(row) ? band * "_AAPP" : band
            end
        elseif "IPI" in names(thisDF)
            ipiCol = Symbol("IPI")
            _labelRows!(thisDF) do row
                # Everything that is not Low or Low/Intermediate counts as "Hi".
                if row[ipiCol] in ("Low", "Low/Intermediate")
                    row[ipiCol] = String3("Low")
                else
                    row[ipiCol] = String3("Hi")
                end
                band = string(row[ipiCol])
                isAAPP(row) ? band * "_AAPP" : band
            end
        else
            println("can't find IPI info")
        end
    else
        println("invalid method specified")
    end
    return thisDF
end

"""
    generateDF(file, AASpecies, PPSpecies, PPSpeciesDirection, method)

Load `file` and return a data frame holding the AA/PP categories per patient,
ready to be used for Kaplan-Meier plots.

Only the columns `patient_ID`, `OS_status`, `OS_time`, `PPSpecies` and the two
`AASpecies` are kept from the CSV file. The following columns are then added:

- `<PPSpecies>Thres`: value above the column mean when `PPSpeciesDirection` is
  `"high"`, below the mean for any other direction.
- `<AASpecies[1]>Thres`, `<AASpecies[2]>Thres`: value above the column mean.
- `AA`: `true` when neither AA species is above its mean.
- `AAPPBreakdown`: category assigned by `doBreakdown(thisDF, method)`.

NOTE: `doBreakdown` is called without a species name and looks up the PP flag
through the global `PPSpecies`, so that global must match the `PPSpecies`
argument passed here. Because of the column selection above, cluster and IPI
columns are not available to `doBreakdown` from this function.
"""
function generateDF(file,AASpecies,PPSpecies,PPSpeciesDirection,method)
    # Load the table and keep only the relevant columns (in file order).
    thisDF = DataFrame(CSV.File(file))
    wanted = ["patient_ID", "OS_status", "OS_time", PPSpecies, AASpecies[1], AASpecies[2]]
    thisDF = thisDF[:, filter(in(wanted), names(thisDF))]

    ppCol = Symbol(PPSpecies)
    aaCol1 = Symbol(AASpecies[1])
    aaCol2 = Symbol(AASpecies[2])

    # Every value is classified relative to the mean of its own column.
    ppMean = mean(thisDF[!, ppCol])
    aaMean1 = mean(thisDF[!, aaCol1])
    aaMean2 = mean(thisDF[!, aaCol2])

    ppFlag = if PPSpeciesDirection == "high"
        thisDF[!, ppCol] .> ppMean
    else
        thisDF[!, ppCol] .< ppMean
    end
    aaAbove1 = thisDF[!, aaCol1] .> aaMean1
    aaAbove2 = thisDF[!, aaCol2] .> aaMean2

    insertcols!(thisDF, Symbol(PPSpecies * "Thres") => ppFlag)
    insertcols!(thisDF, Symbol(AASpecies[1] * "Thres") => aaAbove1)
    insertcols!(thisDF, Symbol(AASpecies[2] * "Thres") => aaAbove2)

    # AA means that NEITHER species is above its mean. The previous expression,
    # `a .== false .& b .== false`, parsed as the chained comparison
    # `a .== (false .& b) .== false` because `.&` binds tighter than `.==`,
    # which reduced to `.!a` and ignored the second species entirely.
    insertcols!(thisDF, :AA => .!(aaAbove1 .| aaAbove2))
    insertcols!(thisDF, :AAPPBreakdown => "no")

    thisDF = doBreakdown(thisDF, method)
    return thisDF
end



In [None]:
# Driver: for every cohort file, combine clinical and model data, then write
# one patient-category CSV and one Kaplan-Meier TSV per breakdown method.

#dataFiles = ["C1","C2strict","C2orig","C2complete","C3"]
#dataFiles = ["C1","C2strict","C3"]
#dataFiles = ["C1"]

folder = "Cohort4/C4_check"
dataFiles = ["C4"]
AASpecies=["cSmac","cCytoC"]
PPSpecies="Cdh1"
PPSpeciesDirection="low" #high
metric= "OS" #"PFS"
breakdownMethods=["AAPPBreakdown","AAonly","AAandOr","AAPPonly","AAppOther"]
#,"AAppCluster","addAAPPToEachCluster","addAAToEachCluster","IPIAAPP"

timeColumn = metric*"_time"
statusColumn = metric*"_status"

for file in dataFiles
    println("Generating "*file*" data...")

    # The combined table does not depend on the breakdown method, so it is
    # built once per file rather than once per (method, file) pair.
    combinedFile = file*".csv"
    combineData(folder*"/"*file*"_clinicalData.csv",folder*"/"*file*"_modelResults.csv",combinedFile)

    for breakdownMethod in breakdownMethods
        thisDF = generateDF(combinedFile,AASpecies,PPSpecies,PPSpeciesDirection,breakdownMethod)
        CSV.write(file*"_"*breakdownMethod*"_patientCategories.csv", thisDF)

        # Skip the KM export only when the metric columns really are absent;
        # any other failure is allowed to surface instead of being reported
        # as a missing metric.
        if !(timeColumn in names(thisDF) && statusColumn in names(thisDF))
            println("Metric: "*metric*" not found in "*file*", skipping")
            continue
        end

        # `thisDF[:, cols]` copies the columns and `filter` returns a new
        # table. `filter!` on `thisDF[!, cols]` would resize column vectors
        # that are shared with `thisDF`.
        kmColumns = [Symbol(timeColumn), Symbol(statusColumn), :AAPPBreakdown]
        KMPlotArray = filter(Symbol(timeColumn) => !isequal("N/A"), thisDF[:, kmColumns])

        # Save the result as a TSV file that can be used in plotting tools.
        kmFile = "KMArray"*file*"_"*breakdownMethod*".tsv"
        CSV.write(kmFile, KMPlotArray, delim='\t')
        println(file*" data saved in "*kmFile)
    end
end

In [None]:
# Now we'll do stacked bar graphs of the cluster make-up of each cohort: the
# first bar shows all patients, the second bar only the AAPP patients, each
# normalised to 1.
# NOTE(review): the data sets must contain a cluster column ("cluster_ICL" or
# "Cluster(number)"); generateDF as written keeps neither - confirm how these
# files are produced.
colorSchemes=[[:purple,:deepskyblue2,:darkorange1,:mediumseagreen,:firebrick1],
    [:darkorange1,:purple,:firebrick1,:mediumseagreen]]
dataSets=["C1_AAPPonly_patientCategories.csv","C2strict_AAPPonly_patientCategories.csv"]
COONames=["cell_of_origin","COO"]
for (index, dataSet) in enumerate(dataSets)
    thisDF = DataFrame(CSV.File(dataSet))
    clusterName = "cluster_ICL" in names(thisDF) ? "cluster_ICL" : "Cluster(number)"
    clusterCol = Symbol(clusterName)

    # Patients per cluster, for the whole cohort and for the AAPP subgroup.
    aappDF = filter(:AAPPBreakdown => n -> n == "AAPP", thisDF)
    countsOfUnique = combine(groupby(thisDF, clusterCol), :AAPPBreakdown => length => :n_distinct_c)
    countsOfUniqueAAPP = combine(groupby(aappDF, clusterCol), :AAPPBreakdown => length => :n_distinct_c)
    countsOfUnique[!, :n_distinct_c] = Float64.(countsOfUnique[!, :n_distinct_c])
    countsOfUniqueAAPP[!, :n_distinct_c] = Float64.(countsOfUniqueAAPP[!, :n_distinct_c])

    # Align the AAPP counts with the cluster order of the full table. The two
    # tables may list clusters in a different order, and a cluster without
    # AAPP patients is missing from the AAPP table, so they cannot simply be
    # placed side by side.
    clusters = countsOfUnique[:, clusterCol]
    aappByCluster = Dict(zip(countsOfUniqueAAPP[:, clusterCol], countsOfUniqueAAPP[:, :n_distinct_c]))
    alignedAAPP = [get(aappByCluster, cluster, 0.0) for cluster in clusters]

    # Normalise each bar to 1; an all-zero bar stays at zero instead of NaN.
    allTotal = sum(countsOfUnique[:, :n_distinct_c])
    aappTotal = sum(alignedAAPP)
    normalisedCounts = allTotal == 0 ? countsOfUnique[:, :n_distinct_c] : countsOfUnique[:, :n_distinct_c] ./ allTotal
    normalisedCountsAAPP = aappTotal == 0 ? alignedAAPP : alignedAAPP ./ aappTotal

    plottingArray = hcat(normalisedCounts, normalisedCountsAAPP)
    thisColorScheme = colorSchemes[index]
    # Both the group labels and the colours are repeated per bar so that they
    # line up with the flattened (all, AAPP) pairs of the transposed matrix.
    ctg = repeat(clusters, inner = 2)
    colorCustom = repeat(thisColorScheme, inner = 2)
    p1 = groupedbar(plottingArray', bar_position = :stack, bar_width=0.7,group=ctg,legend=:outerright,linewidth=0,color=colorCustom)
    plot!(p1,xticks=false,size=(300,600),ylims=[-0.1,1.1])
    display(p1)

    display(countsOfUnique)
    display(countsOfUniqueAAPP)
end
    #     groupedbar(thisDF., group=thisDF[:,Symbol(ClusterName)] bar_position = :stack, bar_width=0.7)
    

In [None]:
# Now we'll do stacked bar graphs of the cell-of-origin make-up of each cohort:
# the first bar shows all patients, the second bar only the AAPP patients,
# each normalised to 1.
# NOTE(review): the data sets must contain a cell-of-origin column
# ("cell_of_origin" or "COO"); generateDF as written keeps neither - confirm
# how these files are produced.
colorSchemes=[[:darkgray,:darkorange1,:gray,:mediumseagreen,],
    [:mediumseagreen,:gray,:darkorange1,:darkgray]]
dataSets=["C1_AAPPonly_patientCategories.csv","C2strict_AAPPonly_patientCategories.csv"]
COONames=["cell_of_origin","COO"]
for (index, dataSet) in enumerate(dataSets)
    thisDF = DataFrame(CSV.File(dataSet))
    clusterName = "cell_of_origin" in names(thisDF) ? "cell_of_origin" : "COO"
    clusterCol = Symbol(clusterName)

    # Patients per cell of origin, for the whole cohort and for the AAPP subgroup.
    aappDF = filter(:AAPPBreakdown => n -> n == "AAPP", thisDF)
    countsOfUnique = combine(groupby(thisDF, clusterCol), :AAPPBreakdown => length => :n_distinct_c)
    countsOfUniqueAAPP = combine(groupby(aappDF, clusterCol), :AAPPBreakdown => length => :n_distinct_c)
    countsOfUnique[!, :n_distinct_c] = Float64.(countsOfUnique[!, :n_distinct_c])
    countsOfUniqueAAPP[!, :n_distinct_c] = Float64.(countsOfUniqueAAPP[!, :n_distinct_c])

    # Align the AAPP counts with the group order of the full table. The two
    # tables may list groups in a different order, and a group without AAPP
    # patients is missing from the AAPP table, so they cannot simply be placed
    # side by side.
    clusters = countsOfUnique[:, clusterCol]
    aappByCluster = Dict(zip(countsOfUniqueAAPP[:, clusterCol], countsOfUniqueAAPP[:, :n_distinct_c]))
    alignedAAPP = [get(aappByCluster, cluster, 0.0) for cluster in clusters]

    # Normalise each bar to 1; an all-zero bar stays at zero instead of NaN.
    allTotal = sum(countsOfUnique[:, :n_distinct_c])
    aappTotal = sum(alignedAAPP)
    normalisedCounts = allTotal == 0 ? countsOfUnique[:, :n_distinct_c] : countsOfUnique[:, :n_distinct_c] ./ allTotal
    normalisedCountsAAPP = aappTotal == 0 ? alignedAAPP : alignedAAPP ./ aappTotal

    plottingArray = hcat(normalisedCounts, normalisedCountsAAPP)
    thisColorScheme = colorSchemes[index]
    # Both the group labels and the colours are repeated per bar so that they
    # line up with the flattened (all, AAPP) pairs of the transposed matrix.
    ctg = repeat(clusters, inner = 2)
    colorCustom = repeat(thisColorScheme, inner = 2)
    p1 = groupedbar(plottingArray', bar_position = :stack, bar_width=0.7,group=ctg,legend=:outerright,linewidth=0,color=colorCustom)
    plot!(p1,xticks=false,size=(300,600),ylims=[-0.1,1.1])
    display(p1)

    display(countsOfUnique)
    display(countsOfUniqueAAPP)
end
    #     groupedbar(thisDF., group=thisDF[:,Symbol(ClusterName)] bar_position = :stack, bar_width=0.7)
    

In [None]:
    # AAPPdata = DataFrame(Y = thisDF[in(["AAPP"]).(thisDF.AAPPBreakdown),Symbol("Progress_Free_Survival_Months(number)")], D = thisDF[in(["AAPP"]).(thisDF.AAPPBreakdown),Symbol("Progression_Free_Status(string)")].=="Progressed")
# f1 = kaplan_meier(@formula(Srv(Y, D) ~ 1), AAPPdata)
# p1=plot(f1,title="AAPP pfs",label="AAPP")


# AAPPdata = DataFrame(Y = thisDF[in(["AAPP"]).(thisDF.AAPPBreakdown),Symbol("Progress_Free_Survival_Months(number)")], D = thisDF[in(["AAPP"]).(thisDF.AAPPBreakdown),Symbol("Progression_Free_Status(string)")].=="Progressed")
# f1 = kaplan_meier(@formula(Srv(Y, D) ~ 1), AAPPdata)
# p1=plot(f1,title="AAPP pfs",label="AAPP",lc=:green)


# PPdata = DataFrame(Y = thisDF[in(["PP"]).(thisDF.AAPPBreakdown),Symbol("Progress_Free_Survival_Months(number)")], D = thisDF[in(["PP"]).(thisDF.AAPPBreakdown),Symbol("Progression_Free_Status(string)")].=="Progressed")
# f2 = kaplan_meier(@formula(Srv(Y, D) ~ 1), PPdata)
# p2=plot!(f2,label="PP",lc=:black)

# AAdata = DataFrame(Y = thisDF[in(["AA"]).(thisDF.AAPPBreakdown),Symbol("Progress_Free_Survival_Months(number)")], D = thisDF[in(["AA"]).(thisDF.AAPPBreakdown),Symbol("Progression_Free_Status(string)")].=="Progressed")
# f3 = kaplan_meier(@formula(Srv(Y, D) ~ 1), AAdata)
# p2=plot!(f3,label="AA",lc=:orange)

# otherdata = DataFrame(Y = thisDF[in(["other"]).(thisDF.AAPPBreakdown),Symbol("Progress_Free_Survival_Months(number)")], D = thisDF[in(["other"]).(thisDF.AAPPBreakdown),Symbol("Progression_Free_Status(string)")].=="Progressed")
# f4 = kaplan_meier(@formula(Srv(Y, D) ~ 1), otherdata)
# p2=plot!(f4,label="other",lc=:blue)

# data = DataFrame(Y = thisDF[:,Symbol("Progress_Free_Survival_Months(number)")], D = thisDF[:,Symbol("Progression_Free_Status(string)")].=="Progressed")
# f = kaplan_meier(@formula(Srv(Y, D) ~ 1), data)
# plot(f,title="overall pfs")