In [8]:
library(stringr)
library(glue)
library(purrr)
library(tidyverse)

-- Attaching packages --------------------------------------- tidyverse 1.3.0 --

v ggplot2 3.3.3     v readr   1.4.0
v tibble  3.0.4     v dplyr   1.0.2
v tidyr   1.1.2     v forcats 0.5.0

-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::collapse() masks glue::collapse()
x dplyr::filter()   masks stats::filter()
x dplyr::lag()      masks stats::lag()



In [9]:
# Get the statistical results for a specific dataset over a range of cutoffs.
#
# For each cutoff the rows of `data` are split into a "low" part (cut column
# strictly below the cutoff) and a "high" part (cut column at or above it).
# Three two-sample tests are then run on the pairs Low-High, Low-Total and
# High-Total.
#
# INPUTS:
# data: a data frame in the format of "complete_genome_SARS-CoV-2_info_k={i}.txt",
#       where i is the length of each k-mer. It is indexed both by column name
#       (`colname`) and by position (columns 2 and 3, see the note in the loop).
# colname: the name of the column where the cut is applied; only "Value" and
#          "Godel_number" are supported. Default: "Value".
# pts: the total number of different cutoffs (also the number of points to be
#      plotted); a whole number >= 1. Default: 9.
#
# OUTPUTS: an unnamed list of three 3 x pts matrices of p-values, in the order
# Wilcoxon (wilcox.test), Fisher (the F test run by var.test),
# Kolmogorov-Smirnov (ks.test). Rows are 'Low-High', 'Low-Total', 'High-Total';
# columns are labelled with the cutoff values. A cell holds -1 when the test was
# skipped because one of the two samples was too small (empty for Wilcoxon and
# Kolmogorov-Smirnov, fewer than 2 observations for the F test).
stats_cutoff <- function(data, colname = 'Value', pts = 9) {

    # Fail early with a clear message instead of a late "object not found".
    stopifnot(is.character(colname), length(colname) == 1)
    if (!colname %in% c('Value', 'Godel_number')) {
        stop("colname must be 'Value' or 'Godel_number'", call. = FALSE)
    }
    if (!colname %in% names(data)) {
        stop("column '", colname, "' not found in data", call. = FALSE)
    }
    stopifnot(is.numeric(pts), length(pts) == 1, pts >= 1, pts == floor(pts))

    # Extract the cut column as a plain vector. `[[` looks the name up as a
    # string, so a column that happens to be called "colname" cannot shadow it.
    column_cut <- data[[colname]]

    # The pts cutoffs divide [min, max] into pts + 1 equal segments, so the
    # end points themselves are never used as cutoffs.
    min_col <- min(column_cut)
    max_col <- max(column_cut)
    seg <- (max_col - min_col) / (pts + 1)
    seq_cuts <- min_col + seg * seq_len(pts)

    comparisons <- c('Low-High', 'Low-Total', 'High-Total')
    blank <- matrix(NA_real_, nrow = 3, ncol = pts,
                    dimnames = list(comparisons, seq_cuts))
    wilcoxon <- blank
    fisher <- blank
    kolm_smirnov <- blank

    # Run `test` on the two samples and return its p-value, or the sentinel -1
    # when either sample has fewer than `min_n` observations.
    p_value_or_flag <- function(test, x, y, min_n) {
        if (length(x) >= min_n && length(y) >= min_n) {
            test(x, y)$p.value
        } else {
            -1
        }
    }

    # STATISTICAL TESTS [WILCOXON, FISHER, KOLMOGOROV-SMIRNOV]
    for (cut in seq_len(pts)) {

        low_cut <- data[column_cut < seq_cuts[cut], ]
        high_cut <- data[column_cut >= seq_cuts[cut], ]

        # NOTE(review): the samples are taken by column position, not by name.
        # Presumably column 2 is "Value" and column 3 is "Godel_number" --
        # confirm against the input file layout.
        if (colname == 'Value') {
            # For "Value" the tests compare the frequency counts of column 2.
            low <- table(low_cut[, 2])
            high <- table(high_cut[, 2])
            total <- table(data[, 2])
        } else {
            # For "Godel_number" the tests compare the raw values of column 3.
            low <- low_cut[, 3]
            high <- high_cut[, 3]
            total <- data[, 3]
        }

        # Same order as the matrix rows: Low-High, Low-Total, High-Total.
        pairs <- list(list(low, high), list(low, total), list(high, total))

        for (row in seq_along(pairs)) {
            x <- pairs[[row]][[1]]
            y <- pairs[[row]][[2]]

            # Each guard looks at the two samples actually being compared.
            wilcoxon[row, cut] <- p_value_or_flag(wilcox.test, x, y, 1)
            # var.test needs at least two observations per sample.
            fisher[row, cut] <- p_value_or_flag(var.test, x, y, 2)
            kolm_smirnov[row, cut] <- p_value_or_flag(ks.test, x, y, 1)
        }
    }

    # Kept unnamed on purpose: callers index the result by position.
    list(wilcoxon, fisher, kolm_smirnov)
}

In [133]:
# Main driver: for each k-mer length, load the k-mer table, keep only k-mers
# made of A/C/T/G, compute the cutoff statistics for both cut columns and
# write the six p-value matrices to the results folder.

# Silence warnings globally for the rest of the session.
options(warn = -1)

# Number of cutoffs (points) evaluated per dataset
pts <- 20

# Output locations are the same for every k, so build them once.
res_path <- file.path('C:', 'Users', 'user', 'Desktop', 'Workspaces', 'R', 'INAB', 'results')
txt_path <- file.path(res_path, 'txt')

# k: length of the k-mers
for (k in c(10, 15, 20)) {

    # Load the table for this k
    data_path <- file.path(
        'C:', 'Users', 'user', 'Desktop', 'Workspaces', 'R', 'INAB', 'data',
        glue('sars_1000_info_k={k}.txt')
    )
    data <- read.table(data_path)

    # Keep only rows whose first column consists solely of the letters A, C, T, G
    only_actg <- str_detect(data[, 1], '^[ACTG]+$')
    filt_data <- data[only_actg, ]

    # P-value matrices for cuts on "Value" and on "Godel_number"
    stats_mult <- stats_cutoff(filt_data, colname = 'Value', pts = pts)
    stats_godel <- stats_cutoff(filt_data, colname = 'Godel_number', pts = pts)

    # Save results: positions 1, 2, 3 are Wilcoxon, Fisher, Kolmogorov-Smirnov
    write.csv(stats_mult[1], glue('{txt_path}/stats_mult_wilcox_k={k}_pts={pts}'))
    write.csv(stats_mult[2], glue('{txt_path}/stats_mult_fisher_k={k}_pts={pts}'))
    write.csv(stats_mult[3], glue('{txt_path}/stats_mult_kolm_k={k}_pts={pts}'))

    write.csv(stats_godel[1], glue('{txt_path}/stats_godel_wilcox_k={k}_pts={pts}'))
    write.csv(stats_godel[2], glue('{txt_path}/stats_godel_fisher_k={k}_pts={pts}'))
    write.csv(stats_godel[3], glue('{txt_path}/stats_godel_kolm_k={k}_pts={pts}'))
}