From 1ad4281d2a4ff3a36814a6f8fad56b68e608b9cb Mon Sep 17 00:00:00 2001
From: Graeme Ford <graeme.ford@tuks.co.za>
Date: Sat, 17 Feb 2024 13:25:49 +0200
Subject: [PATCH] Split the reportFreq rule into one-rule--one-command

---
 workflow/Snakefile | 146 +++++++++++++++++++++++++++++++--------------
 1 file changed, 102 insertions(+), 44 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 48d4d59..4d06832 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -43,10 +43,10 @@ transcripts = read_csv(join("input", "transcripts.csv"), header=0)
 
 
 # DEFINE CONTEXT-VARIABLES:
-finalExtensions = ["acount", "hardy", "smiss", "vmiss"]  # "prune.in", "prune.out",
+# finalExtensions = ["acount", "hardy", "smiss", "vmiss"]  # "prune.in", "prune.out",
 clusters = set([cluster for cluster in samples.keys() if cluster not in ["sample_name", "dataset"]])
-bExtensions = ["bed", "bim", "fam"]
-tExtensions = ["map", "ped"]
+# bExtensions = ["bed", "bim", "fam"]
+# tExtensions = ["map", "ped"]
 
 
 include: "rules/common.smk"
@@ -352,14 +352,15 @@ rule filterLocations:
         fromBP=lambda wildcards: locations.loc[locations["location_name"] == wildcards.location, "start"].item(),
         toBP=lambda wildcards: locations.loc[locations["location_name"] == wildcards.location, "stop"].item(),
         chr=lambda wildcards: locations.loc[locations["location_name"] == wildcards.location, "chromosome"].item(),
+        out=lambda wildcards, output: output["vcf"][:-7]
     input:
-        "results/tmp/filterSampleRelatedness/All_filterSampleRelatedness.vcf.gz",
+        vcf="results/tmp/filterSampleRelatedness/All_filterSampleRelatedness.vcf.gz",
     output:
-        "results/tmp/filterLocations/{location}_filterLocations.vcf.gz",
+        vcf="results/tmp/filterLocations/{location}_filterLocations.vcf.gz",
     shell:
         """
         echo -e "\n--- LOG SECTION START | Plink-2 'trim' ---" 1>&2
-        plink2 --allow-extra-chr --vcf {input} --from-bp {params.fromBP} --to-bp {params.toBP} --chr {params.chr} --output-chr chr26 --export vcf-4.2 bgz --out results/TRIM/ALL_{wildcards.location}
+        plink2 --vcf {input} --allow-extra-chr --from-bp {params.fromBP} --to-bp {params.toBP} --chr {params.chr} --output-chr chr26 --export vcf-4.2 bgz --out {params.out}
         echo -e "--- LOG SECTION END | Plink-2 'trim' ---\n" 1>&2
         """
 
@@ -382,12 +383,58 @@ rule writeSampleMetadata:
     envmodules:
         config["environment"]["envmodules"]["python-3"]
     output:
-        "results/tmp/writeSampleMetadata/{cluster}_writeSampleMetadata.txt",
+        metadata="results/tmp/writeSampleMetadata/{cluster}_writeSampleMetadata.txt",
     script:
         join("scripts", "01-TRANSPILE_CLUSTERS.py")
 
 
-rule reportFreq:
+# rule reportFreq:
+#     """
+#     Perform Frequency analysis on super populations.
+#     """
+#     # group: "REPORT"
+#     log: "logs/PLINK/{location}.log",
+#     benchmark: "_benchmarks/reportFreq/{location}.benchmark"
+#     resources:
+#         cpus=search("cores", "reportFreq"),
+#         nodes=search("nodes", "reportFreq"),
+#         queue=search("queue", "reportFreq"),
+#         walltime=search("walltime", "reportFreq"),
+#     envmodules:
+#         config["environment"]["envmodules"]["plink-2"],
+#     params:
+#         prefix="ALL_{location}",
+#         subsets_list=lambda wildcards: ' '.join(clusters)
+#     input:
+#         [f"results/tmp/writeSampleMetadata/{cluster}_writeSampleMetadata.txt" for cluster in clusters],
+#         vcf="results/tmp/filterLocations/{location}_filterLocations.vcf.gz",
+#     output:
+#         [
+#             expand(
+#                 [
+#                     "results/FINAL/%s/ALL_{{location}}.%s.{extension}"
+#                     % (cluster, population)
+#                     for population in list(samples[cluster].unique())
+#                 ],
+#                 extension=finalExtensions,
+#             )
+#             for cluster in clusters
+#         ],
+#     shell:
+#         """
+#         for CLUSTER in {params.subsets_list}
+#         do
+#             echo -e "\n--- LOG SECTION START | Plink-2 'All freq' ---" 1>&2
+#             plink2 --allow-extra-chr --vcf {input.vcf} --freq counts --export vcf-4.2 bgz --out results/FINAL/$CLUSTER/{params.prefix}
+#             echo -e "--- LOG SECTION END | Plink-2 'All freq' ---\n" 1>&2
+
+#             echo -e "\n--- LOG SECTION START | Plink-2 'Subset freq' ---" 1>&2
+#             plink2 --allow-extra-chr --vcf {input.vcf} --pheno iid-only results/REFERENCE/cluster_$CLUSTER.txt --loop-cats $CLUSTER --freq counts --missing --hardy midp --out results/FINAL/$CLUSTER/{params.prefix}
+#             echo -e "--- LOG SECTION END | Plink-2 'Subset freq' ---\n" 1>&2
+#         done
+#         """
+
+rule reportFreqAllPerLocation:
     """
     Perform Frequency analysis on super populations.
     """
@@ -402,35 +449,50 @@ rule reportFreq:
     envmodules:
         config["environment"]["envmodules"]["plink-2"],
     params:
-        prefix="ALL_{location}",
-        subsets_list=lambda wildcards: ' '.join(clusters)
+        out=lambda wildcards,output: output["allele_count"][:-7],
+    input:
+        vcf="results/tmp/filterLocations/{location}_filterLocations.vcf.gz",
+    output:
+        allele_count="results/reportFreqAllPerLocation/All_{location}.acount",
+        hardy="results/reportFreqAllPerLocation/All_{location}.hardy",
+        sample_missingness="results/reportFreqAllPerLocation/All_{location}.smiss",
+        variant_missingness="results/reportFreqAllPerLocation/All_{location}.vmiss",
+    shell:
+        """
+        echo -e "\n--- LOG SECTION START | Plink-2 'All freq' ---" 1>&2
+        plink2 --vcf {input.vcf} --allow-extra-chr --split-par b38 --freq counts --export vcf-4.2 bgz --out {params.out}
+        echo -e "--- LOG SECTION END | Plink-2 'All freq' ---\n" 1>&2
+        """
+
+rule reportFreqPerClusterPerLocation:
+    """
+    Perform Frequency analysis on super populations.
+    """
+    # group: "REPORT"
+    log: "logs/PLINK/{cluster}_{location}.log",
+    benchmark: "_benchmarks/reportFreq/{cluster}_{location}.benchmark"
+    resources:
+        cpus=search("cores", "reportFreq"),
+        nodes=search("nodes", "reportFreq"),
+        queue=search("queue", "reportFreq"),
+        walltime=search("walltime", "reportFreq"),
+    envmodules:
+        config["environment"]["envmodules"]["plink-2"],
+    params:
+        out=lambda wildcards,output: output["allele_count"][:-7]
     input:
-        [f"results/tmp/writeSampleMetadata/{cluster}_writeSampleMetadata.txt" for cluster in clusters],
         vcf="results/tmp/filterLocations/{location}_filterLocations.vcf.gz",
+        cluster_samples="results/tmp/writeSampleMetadata/{cluster}_writeSampleMetadata.txt"
     output:
-        [
-            expand(
-                [
-                    "results/FINAL/%s/ALL_{{location}}.%s.{extension}"
-                    % (cluster, population)
-                    for population in list(samples[cluster].unique())
-                ],
-                extension=finalExtensions,
-            )
-            for cluster in clusters
-        ],
+        allele_count="results/reportFreqPerClusterPerLocation/{cluster}_{location}.acount",
+        hardy="results/reportFreqPerClusterPerLocation/{cluster}_{location}.hardy",
+        sample_missingness="results/reportFreqPerClusterPerLocation/{cluster}_{location}.smiss",
+        variant_missingness="results/reportFreqPerClusterPerLocation/{cluster}_{location}.vmiss",
     shell:
         """
-        for CLUSTER in {params.subsets_list}
-        do
-            echo -e "\n--- LOG SECTION START | Plink-2 'All freq' ---" 1>&2
-            plink2 --allow-extra-chr --vcf {input.vcf} --freq counts --export vcf-4.2 bgz --out results/FINAL/$CLUSTER/{params.prefix}
-            echo -e "--- LOG SECTION END | Plink-2 'All freq' ---\n" 1>&2
-
-            echo -e "\n--- LOG SECTION START | Plink-2 'Subset freq' ---" 1>&2
-            plink2 --allow-extra-chr --vcf {input.vcf} --pheno iid-only results/REFERENCE/cluster_$CLUSTER.txt --loop-cats $CLUSTER --freq counts --missing --hardy midp --out results/FINAL/$CLUSTER/{params.prefix}
-            echo -e "--- LOG SECTION END | Plink-2 'Subset freq' ---\n" 1>&2
-        done
+        echo -e "\n--- LOG SECTION START | Plink-2 'Subset freq' ---" 1>&2
+        plink2 --vcf {input.vcf} --allow-extra-chr --keep {input.cluster_samples} --freq counts --missing --hardy midp --out {params.out}
+        echo -e "--- LOG SECTION END | Plink-2 'Subset freq' ---\n" 1>&2
         """
 
 # group: VALIDATE
@@ -465,16 +527,12 @@ rule all:
         "logs/ALL/ALL.log",
     input:
         VCFValidation.rules.all.input,
+        expand("results/reportFreqPerClusterPerLocation/{cluster}_{location}.acount", cluster=clusters, location=locations["location_name"]),
+        expand("results/reportFreqPerClusterPerLocation/{cluster}_{location}.hardy", cluster=clusters, location=locations["location_name"]),
+        expand("results/reportFreqPerClusterPerLocation/{cluster}_{location}.smiss", cluster=clusters, location=locations["location_name"]),
+        expand("results/reportFreqPerClusterPerLocation/{cluster}_{location}.vmiss", cluster=clusters, location=locations["location_name"]),
+        expand("results/reportFreqAllPerLocation/All_{location}.acount", location=locations["location_name"]),
+        expand("results/reportFreqAllPerLocation/All_{location}.hardy", location=locations["location_name"]),
+        expand("results/reportFreqAllPerLocation/All_{location}.smiss", location=locations["location_name"]),
+        expand("results/reportFreqAllPerLocation/All_{location}.vmiss", location=locations["location_name"]),
         # PopulationStructure.rules.all.input,
-        [
-            expand(
-                [
-                    "results/FINAL/%s/ALL_{location}.%s.{extension}"
-                    % (cluster, population)
-                    for population in list(samples[cluster].unique())
-                ],
-                extension=finalExtensions,
-                location=locations["location_name"],
-            )
-            for cluster in clusters
-        ]