# Step 2:
## Extracting SNPs (Variants) from PGS Catalog Score Files
### By: Saniya Khullar :)

In [15]:
import os
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from collections import defaultdict
import math


In [None]:
# --- User configuration -----------------------------------------------------
# Local working folder; written with a trailing slash.
parent_file = "/Users/sk37/Desktop/another_demo/"
# Scoring file to process, located under the working folder.
input_file = f"{parent_file}score_file/PGS002724.txt"
# The main folder we are working with in DNAnexus (target of the upload commands).
dna_nexus_project_folder = "IBD_Projects_Saniya:/another_saniya_demo/"
# Maximum number of SNPs written to a single per-chromosome ranges file.
snp_threshold = 5000



In [5]:
# Folder that contains the score file; derived outputs are written next to it.
dir_name = os.path.dirname(os.path.abspath(input_file))
print(dir_name)

# Load every line of the score file into memory (`lines` is reused by later cells).
with open(input_file, "r") as score_handle:
    lines = score_handle.readlines()

# Scan for the "#pgs_id=" and "#genome_build=" header entries. The whole file is
# scanned, so if a key appears more than once the last occurrence wins.
header_values = {"#pgs_id=": None, "#genome_build=": None}
for raw_line in lines:
    for prefix in header_values:
        if raw_line.startswith(prefix):
            header_values[prefix] = raw_line.strip().split("=")[1]
            break

pgs_id = header_values["#pgs_id="]
genome_build = header_values["#genome_build="]

# Both values are needed to name the output files, so stop early if either is absent/empty.
if not (pgs_id and genome_build):
    raise ValueError("pgs_id or genome_build missing in file header.")
print(f":) Please note: pgs_id = {pgs_id} and genome_build = {genome_build}")


/Users/sk37/Desktop/another_demo/score_file
:) Please note: pgs_id = PGS002724 and genome_build = GRCh37


In [7]:
# Extract the SNP table: the first non-comment, non-blank line is the column
# header and every later non-comment, non-blank line is one variant row.
snp_lines = []
for line in tqdm(lines, desc = ":) going through each line to find start of score data"):
    if line.startswith("#"):
        continue  # "#..." metadata line, not part of the table
    # Remove only the line terminator. A full strip() would also remove leading or
    # trailing tabs and thereby shift the columns of rows whose first/last field is empty.
    table_row = line.rstrip("\r\n")
    if not table_row.strip():
        continue  # blank line; it would break tab-splitting in later cells
    snp_lines.append(table_row)

# True when at least the column-header line was collected.
header_found = len(snp_lines) > 0
if not header_found:
    raise ValueError("No SNP table found: every line of the score file is blank or starts with '#'.")

# Save the SNP table (header + rows) as a TSV next to the score file.
output_tsv = os.path.join(dir_name, f"{pgs_id}_SNPs_{genome_build}.tsv")
with open(output_tsv, "w") as f:
    f.write("\n".join(snp_lines) + "\n")

print(f"Saved SNP table to {output_tsv}")

:) going through each line to find start of score data: 100%|██████████| 1213589/1213589 [00:00<00:00, 3167175.66it/s]


Saved SNP table to /Users/sk37/Desktop/another_demo/score_file/PGS002724_SNPs_GRCh37.tsv


In [8]:
# Create the output folders for the per-chromosome range files.
subdir = os.path.join(dir_name, f"per_chrom_snp_files_{pgs_id}")
subdir1 = os.path.join(subdir, "ranges_group")
os.makedirs(subdir1, exist_ok=True)  # also creates `subdir`

if snp_threshold < 1:
    raise ValueError("snp_threshold must be a positive integer.")

# Locate the chromosome / position columns from the table header instead of assuming
# they come first (score files may start with other columns such as rsID). If the
# expected names are absent, fall back to the first two columns.
header_cols = snp_lines[0].split("\t") if snp_lines else []
if "chr_name" in header_cols and "chr_position" in header_cols:
    chr_col = header_cols.index("chr_name")
    pos_col = header_cols.index("chr_position")
else:
    chr_col, pos_col = 0, 1

# chromosome -> {"CHR:POS-POS": None}. Dict keys de-duplicate entries while
# preserving the order in which they first appear in the input.
chrom_ranges = defaultdict(dict)

# Chromosome name of every accepted row, in input order.
chrom_found_list = []

skipped_rows = 0

# Collect SNPs
for row in tqdm(snp_lines[1:]):  # skip header
    fields = row.split("\t")
    try:
        chr_name = fields[chr_col]
        chr_pos = int(fields[pos_col])
    except (IndexError, ValueError):
        # Row is too short or its position is missing / non-numeric.
        skipped_rows += 1
        continue
    if not chr_name:
        skipped_rows += 1
        continue

    # Format chromosome with leading zero if < 10
    chr_fmt = f"{int(chr_name):02d}" if chr_name.isdigit() and int(chr_name) < 10 else chr_name

    chrom_found_list.append(chr_name)
    chrom_ranges[chr_name][f"{chr_fmt}:{chr_pos}-{chr_pos}"] = None

if skipped_rows:
    print(f":) Please note: skipped {skipped_rows:,} row(s) with a missing or non-numeric chromosome/position")

# Write unique SNPs per chromosome, split into multiple files if needed
for chr_name, snps_dict in tqdm(chrom_ranges.items()):
    snps_list = list(snps_dict.keys())
    total_snps = len(snps_list)
    print(f":) Please note that chromosome {chr_name} has {total_snps:,} total {genome_build} human variants or SNPs for score file {pgs_id}")

    # One file when everything fits under the threshold, otherwise numbered parts.
    num_parts = math.ceil(total_snps / snp_threshold)
    for i in range(num_parts):
        part_snps = snps_list[i * snp_threshold:(i + 1) * snp_threshold]
        if num_parts == 1:
            file_name = f"{pgs_id}_chr{chr_name}_{genome_build}_ranges.txt"
        else:
            file_name = f"{pgs_id}_chr{chr_name}_{genome_build}_ranges_p{i+1}_of_{num_parts}.txt"

        chr_file_ranges = os.path.join(subdir1, file_name)
        with open(chr_file_ranges, "w") as cf:
            cf.writelines(entry + "\n" for entry in part_snps)

print(f"Created per-chromosome SNP coordinate files in {subdir}")


100%|██████████| 1213574/1213574 [00:01<00:00, 956149.16it/s] 
 27%|██▋       | 6/22 [00:00<00:00, 52.07it/s]

:) Please note that chromosome 1 has 99,864 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 2 has 101,802 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 3 has 84,304 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 4 has 75,938 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 5 has 75,833 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 6 has 81,765 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 7 has 67,142 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 8 has 65,280 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 9 has 56,151 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 10 has 64,689 total GRC

100%|██████████| 22/22 [00:00<00:00, 82.73it/s]

:) Please note that chromosome 14 has 39,885 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 15 has 36,648 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 16 has 37,409 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 17 has 33,578 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 18 has 35,800 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 19 has 23,682 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 20 has 31,532 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 21 has 17,179 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 22 has 17,812 total GRCh37 human variants or SNPs for score file PGS002724
Created per-chromosome SNP coordinate files in




Example command in terminal:

dx upload /Users/sk37/Desktop/another_demo/score_file/per_chrom_snp_files_PGS002724/ranges_group/*.txt --destination IBD_Projects_Saniya:/another_saniya_demo/pgs_snps/ --brief

In [9]:
# Build the upload command for the original score model file, then show the
# instructions followed by the command itself.
scores_destination = f"{dna_nexus_project_folder}pgs_scores/"
command1 = f"dx upload {input_file} --destination {scores_destination} --brief"
print(
    ":) Please run the following command on your terminal after logging into DNAnexus to upload this score model file to ",
    f" the following folder on DNAnexus portal for UK Biobank:  {scores_destination}:\n",
    command1,
    sep="\n",
)

:) Please run the following command on your terminal after logging into DNAnexus to upload this score model file to 
 the following folder on DNAnexus portal for UK Biobank:  IBD_Projects_Saniya:/another_saniya_demo/pgs_scores/:

dx upload /Users/sk37/Desktop/another_demo/score_file/PGS002724.txt --destination IBD_Projects_Saniya:/another_saniya_demo/pgs_scores/ --brief


In [10]:
# Build the upload command for all per-chromosome range files (shell glob on
# the ranges folder), then show the instructions followed by the command itself.
snps_destination = f"{dna_nexus_project_folder}pgs_snps/"
command2 = f"dx upload {subdir1}/*.txt --destination {snps_destination} --brief"
print(
    ":) Please run the following command on your terminal after logging into DNAnexus to upload these score files per chromosome to ",
    f" the following folder on DNAnexus portal for UK Biobank:  {snps_destination}:\n",
    command2,
    sep="\n",
)

:) Please run the following command on your terminal after logging into DNAnexus to upload these score files per chromosome to 
 the following folder on DNAnexus portal for UK Biobank:  IBD_Projects_Saniya:/another_saniya_demo/pgs_snps/:

dx upload /Users/sk37/Desktop/another_demo/score_file/per_chrom_snp_files_PGS002724/ranges_group/*.txt --destination IBD_Projects_Saniya:/another_saniya_demo/pgs_snps/ --brief


# Creating the Samplesheet for Step 6 (in advance)

In [None]:
# Folder that will hold the samplesheet. os.path.join avoids the doubled separator
# that plain concatenation produces when parent_file already ends with "/"; the
# trailing "" keeps the final separator, since the file name is appended to this string.
out_file = os.path.join(parent_file, "samplesheet", "")
# Values for the "sampleset" and "path_prefix" samplesheet columns.
sampleset = "ukbiobank"
details = f"all_chromosomes_SNPs_merged_{pgs_id}_{genome_build}"
# File name of the samplesheet CSV.
outname = f"pgsc_calc_{pgs_id}_{genome_build}_samplesheet.csv"
details

'all_chromosomes_SNPs_merged_PGS002724_GRCh37'

In [24]:
# One-row samplesheet table: the row values are paired directly with their
# column names. The "chrom" field is left as an empty string.
samplesheet_columns = ["sampleset", "path_prefix", "chrom", "format"]
list_info = [sampleset, details, "", "bfile"]
list_df = pd.DataFrame([list_info], columns=samplesheet_columns)
list_df

Unnamed: 0,sampleset,path_prefix,chrom,format
0,ukbiobank,all_chromosomes_SNPs_merged_PGS002724_GRCh37,,bfile


In [25]:
# Full path of the samplesheet CSV.
out_fp = out_file + outname
print(out_fp)

# to_csv does not create missing folders, so make sure the target folder exists
# before writing (otherwise a fresh working directory fails here).
samplesheet_dir = os.path.dirname(out_fp)
if samplesheet_dir:
    os.makedirs(samplesheet_dir, exist_ok=True)

list_df.to_csv(out_fp, index = False)
list_df

/Users/sk37/Desktop/another_demo/samplesheet/pgsc_calc_PGS002724_GRCh37_samplesheet.csv


Unnamed: 0,sampleset,path_prefix,chrom,format
0,ukbiobank,all_chromosomes_SNPs_merged_PGS002724_GRCh37,,bfile


In [28]:
# Build the upload command for the samplesheet (it goes to the project folder
# itself), then show the instructions followed by the command.
command3 = f"dx upload {out_fp} --destination {dna_nexus_project_folder} --brief"
print(
    ":) Please run the following command on your terminal after logging into DNAnexus to upload the samplesheet for PGSC Calculator to ",
    f" the following folder on DNAnexus portal for UK Biobank:  {dna_nexus_project_folder}:\n",
    command3,
    sep="\n",
)

:) Please run the following command on your terminal after logging into DNAnexus to upload the samplesheet for PGSC Calculator to 
 the following folder on DNAnexus portal for UK Biobank:  IBD_Projects_Saniya:/another_saniya_demo/:

dx upload /Users/sk37/Desktop/another_demo/samplesheet/pgsc_calc_PGS002724_GRCh37_samplesheet.csv --destination IBD_Projects_Saniya:/another_saniya_demo/ --brief
