# Step 2:
## Extracting SNPs (Variants) from PGS Catalog Score Files
### By: Saniya Khullar :)

Please verify that all of the score files have the same reference build (e.g. Hg19/GRCh37 or Hg38/GRCh38)

In [1]:
import os
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from collections import defaultdict
import math


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
parent_file = "/Users/sk37/Desktop/newest_demo/"
dna_nexus_project_folder = "IBD_Projects_Saniya:/combined_saniya_demo/" # the main folder we are working with in dnanexus
snp_threshold = 8000  # max SNPs per file
scores_fp = parent_file + "score_file/"

In [3]:
scores_list = os.listdir(scores_fp)
score_files_list = []
for score_file in scores_list:
    if ".txt" in score_file:
        score_files_list.append(score_file)
score_files_list

['PGS003725.txt', 'PGS005097.txt', 'PGS002724.txt']

In [4]:
len(os.listdir("/Users/sk37/Desktop/newest_demo/score_file/per_chrom_snp_files_GRCh37_combined_PGS002724_PGS003725_PGS005097/ranges_group/"))

184

In [5]:
for score_name in score_files_list:
    input_file = scores_fp + score_name
    dir_name = os.path.dirname(os.path.abspath(input_file))
    print(dir_name)

    # Read file
    with open(input_file, "r") as f:
        lines = f.readlines()

    # Extract pgs_id and genome_build from header
    pgs_id = None
    genome_build = None
    for line in lines:
        if line.startswith("#pgs_id="):
            pgs_id = line.strip().split("=")[1]
        elif line.startswith("#genome_build="):
            genome_build = line.strip().split("=")[1]

    if not pgs_id or not genome_build:
        raise ValueError("pgs_id or genome_build missing in file header.")
    print(f":) Please note: pgs_id = {pgs_id} and genome_build = {genome_build}")

    # Extract SNP table (start at header "rsID")
    snp_lines = []
    header_found = False
    for line in tqdm(lines, desc = ":) going through each line to find start of score data"):
        if line.startswith("#") == False:
        # if line.startswith("rsID"):
            header_found = True
            snp_lines.append(line.strip())  # include header
            continue
        if header_found:
            if line.strip() and not line.startswith("#"):
                snp_lines.append(line.strip())

    # Save SNP table to TSV
    output_tsv = os.path.join(dir_name, f"{pgs_id}_SNPs_{genome_build}.tsv")
    with open(output_tsv, "w") as f:
        f.write("\n".join(snp_lines) + "\n")

    print(f"Saved SNP table to {output_tsv}")

/Users/sk37/Desktop/newest_demo/score_file
:) Please note: pgs_id = PGS003725 and genome_build = hg19


:) going through each line to find start of score data: 100%|██████████| 1296187/1296187 [00:00<00:00, 3144269.50it/s]


Saved SNP table to /Users/sk37/Desktop/newest_demo/score_file/PGS003725_SNPs_hg19.tsv
/Users/sk37/Desktop/newest_demo/score_file
:) Please note: pgs_id = PGS005097 and genome_build = GRCh37


:) going through each line to find start of score data: 100%|██████████| 1274707/1274707 [00:00<00:00, 2966352.18it/s]


Saved SNP table to /Users/sk37/Desktop/newest_demo/score_file/PGS005097_SNPs_GRCh37.tsv
/Users/sk37/Desktop/newest_demo/score_file
:) Please note: pgs_id = PGS002724 and genome_build = GRCh37


:) going through each line to find start of score data: 100%|██████████| 1213589/1213589 [00:00<00:00, 3435525.99it/s]


Saved SNP table to /Users/sk37/Desktop/newest_demo/score_file/PGS002724_SNPs_GRCh37.tsv


In [6]:
scores_list = os.listdir(scores_fp)
score_files_list = []
for score_file in scores_list:
    if ".tsv" in score_file:
        score_files_list.append(score_file)
score_files_list

['PGS002724_SNPs_GRCh37.tsv',
 'PGS003725_SNPs_hg19.tsv',
 'PGS005097_SNPs_GRCh37.tsv']

In [7]:
df_list = []
for score_name in tqdm(score_files_list):
    input_file = scores_fp + score_name
    df = pd.read_csv(input_file, sep = "\t")
    df_list.append(df)
combined = pd.concat(df_list, axis=0, join='outer', ignore_index=True)
combined


100%|██████████| 3/3 [00:01<00:00,  2.65it/s]


Unnamed: 0,chr_name,chr_position,effect_allele,other_allele,effect_weight,rsID
0,1,752721,G,A,50.200914,
1,1,754182,G,A,141.073654,
2,1,760912,T,C,180.556537,
3,1,768448,A,G,-74.643825,
4,1,779322,G,A,-137.024959,
...,...,...,...,...,...,...
3784433,9,141017562,G,A,-0.000021,rs9410163
3784434,9,141018423,T,C,-0.000033,rs9777369
3784435,9,141023914,G,A,0.000006,rs12335803
3784436,9,141025328,A,C,0.000098,rs11137376


In [51]:
scores_list_names = '_'.join([i.split("_")[0] for i in score_files_list])
scores_list_names

'PGS002724_PGS003725_PGS005097'

In [59]:
total_unique_variants_df = combined[["chr_name", "chr_position"]].drop_duplicates()
print(f":) Please note that there are {total_unique_variants_df.shape[0]:,} unique {genome_build} variants combined across these {len(score_files_list)} score files: {scores_list_names}")
total_unique_variants_df

:) Please note that there are 1,389,069 unique GRCh37 variants combined across these 3 score files: PGS002724_PGS003725_PGS005097


Unnamed: 0,chr_name,chr_position
0,1,752721
1,1,754182
2,1,760912
3,1,768448
4,1,779322
...,...,...
3749895,9,72940970
3752340,9,78232311
3757233,9,87552671
3762264,9,98069071


In [50]:
combined

Unnamed: 0,chr_name,chr_position,effect_allele,other_allele,effect_weight,rsID
0,1,752721,G,A,50.200914,
1,1,754182,G,A,141.073654,
2,1,760912,T,C,180.556537,
3,1,768448,A,G,-74.643825,
4,1,779322,G,A,-137.024959,
...,...,...,...,...,...,...
3784433,9,141017562,G,A,-0.000021,rs9410163
3784434,9,141018423,T,C,-0.000033,rs9777369
3784435,9,141023914,G,A,0.000006,rs12335803
3784436,9,141025328,A,C,0.000098,rs11137376


In [53]:
# Create subdirectory for per-chromosome files
subdir = os.path.join(dir_name, f"per_chrom_snp_files_{genome_build}_combined_{scores_list_names}")
subdir1 = os.path.join(subdir, "ranges_group")

os.makedirs(subdir, exist_ok=True)
os.makedirs(subdir1, exist_ok=True)

# Ordered dicts for uniqueness while preserving input order
chrom_ranges = defaultdict(dict)

chrom_found_list = []

# Collect SNPs
# Parse header dynamically
header = combined.columns.tolist()
chr_name_idx = header.index("chr_name")
chr_pos_idx = header.index("chr_position")

snp_lines_to_use = combined.values.tolist()
# Collect SNPs
for row in tqdm(snp_lines_to_use):  # skip header
    cols = [str(i) for i in row]#.strip().split("\t")
    chr_name = cols[chr_name_idx]
    chr_pos = int(cols[chr_pos_idx])

    # Format chromosome with leading zero if < 10
    chr_fmt = f"{int(chr_name):02d}" if chr_name.isdigit() and int(chr_name) < 10 else chr_name

    chrom_found_list.append(chr_name)

    # Use dict keys for uniqueness, keeps insertion order
    chrom_ranges[chr_name][f"{chr_fmt}:{chr_pos}-{chr_pos}"] = None


100%|██████████| 3784438/3784438 [00:06<00:00, 600855.89it/s]


In [None]:
# Write unique SNPs per chromosome, split into multiple files if needed
for chr_name, snps_dict in tqdm(chrom_ranges.items()):
    snps_list = list(snps_dict.keys())
    total_snps = len(snps_list)
    print(f":) Please note that chromosome {chr_name} has {total_snps:,} total {genome_build} human variants or SNPs for score file {pgs_id}")
    if total_snps <= snp_threshold:
        # Single file
        chr_file_ranges = os.path.join(subdir1, f"{pgs_id}_chr{chr_name}_{genome_build}_ranges.txt")
        with open(chr_file_ranges, "w") as cf:
            for entry in snps_list:
                cf.write(entry + "\n")
    else:
        # Split into multiple files
        num_parts = math.ceil(total_snps / snp_threshold)
        for i in range(num_parts):
            start_idx = i * snp_threshold
            end_idx = min(start_idx + snp_threshold, total_snps)
            part_snps = snps_list[start_idx:end_idx]
            
            chr_file_ranges = os.path.join(
                subdir1, 
                f"{scores_list_names}_chr{chr_name}_{genome_build}_ranges_p{i+1}_of_{num_parts}.txt"
            )
            with open(chr_file_ranges, "w") as cf:
                for entry in part_snps:
                    cf.write(entry + "\n")

print(f"Created per-chromosome SNP coordinate files in {subdir}")


 23%|██▎       | 5/22 [00:00<00:00, 47.87it/s]

:) Please note that chromosome 1 has 116,289 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 2 has 116,914 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 3 has 96,703 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 4 has 86,886 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 5 has 86,321 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 6 has 92,081 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 7 has 75,776 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 8 has 74,395 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 9 has 63,411 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 10 has 73,835 total GR

100%|██████████| 22/22 [00:00<00:00, 76.44it/s]

:) Please note that chromosome 15 has 42,245 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 16 has 43,488 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 17 has 38,939 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 18 has 41,035 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 19 has 27,257 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 20 has 36,091 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 21 has 19,273 total GRCh37 human variants or SNPs for score file PGS002724
:) Please note that chromosome 22 has 20,271 total GRCh37 human variants or SNPs for score file PGS002724
Created per-chromosome SNP coordinate files in /Users/sk37/Desktop/newest_demo/score_file/per_chrom_snp_files_GRCh37_combined_PGS002724_PGS003725_PGS005




Example command in terminal:

dx upload /Users/sk37/Desktop/another_demo/score_file/per_chrom_snp_files_PGS005097/ranges_group/*.txt --destination IBD_Projects_Saniya:/second_saniya_demo/pgs_snps/ --brief

In [None]:
print(":) Please run the following command on your terminal after logging into DNAnexus to upload this score model file to ")
print(f" the following folder on DNAnexus portal for UK Biobank:  {dna_nexus_project_folder}pgs_scores/:\n")
command1 = f"dx upload {scores_fp + '*.txt'} --destination {dna_nexus_project_folder}pgs_scores/ --brief"
print(command1)

:) Please run the following command on your terminal after logging into DNAnexus to upload this score model file to 
 the following folder on DNAnexus portal for UK Biobank:  IBD_Projects_Saniya:/combined_saniya_demo/pgs_scores/:

dx upload /Users/sk37/Desktop/newest_demo/score_file/*.txt --destination IBD_Projects_Saniya:/combined_saniya_demo/pgs_scores/ --brief


In [None]:
print(":) Please run the following command on your terminal after logging into DNAnexus to upload these score files per chromosome to ")
print(f" the following folder on DNAnexus portal for UK Biobank:  {dna_nexus_project_folder}pgs_snps/:\n")
command2 = f"dx upload {subdir1}/*.txt --destination {dna_nexus_project_folder}pgs_snps/ --brief"
print(command2)

:) Please run the following command on your terminal after logging into DNAnexus to upload these score files per chromosome to 
 the following folder on DNAnexus portal for UK Biobank:  IBD_Projects_Saniya:/combined_saniya_demo/pgs_snps/:

dx upload /Users/sk37/Desktop/newest_demo/score_file/per_chrom_snp_files_GRCh37_combined_PGS002724_PGS003725_PGS005097/ranges_group/*.txt --destination IBD_Projects_Saniya:/combined_saniya_demo/pgs_snps/ --brief


# Creating the Samplesheet for Step 6 (in advance)

In [37]:
out_file = parent_file + "samplesheet/"
sampleset = "ukbiobank"
details = f"all_chromosomes_SNPs_merged_{scores_list_names}_{genome_build}"
outname = f"pgsc_calc_{scores_list_names}_{genome_build}_samplesheet.csv"
details

'all_chromosomes_SNPs_merged_PGS002724_PGS003725_PGS005097_GRCh37'

In [38]:
list_info = [sampleset, details, "", "bfile"]
list_df = pd.DataFrame(list_info).T
list_df.columns = ["sampleset", "path_prefix", "chrom", "format"]
list_df

Unnamed: 0,sampleset,path_prefix,chrom,format
0,ukbiobank,all_chromosomes_SNPs_merged_PGS002724_PGS00372...,,bfile


In [39]:
out_fp = out_file + outname
print(out_fp)
list_df.to_csv(out_fp, index = False)
list_df

/Users/sk37/Desktop/newest_demo/samplesheet/pgsc_calc_PGS002724_PGS003725_PGS005097_GRCh37_samplesheet.csv


Unnamed: 0,sampleset,path_prefix,chrom,format
0,ukbiobank,all_chromosomes_SNPs_merged_PGS002724_PGS00372...,,bfile


In [54]:
print(":) Please run the following command on your terminal after logging into DNAnexus to upload the samplesheet for PGSC Calculator to ")
print(f" the following folder on DNAnexus portal for UK Biobank:  {dna_nexus_project_folder}:\n")
command3 = f"dx upload {out_fp} --destination {dna_nexus_project_folder} --brief"
print(command3)

:) Please run the following command on your terminal after logging into DNAnexus to upload the samplesheet for PGSC Calculator to 
 the following folder on DNAnexus portal for UK Biobank:  IBD_Projects_Saniya:/combined_saniya_demo/:

dx upload /Users/sk37/Desktop/newest_demo/samplesheet/pgsc_calc_PGS002724_PGS003725_PGS005097_GRCh37_samplesheet.csv --destination IBD_Projects_Saniya:/combined_saniya_demo/ --brief
