# Step 2:
## Extracting SNPs (Variants) from PGS Catalog Score Files
### By: Saniya Khullar :)

In [16]:
import os
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from collections import defaultdict

In [17]:
# PGS Catalog scoring file to process.
# NOTE(review): this is an absolute, machine-specific path — point it at your
# own copy of the score file before running the notebook elsewhere.
input_file = "/Users/sk37/Desktop/saniya_demo/score_file/PGS000053.txt"

# Folder containing the score file; later cells write their outputs next to it.
dir_name = os.path.split(os.path.abspath(input_file))[0]
dir_name

'/Users/sk37/Desktop/saniya_demo/score_file'

In [18]:
# Read the whole score file into memory; later cells re-use `lines`.
with open(input_file, "r") as f:
    lines = f.readlines()

# Extract pgs_id and genome_build from the "#key=value" header lines.
pgs_id = None
genome_build = None
for line in lines:
    if line.startswith("#pgs_id="):
        # partition() splits on the FIRST "=" only, so a value that itself
        # contains "=" is kept whole instead of being truncated.
        pgs_id = line.partition("=")[2].strip()
    elif line.startswith("#genome_build="):
        genome_build = line.partition("=")[2].strip()

# Both values are needed to name every output file, so fail early if either
# is absent or empty.
if not pgs_id or not genome_build:
    raise ValueError("pgs_id or genome_build missing in file header.")
print(f":) Please note: pgs_id = {pgs_id} and genome_build = {genome_build}")


:) Please note: pgs_id = PGS000053 and genome_build = GRCh37


In [19]:
# Extract SNP table (start at header "rsID")
snp_lines = []
header_found = False
for line in tqdm(lines, desc = ":) going through each line to find start of score data"):
    # Strip only the line terminator. A full strip() would also eat leading or
    # trailing tabs, shifting the columns of any row whose first field (rsID)
    # or last field is empty.
    row = line.rstrip("\r\n")
    if not header_found:
        if line.startswith("rsID"):
            header_found = True
            snp_lines.append(row)  # include header
        continue
    # After the header: keep every non-blank, non-comment line as a data row.
    if line.strip() and not line.startswith("#"):
        snp_lines.append(row)

# Without a header there is no table; stop here instead of silently writing an
# empty file that the later cells would process without complaint.
if not header_found:
    raise ValueError(f'No SNP table header starting with "rsID" found in {input_file}.')

# Save SNP table to TSV
output_tsv = os.path.join(dir_name, f"{pgs_id}_SNPs_{genome_build}.tsv")
with open(output_tsv, "w") as f:
    f.write("\n".join(snp_lines) + "\n")

print(f"Saved SNP table to {output_tsv}")

:) going through each line to find start of score data: 100%|██████████| 36/36 [00:00<00:00, 247939.15it/s]

Saved SNP table to /Users/sk37/Desktop/saniya_demo/score_file/PGS000053_SNPs_GRCh37.tsv





In [31]:
import os
from collections import defaultdict

# Create subdirectory for per-chromosome files
subdir = os.path.join(dir_name, f"per_chrom_snp_files_{pgs_id}")
subdir1 = os.path.join(subdir, "ranges")
subdir2 = os.path.join(subdir, "rsids")

os.makedirs(subdir, exist_ok=True)
os.makedirs(subdir1, exist_ok=True)
os.makedirs(subdir2, exist_ok=True)

# Plain dicts used as ordered sets: keys give uniqueness and keep input order.
chrom_ranges = defaultdict(dict)
chrom_rsids = defaultdict(dict)

chrom_found_list = []

# Locate the needed columns by header name rather than assuming they are the
# first three. If a name is absent, fall back to the original positional
# layout (rsID, chr_name, chr_position).
header_cols = snp_lines[0].split("\t") if snp_lines else []
rsid_col = header_cols.index("rsID") if "rsID" in header_cols else 0
chr_col = header_cols.index("chr_name") if "chr_name" in header_cols else 1
pos_col = header_cols.index("chr_position") if "chr_position" in header_cols else 2

# Collect SNPs
for row in snp_lines[1:]:  # skip header
    fields = row.split("\t")
    rsID = fields[rsid_col].strip()
    chr_name = fields[chr_col].strip()
    chr_pos = int(fields[pos_col])

    # Format chromosome with leading zero if < 10 (used only inside the range
    # strings; file names below keep the unpadded chr_name).
    if chr_name.isdigit() and int(chr_name) < 10:
        chr_fmt = f"{int(chr_name):02d}"
    else:
        chr_fmt = chr_name

    chrom_found_list.append(chr_name)

    chrom_ranges[chr_name][f"{chr_fmt}:{chr_pos}-{chr_pos}"] = None
    # Variants without an rsID still get a range entry, but must not put a
    # blank line into the rsid list.
    if rsID:
        chrom_rsids[chr_name][rsID] = None

# Write unique SNPs per chromosome (in original input order)
for chr_name in chrom_ranges:
    chr_file_ranges = os.path.join(subdir1, f"{pgs_id}_chr{chr_name}_{genome_build}_ranges.txt")
    chr_file_rsids = os.path.join(subdir2, f"{pgs_id}_chr{chr_name}_{genome_build}_rsids.txt")

    with open(chr_file_ranges, "w") as cf:
        cf.writelines(entry + "\n" for entry in chrom_ranges[chr_name])

    with open(chr_file_rsids, "w") as cf:
        cf.writelines(entry + "\n" for entry in chrom_rsids[chr_name])

print(f"Created per-chromosome SNP coordinate files in {subdir}")


Created per-chromosome SNP coordinate files in /Users/sk37/Desktop/saniya_demo/score_file/per_chrom_snp_files_PGS000053


In [21]:
print(f":) before {chrom_found_list}")


def _normalize_chrom(chrom):
    """Return numeric chromosome labels as int; leave others (X, Y, MT) as text."""
    text = str(chrom).strip()
    return int(text) if text.isdigit() else text


def _chrom_sort_key(chrom):
    """Sort numeric chromosomes numerically first, then non-numeric ones alphabetically."""
    return (0, chrom, "") if isinstance(chrom, int) else (1, 0, chrom)


# De-duplicate and sort. A bare int() would raise on sex or mitochondrial
# chromosomes, so those are kept as strings and placed after the autosomes.
# Safe to re-run: already-converted ints pass through unchanged.
chrom_found_list = sorted(
    {_normalize_chrom(chrom) for chrom in chrom_found_list},
    key=_chrom_sort_key,
)
print(f":) after {chrom_found_list}")

:) before ['1', '2', '2', '5', '6', '6', '7', '7', '7', '8', '8', '11', '11', '11', '11', '14', '14', '18', '19', '19', '20']
:) after [1, 2, 5, 6, 7, 8, 11, 14, 18, 19, 20]


In [22]:
# One-column table of the chromosomes present in the score file.
# dict.fromkeys() removes duplicates while keeping the list's existing order;
# a set would leave the row order to an implementation detail.
chrom_found_df = pd.DataFrame({"chr_name": list(dict.fromkeys(chrom_found_list))})
chrom_found_df

Unnamed: 0,chr_name
0,1
1,2
2,5
3,6
4,7
5,8
6,11
7,14
8,18
9,19


In [23]:
# Path of the text file that will list the chromosomes found for this score;
# it sits alongside the per-chromosome folders inside `subdir`.
chr_nums_outpath = os.path.join(subdir, "chromNames_" + pgs_id + ".txt")
chr_nums_outpath

'/Users/sk37/Desktop/saniya_demo/score_file/per_chrom_snp_files_PGS000053/chromNames_PGS000053.txt'

In [24]:
# Write one chromosome per line: no header row and no index column.
chrom_found_df.to_csv(
    chr_nums_outpath,
    header=False,
    index=False,
)

### Build the names of the per-chromosome BED/BIM/FAM files that will be merged later on (Step 5):


In [25]:
# Common prefix of the per-chromosome file names; the chromosome label is
# appended to it. Built from pgs_id and genome_build so it follows the score
# file being processed instead of a hard-coded ID.
bed_file_outname = f"all_subset_variants_{pgs_id}_{genome_build}_chr"

In [26]:
# Display the chromosome list that the next cell iterates over.
chrom_found_list

[1, 2, 5, 6, 7, 8, 11, 14, 18, 19, 20]

In [27]:
# For every chromosome, build the [.bed, .bim, .fam] file-name triple.
# The names reuse `bed_file_outname` rather than repeating the prefix literal,
# so the prefix is defined in one place only.
merge_filepaths_list = [
    [f"{bed_file_outname}{chrom}{ext}" for ext in (".bed", ".bim", ".fam")]
    for chrom in chrom_found_list
]
print(merge_filepaths_list)

[['all_subset_variants_PGS000053_GRCh37_chr1.bed', 'all_subset_variants_PGS000053_GRCh37_chr1.bim', 'all_subset_variants_PGS000053_GRCh37_chr1.fam'], ['all_subset_variants_PGS000053_GRCh37_chr2.bed', 'all_subset_variants_PGS000053_GRCh37_chr2.bim', 'all_subset_variants_PGS000053_GRCh37_chr2.fam'], ['all_subset_variants_PGS000053_GRCh37_chr5.bed', 'all_subset_variants_PGS000053_GRCh37_chr5.bim', 'all_subset_variants_PGS000053_GRCh37_chr5.fam'], ['all_subset_variants_PGS000053_GRCh37_chr6.bed', 'all_subset_variants_PGS000053_GRCh37_chr6.bim', 'all_subset_variants_PGS000053_GRCh37_chr6.fam'], ['all_subset_variants_PGS000053_GRCh37_chr7.bed', 'all_subset_variants_PGS000053_GRCh37_chr7.bim', 'all_subset_variants_PGS000053_GRCh37_chr7.fam'], ['all_subset_variants_PGS000053_GRCh37_chr8.bed', 'all_subset_variants_PGS000053_GRCh37_chr8.bim', 'all_subset_variants_PGS000053_GRCh37_chr8.fam'], ['all_subset_variants_PGS000053_GRCh37_chr11.bed', 'all_subset_variants_PGS000053_GRCh37_chr11.bim', 'all

In [28]:
# Tabulate the name triples: one row per chromosome, with columns 0, 1 and 2
# holding the .bed, .bim and .fam file names respectively.
merge_filepaths_df = pd.DataFrame(data=merge_filepaths_list)
merge_filepaths_df

Unnamed: 0,0,1,2
0,all_subset_variants_PGS000053_GRCh37_chr1.bed,all_subset_variants_PGS000053_GRCh37_chr1.bim,all_subset_variants_PGS000053_GRCh37_chr1.fam
1,all_subset_variants_PGS000053_GRCh37_chr2.bed,all_subset_variants_PGS000053_GRCh37_chr2.bim,all_subset_variants_PGS000053_GRCh37_chr2.fam
2,all_subset_variants_PGS000053_GRCh37_chr5.bed,all_subset_variants_PGS000053_GRCh37_chr5.bim,all_subset_variants_PGS000053_GRCh37_chr5.fam
3,all_subset_variants_PGS000053_GRCh37_chr6.bed,all_subset_variants_PGS000053_GRCh37_chr6.bim,all_subset_variants_PGS000053_GRCh37_chr6.fam
4,all_subset_variants_PGS000053_GRCh37_chr7.bed,all_subset_variants_PGS000053_GRCh37_chr7.bim,all_subset_variants_PGS000053_GRCh37_chr7.fam
5,all_subset_variants_PGS000053_GRCh37_chr8.bed,all_subset_variants_PGS000053_GRCh37_chr8.bim,all_subset_variants_PGS000053_GRCh37_chr8.fam
6,all_subset_variants_PGS000053_GRCh37_chr11.bed,all_subset_variants_PGS000053_GRCh37_chr11.bim,all_subset_variants_PGS000053_GRCh37_chr11.fam
7,all_subset_variants_PGS000053_GRCh37_chr14.bed,all_subset_variants_PGS000053_GRCh37_chr14.bim,all_subset_variants_PGS000053_GRCh37_chr14.fam
8,all_subset_variants_PGS000053_GRCh37_chr18.bed,all_subset_variants_PGS000053_GRCh37_chr18.bim,all_subset_variants_PGS000053_GRCh37_chr18.fam
9,all_subset_variants_PGS000053_GRCh37_chr19.bed,all_subset_variants_PGS000053_GRCh37_chr19.bim,all_subset_variants_PGS000053_GRCh37_chr19.fam


In [29]:
# Relative path (resolved against the current working directory) of the
# merge list written in the next cell.
merge_filepaths_output_name = (
    f"prep_files/bed_files_to_merge_{pgs_id}_{genome_build}_step5.txt"
)
merge_filepaths_output_name

'prep_files/bed_files_to_merge_PGS000053_GRCh37_step5.txt'

In [30]:
# to_csv does not create missing folders, so make sure the target directory
# exists before writing (a fresh checkout has no "prep_files/" yet).
merge_list_dir = os.path.dirname(merge_filepaths_output_name)
if merge_list_dir:
    os.makedirs(merge_list_dir, exist_ok=True)

# Tab-separated, no header row and no index column: one line per chromosome
# with its .bed, .bim and .fam file names.
merge_filepaths_df.to_csv(merge_filepaths_output_name, header = None, index=False, sep = "\t")