In [3]:
import pandas as pd
import os
import re
import csv
import time
from urllib.request import urlopen

# Root directory for the project's result files; the input and output paths
# used below are built from it with os.path.join.
# NOTE(review): hard-coded absolute path -- adjust it when running elsewhere.
FD_RES = "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc"

## Test web scraping

In [7]:
# Fetch a single NCBI SNP page as a smoke test for the scraping approach.
url = "https://www.ncbi.nlm.nih.gov/snp/rs112554467"

# The context manager closes the HTTP response as soon as the body is read,
# so the connection is not left open for the lifetime of the kernel.
with urlopen(url) as page:
    html_bytes = page.read()

# Decode the raw bytes into text for the regex search in the next cell.
html_string = html_bytes.decode("utf-8")

## Test mapping rsid with regex

In [8]:
# Regex capturing (1) the queried rsID and (2) the rsID it was merged into.
# - Raw strings keep regex escapes such as \W from being parsed as (invalid)
#   Python string escapes.
# - The lazy `.*?` stops at the first rsID after "was merged into"; a greedy
#   `.*` would run to the last rsID anywhere in the flattened page text.
pattern = r'(rs[0-9]+) was merged into.*?(rs[0-9]+)'

# Collapse every run of non-word characters (tags, quotes, newlines, ...) into
# a single space so the whole page becomes one searchable line.
string  = re.sub(r'\W+', ' ', html_string)

result = re.search(pattern, string)
print(result)

# re.search returns None when the page has no merge notice; guard the
# .groups() call so the cell reports the miss instead of raising.
if result:
    print(result.groups())
else:
    print("No Matches")

<re.Match object; span=(11357, 11425), match='rs112554467 was merged into a target _blank href >
('rs112554467', 'rs2532289')


## Import data

In [12]:
# Read the tab-separated SNP table from the control_tmpra region folder.
fname = "snps_OL13_rsid_list_unmapped.tsv"
fdiry = os.path.join(FD_RES, "region", "control_tmpra")
fpath = os.path.join(fdiry, fname)

# read_table uses a tab separator by default.
dat_snps_ummapped = pd.read_table(fpath)

# Display the loaded table as the cell output.
dat_snps_ummapped

Unnamed: 0,SNP,Chrom,Allele_Ref,Allele_Alt,Loc_hg19,Loc_hg38
0,rs112554467,,,,,
1,rs113065267,,,,,
2,rs115014431,,,,,
3,rs116025516,,,,,
4,rs116100242,,,,,
5,rs117188042,,,,,
6,rs138272216,,,,,
7,rs139474644,,,,,
8,rs140535129,,,,,
9,rs141388053,,,,,


## Map all remaining rsIDs

In [13]:
# Take the SNP column (the rsIDs to look up) as a Series and display it.
rsids = dat_snps_ummapped["SNP"]
rsids

0     rs112554467
1     rs113065267
2     rs115014431
3     rs116025516
4     rs116100242
5     rs117188042
6     rs138272216
7     rs139474644
8     rs140535129
9     rs141388053
10    rs144392153
11    rs149812839
12    rs150242986
13    rs150594649
Name: SNP, dtype: object

In [14]:
# For every rsID in `rsids`, fetch its NCBI SNP page and extract the
# "<old rsID> was merged into <new rsID>" pair.
#
# sources / targets are filled in parallel and only for pages where the merge
# notice is found, so they can be shorter than `rsids` if a lookup misses.
#
# - Raw strings keep regex escapes such as \W from being parsed as (invalid)
#   Python string escapes.
# - The lazy `.*?` stops at the first rsID after "was merged into"; a greedy
#   `.*` would run to the last rsID anywhere in the flattened page text.
pattern = r'(rs[0-9]+) was merged into.*?(rs[0-9]+)'
sources = []
targets = []

for rsid in rsids:
    ### fetch the page; the context manager closes the HTTP response
    url = "https://www.ncbi.nlm.nih.gov/snp/" + rsid
    with urlopen(url) as page:
        html_bytes = page.read()
    html_string = html_bytes.decode("utf-8")

    ### flatten runs of non-word characters into single spaces, then search
    string = re.sub(r'\W+', ' ', html_string)
    result = re.search(pattern, string)

    ###
    if result:
        ### show progress
        print(result.groups())
        print("======================")

        ### save
        sources.append(result.group(1))
        targets.append(result.group(2))
    else:
        print("No Matches")

    ### avoid too many requests in a short time
    time.sleep(1)

('rs112554467', 'rs2532289')
('rs113065267', 'rs2864087')
('rs115014431', 'rs6940956')
('rs116025516', 'rs9260119')
('rs116100242', 'rs1526126')
('rs117188042', 'rs62055887')
('rs138272216', 'rs61797307')
('rs139474644', 'rs56392172')
('rs140535129', 'rs62074566')
('rs141388053', 'rs2458206')
('rs144392153', 'rs2693353')
('rs149812839', 'rs2906006')
('rs150242986', 'rs61891158')
('rs150594649', 'rs2696610')


In [15]:
# Sanity check: each queried rsID should equal the source rsID parsed from its
# page. Print the comparison, then the queried, source and target rsIDs.
for queried, source, target in zip(rsids, sources, targets):
    is_same = queried == source
    print(is_same, queried, source, target)

True rs112554467 rs112554467 rs2532289
True rs113065267 rs113065267 rs2864087
True rs115014431 rs115014431 rs6940956
True rs116025516 rs116025516 rs9260119
True rs116100242 rs116100242 rs1526126
True rs117188042 rs117188042 rs62055887
True rs138272216 rs138272216 rs61797307
True rs139474644 rs139474644 rs56392172
True rs140535129 rs140535129 rs62074566
True rs141388053 rs141388053 rs2458206
True rs144392153 rs144392153 rs2693353
True rs149812839 rs149812839 rs2906006
True rs150242986 rs150242986 rs61891158
True rs150594649 rs150594649 rs2696610


## Save results

In [19]:
# Write the source -> target rsID mapping as a two-column CSV with a header.
fname = "snps_OL13_rsid_mapping.csv"
fdiry = os.path.join(FD_RES, "region", "control_tmpra")
fpath = os.path.join(fdiry, fname)
print(fpath)

# Column order is SNP (original rsID) then RSID (the rsID it was merged into).
dat = pd.DataFrame(dict(SNP=sources, RSID=targets))
dat.to_csv(fpath, index=False)

/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/region/control_tmpra/snps_OL13_rsid_mapping.csv


In [20]:
# Write only the target rsIDs, one per line, with no header and no index.
fname = "snps_OL13_rsid_list_mapped.txt"
fdiry = os.path.join(FD_RES, "region", "control_tmpra")
fpath = os.path.join(fdiry, fname)

dat = pd.DataFrame(dict(RSID=targets))
dat.to_csv(fpath, header=False, index=False)