In [2]:
import re
import csv
import time
from urllib.request import urlopen

## Test web scraping

In [3]:
# Smoke test: download the dbSNP report page for a single rsID.
url = "https://www.ncbi.nlm.nih.gov/snp/rs112554467"

# Read the body inside a `with` block so the HTTP response (and its socket)
# is closed as soon as the bytes have been fetched.
with urlopen(url) as page:
    html_bytes = page.read()

# The encoding is hard-coded to UTF-8 rather than taken from the response headers.
html_string = html_bytes.decode("utf-8")

## Mapping rsid with regex

In [6]:
# Merge notice on the page: group 1 is the rsID before "was merged into",
# group 2 is an rsID that follows it.
# NOTE(review): `.*` is greedy and the text searched below has no newlines left,
# so group 2 is the LAST rs<digits> token after the notice, not the first.
# Confirm this is the intended capture.
pattern = '(rs[0-9]+) was merged into.*(rs[0-9]+)'

# Replace every run of non-word characters (tags, punctuation, newlines) with a
# single space. The raw string avoids the invalid "\W" escape of a plain literal.
string  = re.sub(r'\W+', ' ', html_string)

result = re.search(pattern, string)
print(result)

# re.search returns None when the notice is absent; report that instead of
# failing with AttributeError on `None.groups()`.
if result:
    print(result.groups())
else:
    print("No Matches")

<re.Match object; span=(11357, 11425), match='rs112554467 was merged into a target _blank href >
('rs112554467', 'rs2532289')


## Mapping all remaining rsIDs

In [8]:
# rsIDs whose dbSNP report pages are scraped in the loop below.
rsids = [
    'rs112554467',
    'rs113065267',
    'rs115014431',
    'rs116025516',
    'rs116100242',
    'rs117188042',
    'rs138272216',
    'rs139474644',
    'rs140535129',
    'rs141388053',
    'rs144392153',
    'rs149812839',
    'rs150242986',
    'rs150594649',
]
rsids

['rs112554467',
 'rs113065267',
 'rs115014431',
 'rs116025516',
 'rs116100242',
 'rs117188042',
 'rs138272216',
 'rs139474644',
 'rs140535129',
 'rs141388053',
 'rs144392153',
 'rs149812839',
 'rs150242986',
 'rs150594649']

In [9]:
# Scrape the dbSNP report page of every rsID in `rsids` and collect the pair of
# rsIDs captured from its "<rsID> was merged into ... <rsID>" notice.
pattern = '(rs[0-9]+) was merged into.*(rs[0-9]+)'

# Compile both expressions once instead of on every iteration.
# The raw string avoids the invalid "\W" escape of a plain string literal.
merge_regex   = re.compile(pattern)
nonword_regex = re.compile(r'\W+')

sources = []  # group 1 of each match: rsID preceding "was merged into"
targets = []  # group 2 of each match: rsID captured after "was merged into"

for rsid in rsids:
    ### download the report page; `with` closes the HTTP response after reading
    url = "https://www.ncbi.nlm.nih.gov/snp/" + rsid
    with urlopen(url) as page:
        html_bytes = page.read()
    html_string = html_bytes.decode("utf-8")

    ### flatten the HTML: every run of non-word characters becomes one space
    string = nonword_regex.sub(' ', html_string)
    result = merge_regex.search(string)

    if result:
        ### show progress
        print(result.groups())
        print("======================")

        ### save
        sources.append(result.group(1))
        targets.append(result.group(2))
    else:
        # Nothing is appended on a miss, so `sources`/`targets` end up shorter
        # than `rsids`; positional comparison against `rsids` is then off.
        print("No Matches")

    ### avoid sending too many requests in a short time
    time.sleep(1)

('rs112554467', 'rs2532289')
('rs113065267', 'rs2864087')
('rs115014431', 'rs6940956')
('rs116025516', 'rs9260119')
('rs116100242', 'rs1526126')
('rs117188042', 'rs62055887')
('rs138272216', 'rs61797307')
('rs139474644', 'rs56392172')
('rs140535129', 'rs62074566')
('rs141388053', 'rs2458206')
('rs144392153', 'rs2693353')
('rs149812839', 'rs2906006')
('rs150242986', 'rs61891158')
('rs150594649', 'rs2696610')


In [10]:
# Sanity check: the first column is True when the requested rsID equals the
# source rsID captured from its page; the remaining columns are the requested,
# source and target rsIDs.
for rsid1, rsid2, rsid3 in zip(rsids, sources, targets):
    fields = (rsid1 == rsid2, rsid1, rsid2, rsid3)
    print(*fields)

True rs112554467 rs112554467 rs2532289
True rs113065267 rs113065267 rs2864087
True rs115014431 rs115014431 rs6940956
True rs116025516 rs116025516 rs9260119
True rs116100242 rs116100242 rs1526126
True rs117188042 rs117188042 rs62055887
True rs138272216 rs138272216 rs61797307
True rs139474644 rs139474644 rs56392172
True rs140535129 rs140535129 rs62074566
True rs141388053 rs141388053 rs2458206
True rs144392153 rs144392153 rs2693353
True rs149812839 rs149812839 rs2906006
True rs150242986 rs150242986 rs61891158
True rs150594649 rs150594649 rs2696610


In [11]:
# List the captured target rsIDs, one per line (prints nothing when empty).
if targets:
    print(*targets, sep="\n")

rs2532289
rs2864087
rs6940956
rs9260119
rs1526126
rs62055887
rs61797307
rs56392172
rs62074566
rs2458206
rs2693353
rs2906006
rs61891158
rs2696610


## Save results into table

In [12]:
import os

In [13]:
# Destination of the rsID mapping table: <output dir>/<file name>.
fname = "snps_OL13_rsid_mapping.csv"
fdiry = os.path.join(
    "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc",
    "region",
    "control_tmpra",
)
fpath = os.path.join(fdiry, fname)
print(fpath)

/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/region/control_tmpra/snps_OL13_rsid_mapping.csv


In [15]:
# Preview the comma-separated rows (source rsID, target rsID) before writing.
for rsid1, rsid2 in zip(sources, targets):
    row = f"{rsid1},{rsid2}"
    print(row)

rs112554467,rs2532289
rs113065267,rs2864087
rs115014431,rs6940956
rs116025516,rs9260119
rs116100242,rs1526126
rs117188042,rs62055887
rs138272216,rs61797307
rs139474644,rs56392172
rs140535129,rs62074566
rs141388053,rs2458206
rs144392153,rs2693353
rs149812839,rs2906006
rs150242986,rs61891158
rs150594649,rs2696610


In [17]:
# Write the (source, target) rsID pairs to `fpath` as a two-column CSV.
# newline='' is what the csv module requires of a file it writes to: the writer
# emits its own line terminators, and without it platforms that translate '\n'
# (e.g. Windows) produce an extra '\r' on every row.
with open(fpath, 'w', newline='') as file:

    # create the csv writer
    writer = csv.writer(file)

    # write header to the csv file
    cnames = ["RSID_OLD", "RSID_NEW"]
    writer.writerow(cnames)

    # write one row per (source, target) pair
    writer.writerows(zip(sources, targets))

In [84]:
# Scratch example: capture the shortest text between the markers "AAA" and "ZZZ".
text = 'gfgfdAAA1234ZZZuijjk'

m = re.search('AAA(.+?)ZZZ', text)
if m is not None:
    # m[1] is the first capture group, i.e. what sits between the markers
    found = m[1]

In [64]:
# Peek at the flattened page text. `string` holds an entire HTML page, so only
# a prefix is displayed to keep the notebook output readable.
string[:500]

' DOCTYPE html html lang en head meta charset UTF 8 meta http equiv X UA Compatible content IE edge Mobile properties meta name HandheldFriendly content True meta name MobileOptimized content 320 meta name viewport content width device width initial scale 1 0 Stylesheets link href snp static django_uswds uswds css uswds css rel stylesheet link rel stylesheet href snp static nwds css nwds css link rel stylesheet href snp static nwds css header css link rel stylesheet href snp static nwds css footer css link rel stylesheet href snp static nwds css form css link rel stylesheet href https maxcdn bootstrapcdn com font awesome 4 7 0 css font awesome min css title rs112554467 RefSNP Report dbSNP NCBI title Favicons link rel shortcut icon type image ico href https www ncbi nlm nih gov coreutils nwds img favicons favicon ico link rel icon type image png href https www ncbi nlm nih gov coreutils nwds img favicons favicon png 192x192 as recommended for Android http updates html5rocks com 2014 11 