In [5]:
import re
import requests
from bs4 import BeautifulSoup
import urllib3
# Matches an abbreviated binomial name such as "E. coli" (named group `bacterium`:
# one capital letter, a period, a space, then lowercase letters), optionally followed
# by a strain designation made of capital letters and then digits, e.g. " ATCC 25922"
# (named group `strain`; it may include leading whitespace and is None when absent).
bacteria_regex = r'(?P<bacterium>[A-Z]\. [a-z]+)(?P<strain>\s?[A-Z]+\s?[0-9]+)?'

In [6]:
def get_sequence(sequence_string):
    """Return the scraped sequence text unchanged.

    No cleaning or validation is applied; the function exists so the
    sequence goes through the same ``get_*`` step as the other fields.
    """
    sequence = sequence_string
    return sequence

def get_references(reference_string, reference_link):
    """Build the one-element reference list for a record.

    Args:
        reference_string: Citation text.
        reference_link: URL belonging to the citation; may be empty or None.

    Returns:
        A list holding a single string: the citation alone when there is no
        link, otherwise ``"<citation> | <link>"``.
    """
    if not reference_link:
        return [reference_string]
    return [reference_string + ' | ' + reference_link]

def get_data(data_string):
    """Extract per-bacterium MIC values from a free-text activity description.

    The text is cut at every unit delimiter (``uM)`` or ``ug/ml)``). Each piece
    in front of a delimiter is searched for a ``MIC <number>`` or
    ``MIC <low>-<high>`` expression and for bacterium names matching the
    module-level ``bacteria_regex``. Every bacterium found in a piece is
    assigned that piece's MIC value and unit; pieces without a MIC expression
    are ignored.

    Args:
        data_string: Text such as
            ``"E. coli ATCC 25922 (MIC 4-16 uM), S. aureus (MIC 2 ug/ml)"``.

    Returns:
        dict mapping ``(bacterium, strain)`` tuples (``strain`` is ``None``
        when no strain was matched) to ``{'unit': ..., 'value': ...}``.
        ``value`` is a string: the number as written, or, for a range, the
        geometric mean of its two bounds. ``unit`` is the matched delimiter
        itself and therefore keeps its closing parenthesis
        (``'uM)'`` / ``'ug/ml)'``).
        NOTE(review): the trailing ")" in ``unit`` looks unintended but is
        preserved because consumers of the output may rely on it - confirm.
    """
    unit_strings = ['uM', 'ug/ml']
    unit_delimiters = [u + ')' for u in unit_strings]
    unit_regex_pattern = '(' + '|'.join(map(re.escape, unit_delimiters)) + ')'
    # The capturing group makes re.split return the delimiters too, so the
    # result alternates: text, unit, text, unit, ..., trailing text.
    fields = re.split(unit_regex_pattern, data_string)

    # A number with an optional decimal part, optionally followed by
    # "-<number>". Capturing the bounds separately (instead of slicing the raw
    # match) keeps decimal ranges such as "0.5-2" intact and leaves no stray
    # whitespace or punctuation in the value.
    number = r'\d+(?:\.\d+)?'
    mic_regex = re.compile(r'MIC (' + number + r')(?:\s?-\s?(' + number + r'))?')

    all_bacteria = {}
    # zip() stops before the trailing text, which has no unit after it.
    for bacteria_string, unit in zip(fields[0::2], fields[1::2]):
        mic_match = mic_regex.search(bacteria_string)
        if not mic_match:
            continue

        low, high = mic_match.groups()
        if high is None:
            value = low
        else:
            # Geometric mean of the range bounds (midpoint on a log scale).
            value = str((float(low) * float(high)) ** 0.5)

        for bacteria_match in re.finditer(bacteria_regex, bacteria_string):
            groups = bacteria_match.groupdict()
            strain = groups['strain']
            if strain:
                strain = strain.strip()
            all_bacteria[(groups['bacterium'], strain)] = {
                'unit': unit,
                'value': value,
            }

    return all_bacteria

def get_modifications(modifications_string):
    """Translate modification codes found in a string into modification labels.

    Every check is a substring test, so a code is recognised wherever it
    appears in the text.

    Args:
        modifications_string: Text that may contain modification codes
            (the scraper passes the "Name/Class:" cell text).

    Returns:
        list of labels, in this order:
        ``'unknown_modification: XX[B-Z]'`` if any ``XXB`` .. ``XXZ`` code occurs;
        ``'unknown_modification: <code>'`` for each of ``UCBB``, ``UCSB``,
        ``UCSS1b`` that occurs; ``'disulfide'`` if ``UCSS1a``, ``S=S`` or
        ``S-S`` occurs; ``'C-Terminal'`` if ``XXA`` occurs.
    """
    modifications = []
    # re.search rather than re.match: match only tests the very start of the
    # string, whereas every other code below is looked up anywhere in the text.
    if re.search(r'XX[B-Z]', modifications_string):
        modifications.append('unknown_modification: XX[B-Z]')
    for bad_string in ['UCBB', 'UCSB', 'UCSS1b']:
        if bad_string in modifications_string:
            modifications.append('unknown_modification: ' + bad_string)
    if any(marker in modifications_string for marker in ('UCSS1a', 'S=S', 'S-S')):
        modifications.append('disulfide')
    if 'XXA' in modifications_string:
        modifications.append('C-Terminal')
    return modifications

In [7]:
# Short name of the source database; also used as the stem of the output file name.
DATABASE_NAME = 'APD'
# Base URL of a record page; the numeric record ID is appended to it.
url_base = 'http://aps.unmc.edu/AP/database/query_output.php?ID='

# Exclusive upper bound on the record IDs requested by the scraping loop.
NUM_AMPS = 2967
# Scraped records keyed by sequence string. The scraping loop starts at len(amps),
# so re-running this cell empties the dict and makes the loop start over from ID 1.
amps = {}

In [9]:
# Scrape one record page per ID and store the parsed fields in `amps`.

# Silence urllib3's warnings once, up front, instead of on every iteration
# (the requests below are made with certificate verification switched off).
urllib3.disable_warnings()

headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Without a timeout a stalled connection blocks the loop indefinitely. With it,
# requests raises an exception instead; `amps` keeps everything collected so far
# and re-running this cell picks up again from len(amps).
REQUEST_TIMEOUT_SECONDS = 30

# Starting at len(amps) lets an interrupted run resume near where it stopped.
# NOTE(review): range() excludes NUM_AMPS itself - confirm the last ID is not needed.
for i in range(max(len(amps), 1), NUM_AMPS):
    if i == 2359:
        continue  # Their page is messed up for this one

    url = url_base + str(i)

    # NOTE(review): verify=False disables TLS certificate checks - confirm it is still required.
    response = requests.get(
        url, headers=headers, verify=False, timeout=REQUEST_TIMEOUT_SECONDS
    )
    content = response.content

    soup = BeautifulSoup(content, 'html.parser')

    table = soup.find('table')
    if not table:
        print(f"[Warning] No <table> found at URL: {url}")
        continue  # Skip this page

    # NOTE(review): both levels look up 'td', i.e. only <td> elements nested inside
    # another <td> are collected; the name `row` suggests 'tr' may have been meant.
    # Left unchanged - verify against the page markup before touching it.
    cells = [
        column for row in table.findAll('td')
        for column in row.findAll('td')
    ]
    cell_texts = [cell.text.strip() for cell in cells]

    # Each value is taken from the cell that directly follows its label cell.
    sequence_string = cell_texts[cell_texts.index('Sequence:') + 1]
    sequence = get_sequence(sequence_string)
    if sequence in amps:
        # Only reported: the assignment at the end replaces the earlier entry.
        print('Duplicate sequence found:', sequence, i)

    data_string = cell_texts[cell_texts.index('Additional info:') + 1]
    data = get_data(data_string)

    modifications_string = cell_texts[cell_texts.index('Name/Class:') + 1]
    modifications = get_modifications(modifications_string)

    reference_cell = cells[cell_texts.index('Reference:') + 1]
    reference_string = reference_cell.text.strip()
    reference_anchor = reference_cell.find('a')
    reference_link = reference_anchor.get('href') if reference_anchor else ''
    references = get_references(reference_string, reference_link)

    amps[sequence] = {
        'bacteria': data,
        'references': references,
        'modifications': modifications,
        'url_sources': [url],
    }



KeyboardInterrupt: 

In [None]:
# Save the collected records as the dict's Python-literal text in "<DATABASE_NAME>.data".
output_path = DATABASE_NAME + ".data"
with open(output_path, 'w') as output_file:
    output_file.write(repr(amps))