In [1]:
import re
from lxml import html
import pandas as pd

In [2]:
# Load raw HTML
raw_html = open("../html_xml_samples/PMC/Recurrent WNT pathway alterations are frequent in relapsed small cell lung cancer - PMC        .html", "r").read()

In [3]:
tree = html.fromstring(raw_html)

supplementary_links = []

anchors = tree.xpath("//a[@data-ga-action='click_feat_suppl']")

for anchor in anchors:
    href = anchor.get("href")
    title = anchor.text_content().strip()

    # Extract ALL attributes from <a>
    anchor_attributes = anchor.attrib  # This gives you a dictionary of all attributes

    # Get <sup> sibling for file size/type info
    sup = anchor.getparent().xpath("./sup")
    file_info = sup[0].text_content().strip() if sup else "n/a"

    # Get <p> description if exists
    p_desc = anchor.getparent().xpath("./p")
    description = p_desc[0].text_content().strip() if p_desc else "n/a"

    # Extract attributes from parent <section> for context
    section = anchor.getparent().getparent()  # Assuming structure stays the same
    section_id = section.get('id', 'n/a')
    section_class = section.get('class', 'n/a')

    # Combine all extracted info
    link_data = {
        'link': href,
        'title': title,
        'file_info': file_info,
        'description': description,
        'section_id': section_id,
        'section_class': section_class,
    }

    # Merge anchor attributes (prefix keys to avoid collision)
    for attr_key, attr_value in anchor_attributes.items():
        link_data[f'a_attr_{attr_key}'] = attr_value

    supplementary_links.append(link_data)

# Convert to DataFrame
df_supp = pd.DataFrame(supplementary_links)

# Drop duplicates based on link
df_supp = df_supp.drop_duplicates(subset=['link'])

In [4]:
print(df_supp)

                                                  link  \
0           https://doi.org/10.1038/s41467-018-06162-9   
1          http://creativecommons.org/licenses/by/4.0/   
2    https://github.com/genome/genome/blob/master/l...   
3                     https://github.com/ahwagner/sclc   
4         http://cancer.sanger.ac.uk/cosmic/signatures   
..                                                 ...   
154  https://scholar.google.com/scholar_lookup?jour...   
155                https://doi.org/10.1038/nature14888   
156  https://scholar.google.com/scholar_lookup?jour...   
157            https://doi.org/10.1073/pnas.1006822107   
158  https://scholar.google.com/scholar_lookup?jour...   

                                                 title          file_info  \
0                           10.1038/s41467-018-06162-9  Unknown size/type   
1          http://creativecommons.org/licenses/by/4.0/  Unknown size/type   
2    https://github.com/genome/genome/blob/master/l...                 4

In [5]:
# Convert to DataFrame
df_supp = pd.DataFrame(df_supp)

In [6]:
df_supp

Unnamed: 0,link,title,file_info,description,section_id,section_class,a_attr_href,a_attr_class,a_attr_data-ga-action,a_attr_target,a_attr_rel
0,https://doi.org/10.1038/s41467-018-06162-9,10.1038/s41467-018-06162-9,Unknown size/type,No description,Unknown section,pmc-layout__citation font-secondary font-xs,https://doi.org/10.1038/s41467-018-06162-9,usa-link usa-link--external,click_feat_suppl,_blank,noopener noreferrer
1,http://creativecommons.org/licenses/by/4.0/,http://creativecommons.org/licenses/by/4.0/,Unknown size/type,No description,clp_a,d-panel p,http://creativecommons.org/licenses/by/4.0/,usa-link usa-link--external,click_feat_suppl,_blank,noopener noreferrer
2,https://github.com/genome/genome/blob/master/l...,https://github.com/genome/genome/blob/master/l...,43,No description,Sec9,No class,https://github.com/genome/genome/blob/master/l...,usa-link usa-link--external,click_feat_suppl,_blank,noopener noreferrer
3,https://github.com/ahwagner/sclc,https://github.com/ahwagner/sclc,53,No description,Sec10,No class,https://github.com/ahwagner/sclc,usa-link usa-link--external,click_feat_suppl,_blank,noopener noreferrer
4,http://cancer.sanger.ac.uk/cosmic/signatures,http://cancer.sanger.ac.uk/cosmic/signatures,10,No description,Sec11,No class,http://cancer.sanger.ac.uk/cosmic/signatures,usa-link usa-link--external,click_feat_suppl,_blank,noopener noreferrer
...,...,...,...,...,...,...,...,...,...,...,...
154,https://scholar.google.com/scholar_lookup?jour...,Google Scholar,Unknown size/type,No description,Unknown section,ref-list font-sm,https://scholar.google.com/scholar_lookup?jour...,usa-link usa-link--external,click_feat_suppl,_blank,noopener noreferrer
155,https://doi.org/10.1038/nature14888,DOI,Unknown size/type,No description,Unknown section,ref-list font-sm,https://doi.org/10.1038/nature14888,usa-link usa-link--external,click_feat_suppl,_blank,noopener noreferrer
156,https://scholar.google.com/scholar_lookup?jour...,Google Scholar,Unknown size/type,No description,Unknown section,ref-list font-sm,https://scholar.google.com/scholar_lookup?jour...,usa-link usa-link--external,click_feat_suppl,_blank,noopener noreferrer
157,https://doi.org/10.1073/pnas.1006822107,DOI,Unknown size/type,No description,Unknown section,ref-list font-sm,https://doi.org/10.1073/pnas.1006822107,usa-link usa-link--external,click_feat_suppl,_blank,noopener noreferrer
