**The code was developed and tested in Google Colab**

The code below showcases the following:


1.   Extraction of content from the aspirin-related links (FDA website) that are pre-pasted in an Excel sheet; the extracted content is saved and can be downloaded as a Word file and a CSV file.
2.   Adverse Event extraction from the content based on predefined lexicon along with brand names.


3.   Filtering the adverse event extraction for considering only negative events using manual filtering method and sentiment analysis.
4.   BioPortal ontology mapping.


5.   UI presentation using streamlit.
6.   Hosting the web server on ngrok.








In [None]:
!pip install requests beautifulsoup4 python-docx PyMuPDF

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m78.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx, PyMuPDF
Successfully installed PyMuPDF-1.26.3 python-docx-1.2.0


In [None]:
import os
import re
import time
import requests
import mimetypes
import pandas as pd
from bs4 import BeautifulSoup
from docx import Document
from urllib.parse import urlparse
import fitz  # PyMuPDF
from io import BytesIO
from docx import Document as DocxReader

def clean_text(text):
    """Strip every character outside the allowed set from *text*.

    Characters that are kept: tab, line feed, carriage return, printable
    ASCII (space through ``~``) and code points U+00A0 through U+FFFF.
    Anything that is not a ``str`` yields an empty string.
    """
    if isinstance(text, str):
        disallowed = r'[^\x09\x0A\x0D\x20-\x7E\u00A0-\uFFFF]'
        return re.sub(disallowed, '', text)
    return ""

#Load URLs
# Read the spreadsheet of source links. Header whitespace is stripped so the
# "Links" column is found even when the sheet's header cell has stray spaces.
df = pd.read_excel("/content/Aspirin_trail.xlsx")
df.columns = df.columns.str.strip()
urls = df["Links"].dropna().tolist()  # empty cells in the column are dropped

# Word document that receives one section per scraped source.
doc = Document()
doc.add_heading("Aspirin FDA Text Extracts", level=0)
scraped_data = []  # one dict per scraped source, turned into a DataFrame later

#Webpage content extraction
def extract_fda_text(soup):
    """Return the main body text of a parsed HTML page.

    Content containers (``div.field-item``, ``article``, ``div[role='main']``)
    are collected in document order and their text is joined with newlines.
    A container that sits inside another matched container is skipped: its
    text is already part of the outer container's text, and emitting it again
    would duplicate every sentence in the output.

    If no container yields any text, the text of every ``<p>`` element is
    used instead.

    Args:
        soup: Parsed document (a BeautifulSoup object or compatible).

    Returns:
        The extracted text, one container (or paragraph) per line; an empty
        string when the page has neither containers nor paragraphs.
    """
    containers = soup.select("div.field-item, article, div[role='main']")
    # Track matches by identity so two distinct tags that happen to contain
    # identical markup are never mistaken for one another.
    matched_ids = {id(tag) for tag in containers}

    content = []
    for container in containers:
        if any(id(parent) in matched_ids for parent in container.parents):
            continue  # nested match: already covered by its ancestor
        text = container.get_text(separator=' ', strip=True)
        if text:
            content.append(text)

    if not content:
        content = [p.get_text(strip=True) for p in soup.find_all("p")]
    return "\n".join(content)

#File content extraction
def extract_from_pdf(content_bytes):
    """Return the text of every page of an in-memory PDF, concatenated.

    The PDF is opened from *content_bytes* with PyMuPDF; page texts are
    joined in page order with no separator added between them.
    """
    with fitz.open(stream=content_bytes, filetype="pdf") as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)

def extract_from_docx(content_bytes):
    """Return the paragraph text of an in-memory .docx file.

    The bytes are wrapped in a ``BytesIO`` stream, opened with python-docx,
    and the text of each paragraph is joined with newlines.
    """
    parsed = DocxReader(BytesIO(content_bytes))
    paragraph_texts = (paragraph.text for paragraph in parsed.paragraphs)
    return "\n".join(paragraph_texts)

def extract_from_txt(content_bytes):
    """Decode raw bytes as UTF-8, dropping any bytes that cannot be decoded."""
    decoded = content_bytes.decode(encoding="utf-8", errors="ignore")
    return decoded

def handle_download(url, content_type, content_bytes):
    """Extract text from a downloaded non-HTML resource.

    The extractor is chosen from the Content-Type header and the URL,
    checked in this order: PDF, Word, plain text.

    Args:
        url: Address the content was fetched from.
        content_type: Content-Type header value (the caller lower-cases it).
        content_bytes: Raw response body.

    Returns:
        ``(text, label)`` where *label* names the detected file type. For an
        unrecognised type the text is a ``[SKIPPED]`` marker and the label
        is ``"Unknown"``.
    """
    if 'pdf' in content_type:
        return extract_from_pdf(content_bytes), "PDF Document"

    looks_like_word = 'word' in content_type or 'docx' in url
    if looks_like_word:
        return extract_from_docx(content_bytes), "Word Document"

    looks_like_text = 'text' in content_type or url.endswith(".txt")
    if looks_like_text:
        return extract_from_txt(content_bytes), "Text File"

    return "[SKIPPED] Unsupported file type", "Unknown"

#URL handler
def scrape_url(url):
    """Fetch *url* and return ``(body_text, label)``.

    For HTML responses the label is the page title (``"No Title"`` when the
    page has none). For other content types the body and label come from
    ``handle_download``.

    Failures never raise. A non-200 status returns an ``[ERROR]`` marker
    with the label ``"No Title"``; any exception returns an ``[EXCEPTION]``
    marker with the label ``"Error"``.
    """
    try:
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=15,
        )
        status = response.status_code
        if status != 200:
            return f"[ERROR] Status code {status}", "No Title"

        content_type = response.headers.get("Content-Type", "").lower()
        if "html" not in content_type:
            # Downloadable file: the second element is a file-type label.
            body, filetype = handle_download(url, content_type, response.content)
            return body, filetype

        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find("title")
        if title_tag:
            title = title_tag.get_text(strip=True)
        else:
            title = "No Title"
        body = extract_fda_text(soup)
        return body, title

    except Exception as e:
        return f"[EXCEPTION] {str(e)}", "Error"

#Main loop
# scrape_url reports every failure as a marker-prefixed string instead of
# raising. All three markers must be filtered here; otherwise HTTP errors and
# exception messages are saved as if they were page text and then fed to the
# adverse-event extraction.
FAILURE_MARKERS = ("[SKIPPED]", "[ERROR]", "[EXCEPTION]")

for i, url in enumerate(urls):
    print(f"Processing {i+1}/{len(urls)}: {url}")
    body_text, page_title = scrape_url(url)

    if body_text.startswith(FAILURE_MARKERS):
        # The marker string carries the reason (status code / exception text).
        print(f"Skipping {url}: {body_text}")
        continue
    if not body_text.strip():
        print(f"Skipping unsupported or empty content: {url}")
        continue

    # Clean once and reuse for both the CSV row and the Word document.
    title = clean_text(page_title)
    text = clean_text(body_text)

    scraped_data.append({
        "URL": url,
        "Title": title,
        "Text": text
    })

    doc.add_heading(title, level=1)
    doc.add_paragraph(clean_text(url))
    doc.add_paragraph(text)

    time.sleep(1)  # pause between successful fetches

# Save outputs
# Explicit columns keep the CSV header intact even when nothing was scraped,
# so a later pd.read_csv on the file does not fail on an empty file.
scraped_df = pd.DataFrame(scraped_data, columns=["URL", "Title", "Text"])
scraped_df.to_csv("aspirin_scraped_texts.csv", index=False)
doc.save("aspirin_scraped_texts.docx")
print("Web + File scraping completed.")

Processing 1/101: https://medlineplus.gov/druginfo/meds/a682878.html
Processing 3/101: https://www.fda.gov/drugs/understanding-over-counter-medicines/safe-use-aspirin
Processing 4/101: https://www.fda.gov/drugs/safe-use-aspirin/aspirin-questions-and-answers
Processing 5/101: https://www.fda.gov/drugs/drug-safety-and-availability/fda-drug-safety-communication-fda-warns-about-serious-bleeding-risk-over-counter-antacid-products
Processing 6/101: https://www.fda.gov/drugs/postmarket-drug-safety-information-patients-and-providers/information-about-taking-ibuprofen-and-aspirin-together
Processing 7/101: https://www.fda.gov/animal-veterinary/product-safety-information/dear-veterinarian-letter-regarding-use-aspirin-products-lactating-dairy-cattle
Processing 8/101: https://www.fda.gov/consumers/articulos-para-el-consumidor-en-espanol/advertencia-los-antiacidos-que-contienen-aspirina-pueden-causar-sangrado
Processing 9/101: https://www.fda.gov/drugs/safe-use-aspirin/aspirin-reducing-your-risk-he

In [None]:
import pandas as pd
import spacy
from spacy.matcher import Matcher

# Scraped texts written by the scraping cell (columns: URL, Title, Text).
df = pd.read_csv("aspirin_scraped_texts.csv")
# spaCy English pipeline used for tokenization and sentence boundaries below.
nlp = spacy.load("en_core_web_sm")

# Adverse-event terms searched for in the scraped text (matched
# case-insensitively, token by token).
ae_lexicon = [
    "abdominal pain", "acid reflux", "allergic reaction", "anaemia", "anxiety",
    "asthma", "back pain", "bleeding", "blood in stool", "bruising", "chest pain",
    "confusion", "constipation", "cough", "diarrhoea", "dizziness", "drowsiness",
    "dry mouth", "dyspnoea", "ear ringing", "eczema", "fatigue", "fever",
    "flatulence", "flushing", "gastritis", "gastrointestinal bleeding",
    "hair loss", "headache", "hearing loss", "heartburn", "hives", "hypertension",
    "indigestion", "internal bleeding", "itching", "joint pain", "kidney failure",
    "light-headedness", "liver toxicity", "loss of appetite", "mouth ulcers",
    "muscle pain", "nausea", "nosebleed", "palpitations", "rash", "renal failure",
    "seizure", "shortness of breath", "skin irritation", "sleepiness",
    "stomach pain", "sweating", "ulcer", "upper GI bleeding", "urine discolouration",
    "vertigo", "vomiting", "weakness",
    # American spellings of the British-spelled entries above. The sources
    # are US sites (FDA, MedlinePlus), so without these variants the
    # corresponding events are never found.
    "anemia", "diarrhea", "dyspnea", "lightheadedness", "urine discoloration",
]

# Aspirin brand and product names looked up in each matched sentence.
# detect_brand walks this list in order and returns the first hit, so the
# position of an entry decides which name wins when several apply.
aspirin_brand_names = ["Bayer", "Bufferin", "Ecotrin", "Ascriptin", "Anacin", "Excedrin", "Disprin", "Alka-Seltzer",
    "Aspilet", "Cardiprin", "CVS Health Aspirin", "Walgreens Aspirin", "Kirkland Signature Aspirin",
    "Rite Aid Aspirin", "Equate Aspirin", "St. Joseph Low Dose Aspirin", "Nu-Seals Aspirin", "Caprin",
    "Boots Aspirin", "Ecosprin", "ASA", "Aspirin IP", "Clopivas-AP", "Cartia", "Astrix", "Solprin",
    "Herron Aspirin", "Halfprin", "Durlaza", "Fasprin", "Aspocid", "Disprin CV", "Aspir-low",
    "Aspirin Protect", "Aspir 81", "ASA EC", "Aspro Clear", "Aspro", "Aspirina Bayer", "Micropirin",
    "HeartSure Aspirin", "Acetylsalicylic Acid Tablets", "Cartia XT", "Asasantin", "Heron Low Dose Aspirin",
    "Aspro Clear Extra", "Aspirin Cardio", "Aspilet 81 mg", "Aspenter", "Thrombo ASS", "Coraspin",
    "Aspirin Protect 100", "Aspirin Dispersible IP", "Aspirin BP", "Aspocid 75", "Aspicot", "Ecosprin AV",
    "Clopitab A", "Aspin", "Enteric Coated Aspirin", "Low Dose Aspirin 81 mg", "Baby Aspirin", "ASA Tablets"
]

def detect_brand(sentence, brands):
    """Return the first brand from *brands* mentioned in *sentence*.

    Matching is case-insensitive and only accepts whole words or phrases:
    a brand counts only when it is not glued to other word characters.
    Short names such as "ASA" are therefore not found inside unrelated
    words like "nasal".

    Args:
        sentence: Sentence text to search.
        brands: Brand names to try, in priority order.

    Returns:
        The matching entry of *brands* (original spelling), or ``"Generic"``
        when none is mentioned.
    """
    import re  # local import: this cell does not import re itself

    sentence_lower = sentence.lower()  # lower-case once, not once per brand
    for brand in brands:
        # Lookarounds instead of \b so names that start or end with a
        # non-word character are still matched correctly.
        pattern = r"(?<!\w)" + re.escape(brand.lower()) + r"(?!\w)"
        if re.search(pattern, sentence_lower):
            return brand
    return "Generic"

# One token-level pattern per lexicon term, keyed by the term itself.
matcher = Matcher(nlp.vocab)
for ae in ae_lexicon:
    # Tokenize the term with spaCy's own tokenizer rather than str.split():
    # the tokenizer breaks hyphenated words such as "light-headedness" into
    # several tokens, so a single-token pattern built from a whitespace
    # split could never match the document.
    pattern = [{"LOWER": token.lower_} for token in nlp.make_doc(ae)]
    matcher.add(ae, [pattern])

# Run the matcher over every scraped text and record one row per hit:
# the matched term, its enclosing sentence, and the brand found there.
results = []
for _, row in df.iterrows():
    doc = nlp(str(row["Text"]))

    for _match_id, start, end in matcher(doc):
        span = doc[start:end]
        sentence_text = span.sent.text
        matched_term = span.text.strip()
        brand_name = detect_brand(sentence_text, aspirin_brand_names)

        record = {
            "URL": row["URL"],
            "Sentence": sentence_text.strip(),
            "AE_Term": matched_term,
            "Brand_Name": brand_name,
            "Rule": "Lexicon Pattern Match"
        }
        results.append(record)

output_df = pd.DataFrame(results)
output_df.to_csv("aspirin_ae_with_brands.csv", index=False)

print("AE extraction with brand names complete.")

AE extraction with brand names complete.


In [None]:
import pandas as pd

# Load the AE DataFrame (output of the lexicon-matching cell)
ae_df = pd.read_csv("/content/aspirin_ae_with_brands.csv")

# List of positive keywords-manual filtering.
# A sentence containing any of these is treated as describing a benefit or a
# benign outcome and is excluded from the negative-event set.
positive_keywords = [
    "relief", "relieved", "improved", "improvement", "better", "effective", "effectively",
    "alleviated", "alleviation", "reduced", "reduction", "controlled", "resolved", "stable",
    "managed", "beneficial", "helped", "helpful", "successfully", "positive response",
    "pain-free", "tolerated", "no adverse event", "well tolerated", "responded well",
    "good outcome", "no issues", "no side effects", "no symptoms", "enhanced", "less pain",
    "comfort", "safe", "safety", "no problems", "symptom-free", "well-being", "cured",
    "recover", "recovered", "normal", "without incident", "healed", "immunity", "boosted",
    "prevented", "preventive", "ameliorated", "reassuring", "no complications",
    "no concerns", "non-serious", "resolved spontaneously", "subsided", "tolerable"
]

#Only keep rows where none of the positive keywords appear in the sentence
def is_negative_ae(sentence, keywords=None):
    """Return True when *sentence* contains none of the positive keywords.

    Keywords are matched case-insensitively as whole words or phrases. A
    plain substring test would treat "discomfort", "abnormal", "unsafe" or
    "uncontrolled" as positive (they contain "comfort", "normal", "safe",
    "controlled") and wrongly discard genuine adverse events.

    Args:
        sentence: Sentence to test; converted with ``str()`` first, so
            missing values are handled without raising.
        keywords: Optional iterable of positive keywords. Defaults to the
            module-level ``positive_keywords`` list.

    Returns:
        ``True`` if the sentence should be kept as a (potentially) negative
        event, ``False`` if a positive keyword was found.
    """
    import re  # local import: this cell does not import re itself

    if keywords is None:
        keywords = positive_keywords
    sentence_lower = str(sentence).lower()
    return not any(
        re.search(r"(?<!\w)" + re.escape(pos_kw.lower()) + r"(?!\w)", sentence_lower)
        for pos_kw in keywords
    )

# Keep only the rows whose sentence passes the keyword filter.
keep_mask = ae_df["Sentence"].apply(is_negative_ae)
negative_ae_df = ae_df[keep_mask]

# Save to CSV
negative_ae_path = "aspirin_negative_ae_only.csv"
negative_ae_df.to_csv(negative_ae_path, index=False)

# Last expression of the cell: the notebook displays the output path.
negative_ae_path


'aspirin_negative_ae_only.csv'

In [None]:
import pandas as pd
from textblob import TextBlob

# Load annotated AE file (output of the lexicon-matching cell)
ae_df = pd.read_csv("aspirin_ae_with_brands.csv")  # Rename this as needed

# Positive keywords filter.
# A sentence containing any of these is treated as describing a benefit or a
# benign outcome and is removed before sentiment analysis.
positive_keywords = [
    "relief", "relieved", "improved", "improvement", "better", "effective", "effectively",
    "alleviated", "alleviation", "reduced", "reduction", "controlled", "resolved", "stable",
    "managed", "beneficial", "helped", "helpful", "successfully", "positive response",
    "pain-free", "tolerated", "no adverse event", "well tolerated", "responded well",
    "good outcome", "no issues", "no side effects", "no symptoms", "enhanced", "less pain",
    "comfort", "safe", "safety", "no problems", "symptom-free", "well-being", "cured",
    "recover", "recovered", "normal", "without incident", "healed", "immunity", "boosted",
    "prevented", "preventive", "ameliorated", "reassuring", "no complications",
    "no concerns", "non-serious", "resolved spontaneously", "subsided", "tolerable"
]

# Remove positive sentences
def is_potentially_negative(sentence, keywords=None):
    """Return True when *sentence* contains none of the positive keywords.

    Keywords are matched case-insensitively as whole words or phrases, so
    words that merely contain a keyword ("discomfort", "abnormal",
    "unstable", "ineffective") no longer cause a sentence to be dropped as
    positive.

    Args:
        sentence: Sentence to test; converted with ``str()`` first, so
            missing values are handled without raising.
        keywords: Optional iterable of positive keywords. Defaults to the
            module-level ``positive_keywords`` list.

    Returns:
        ``True`` if no positive keyword occurs in the sentence.
    """
    import re  # local import: this cell does not import re itself

    if keywords is None:
        keywords = positive_keywords
    sentence_lower = str(sentence).lower()
    return not any(
        re.search(r"(?<!\w)" + re.escape(keyword.lower()) + r"(?!\w)", sentence_lower)
        for keyword in keywords
    )

# First pass: drop every row whose sentence contains a positive keyword.
keyword_mask = ae_df["Sentence"].apply(is_potentially_negative)
keyword_filtered_df = ae_df[keyword_mask]

# Sentiment analysis to keep only negative-toned sentences
def is_negative_sentiment(sentence):
    """Return True when TextBlob scores *sentence* with negative polarity.

    The input is converted with ``str()`` before scoring. Scoring is
    best-effort: if TextBlob raises, the sentence is treated as not negative
    and ``False`` is returned. Only ``Exception`` is caught, so
    ``KeyboardInterrupt`` and ``SystemExit`` still stop a long run.
    """
    try:
        polarity = TextBlob(str(sentence)).sentiment.polarity
    except Exception:
        return False
    return polarity < 0

# Second pass: of the keyword-filtered rows, keep those with negative sentiment.
sentiment_mask = keyword_filtered_df["Sentence"].apply(is_negative_sentiment)
final_filtered_df = keyword_filtered_df[sentiment_mask]

# Save to CSV
final_filtered_df.to_csv("aspirin_negative_ae_final.csv", index=False)
print("Final file saved as 'aspirin_negative_ae_final.csv'")


Final file saved as 'aspirin_negative_ae_final.csv'


In [None]:
import pandas as pd
import requests
from urllib.parse import quote

# Load filtered AE file
df = pd.read_csv("aspirin_negative_ae_final.csv")

# Setup BIOPortal
import os  # local to this cell; used only to read the API key override

# NOTE(review): the fallback key below is committed in plain text. Rotate it
# and supply the replacement through the BIOPORTAL_API_KEY environment
# variable instead of the source.
API_KEY = os.environ.get("BIOPORTAL_API_KEY", "6b9bc62c-283d-4ec5-805c-60f926e45feb")
# https, not http: the API key travels in the query string.
BIOPORTAL_URL = "https://data.bioontology.org/search?q={}&require_exact_match=true&apikey={}"
MAPPING_COLUMNS = ["AE_Term", "Mapped_ID", "Preferred_Label", "Ontology"]


def _empty_mapping(term):
    """Return a placeholder mapping row for a term without a usable match."""
    return {
        "AE_Term": term,
        "Mapped_ID": '',
        "Preferred_Label": '',
        "Ontology": ''
    }


def _lookup_term(term):
    """Query BioPortal for *term* and return its mapping row.

    The first search hit is used as the best match. A non-200 response or an
    empty result list yields an empty placeholder row. Network and parsing
    errors propagate to the caller.
    """
    url = BIOPORTAL_URL.format(quote(term), API_KEY)
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        print(f"HTTP error for {term}: {response.status_code}")
        return _empty_mapping(term)

    collection = response.json().get('collection') or []
    if not collection:
        return _empty_mapping(term)

    best_match = collection[0]
    return {
        "AE_Term": term,
        "Mapped_ID": best_match.get('@id', ''),
        "Preferred_Label": best_match.get('prefLabel', ''),
        # The ontology link is a URL; its last path segment is the acronym.
        "Ontology": best_match.get('links', {}).get('ontology', '').split('/')[-1]
    }


# Get unique AE terms
unique_terms = df['AE_Term'].dropna().unique()

# Map each AE term using BioPortal. Every term gets exactly one row, whether
# the lookup succeeded, returned an HTTP error, or raised.
results = []
for term in unique_terms:
    try:
        results.append(_lookup_term(term))
    except Exception as e:
        # Best-effort: one failing term must not abort the whole mapping run.
        print(f"Exception for {term}: {e}")
        results.append(_empty_mapping(term))

# Merge mapped results back into the filtered AE dataset.
# Explicit columns guarantee "AE_Term" exists even when there were no terms
# to map; without them the merge below raises KeyError on an empty result.
mapped_df = pd.DataFrame(results, columns=MAPPING_COLUMNS)
final_mapped_df = pd.merge(df, mapped_df, on="AE_Term", how="left")

# Save output
output_path = "aspirin_negative_ae_final_mapped.csv"
final_mapped_df.to_csv(output_path, index=False)

print(f"Ontology mapping complete. Saved to {output_path}")


Ontology mapping complete. Saved to aspirin_negative_ae_final_mapped.csv


In [None]:
# Zero-shot AE filtering with the facebook/bart-large-mnli model.
# NOTE(review): the output file name mentions "clinicalbert", but the model
# loaded below is BART-large-MNLI; the name is kept so existing paths still work.

# pipeline and tqdm are used below but were not imported by any cell.
import pandas as pd
from tqdm import tqdm
from transformers import pipeline

INPUT_FILE = "/content/aspirin_ae_with_brands.csv"
OUTPUT_FILE = "/content/aspirin_ae_clinicalbert_zero_shot_final.csv"

df = pd.read_csv(INPUT_FILE)

# Fail early with a clear message if the input lacks an expected column.
required_cols = ["Sentence", "AE_Term", "Brand_Name", "URL"]
for col in required_cols:
    if col not in df.columns:
        raise ValueError(f"Missing column: {col}")

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
candidate_labels = ["adverse event", "normal statement", "beneficial effect"]

results = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    sentence = row["Sentence"]
    ae_term = row["AE_Term"]
    brand = row["Brand_Name"]
    url = row["URL"]

    # Missing values and blank sentences cannot be classified.
    if not isinstance(sentence, str) or sentence.strip() == "":
        continue

    try:
        output = classifier(sentence, candidate_labels)
        # Index 0 of the returned labels/scores is taken as the prediction.
        prediction_label = output["labels"][0]
        confidence_score = round(output["scores"][0], 4)

        results.append({
            "Sentence": sentence,
            "AE Term": ae_term,
            "Brand": brand,
            "Prediction Label": prediction_label,
            "Confidence Score": confidence_score,
            "Paper Link": url
        })
    except Exception as e:
        # Best-effort: report the failing sentence and carry on with the rest.
        print(f"Error processing sentence: {e}")
        continue

df_out = pd.DataFrame(results)

df_out.to_csv(OUTPUT_FILE, index=False)
print(f"Classification complete. File saved to: {OUTPUT_FILE}")

In [None]:
%%writefile app.py
"""Streamlit viewer for the ontology-mapped aspirin adverse-event table."""
import streamlit as st
import pandas as pd

# Page configuration has to be the first Streamlit command of the script, so
# it runs before the cached loader below is called.
st.set_page_config(page_title="AE Viewer", layout="wide")


@st.cache_data
def load_data():
    """Read the mapped AE CSV; cached so reruns do not re-read the file."""
    return pd.read_csv("aspirin_negative_ae_final_mapped.csv")


df = load_data()

st.title("Aspirin Adverse Events Ontology Mapping")

# Sidebar Filters
with st.sidebar:
    st.header("Filter Options")
    ae_terms = st.multiselect("Filter by AE Term", sorted(df["AE_Term"].dropna().unique()))
    brands = st.multiselect("Filter by Brand Name", sorted(df["Brand_Name"].dropna().unique()))
    ontologies = st.multiselect("Filter by Ontology", sorted(df["Ontology"].dropna().unique()))

    # An empty selection means "no restriction" for that column.
    filtered_df = df.copy()
    if ae_terms:
        filtered_df = filtered_df[filtered_df["AE_Term"].isin(ae_terms)]
    if brands:
        filtered_df = filtered_df[filtered_df["Brand_Name"].isin(brands)]
    if ontologies:
        filtered_df = filtered_df[filtered_df["Ontology"].isin(ontologies)]

# Display Results
st.markdown(f"### Showing {len(filtered_df)} Records")
st.dataframe(filtered_df, use_container_width=True)

# Summary Section (computed over the full table, not the filtered view)
st.markdown("### Summary")
col1, col2, col3 = st.columns(3)
col1.metric("Unique AE Terms", df["AE_Term"].nunique())
col2.metric("Mapped Ontologies", df["Ontology"].nunique())
col3.metric("Total Records", len(df))

# File Download
st.markdown("### Download")
st.download_button(
    label="Download Full CSV",
    data=df.to_csv(index=False).encode("utf-8"),
    file_name="aspirin_negative_ae_final_mapped.csv",
    mime="text/csv"
)

st.download_button(
    label="Download Filtered CSV",
    data=filtered_df.to_csv(index=False).encode("utf-8"),
    file_name="filtered_aspirin_ae.csv",
    mime="text/csv"
)


Writing app.py


In [None]:
# Open Colab's file-upload dialog; the call's result is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()

Saving aspirin_negative_ae_final_mapped.csv to aspirin_negative_ae_final_mapped (1).csv


In [None]:
!pip install streamlit pyngrok --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# ngork to launch the UI
from pyngrok import ngrok
import time
ngrok.set_auth_token("2zjUnOGKNzBXtqR9A0lPIFRAdH5_fRSU4vT3LWBugMkACPLe")

!pkill streamlit
public_url = ngrok.connect(8501)
print("Public URL:", public_url)
!streamlit run app.py &>/content/log.txt &
time.sleep(10)
print("App is now running!")


Public URL: NgrokTunnel: "https://1842c14cce33.ngrok-free.app" -> "http://localhost:8501"
App is now running!
