In [29]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("../data/prompt_engineering/gpt_files/GPT_subset_triples_prompt2_param1.csv")

# Count unique URLs
unique_url_count = df['URL'].nunique()

print(f"Number of unique URLs: {unique_url_count}")

Number of unique URLs: 44


In [9]:
import pandas as pd
import requests
from urllib.parse import urlparse
from tqdm import tqdm

# Load the Excel file
#df = pd.read_excel("../data/CBM_data/Data_CBM.xlsx")
df = pd.read_excel("../data/CBM_data/Data_CBM_with_GitHub_URLs.xlsx")

# Get unique URLs
urls = df['GitHub_Image_URL'].dropna().unique().tolist()

# Diagnostic function
def diagnose_url(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    result = {
        'URL': url,
        'Status': None,
        'Content-Type': None,
        'Redirect': None,
        'Is HTTPS': url.startswith("https://"),
        'Likely GPT-Compatible': False,
        'Error': None
    }
    
    try:
        # HEAD first (faster), fallback to GET
        r = requests.head(url, headers=headers, timeout=10, allow_redirects=True)
        if r.status_code in [405, 403, 400]:  # Some sites block HEAD
            r = requests.get(url, headers=headers, timeout=10, stream=True, allow_redirects=True)
        
        result['Status'] = r.status_code
        result['Content-Type'] = r.headers.get('Content-Type', None)
        result['Redirect'] = r.url if r.url != url else None
        
        # Basic heuristics for GPT compatibility
        if (
            r.status_code == 200 and
            result['Content-Type'] is not None and
            result['Content-Type'].startswith("image") and
            result['Is HTTPS']
        ):
            result['Likely GPT-Compatible'] = True

    except Exception as e:
        result['Error'] = str(e)

    return result

# Run diagnostics
diagnostics = [diagnose_url(url) for url in tqdm(urls)]

# Convert to DataFrame
df_diag = pd.DataFrame(diagnostics)

# Show problematic URLs
bad_urls = df_diag[~df_diag['Likely GPT-Compatible']]
print(f"\n⚠️ Found {len(bad_urls)} potentially problematic URLs out of {len(urls)}.")
bad_urls[['URL', 'Status', 'Content-Type', 'Redirect', 'Error']]

# Optionally: save to Excel
df_diag.to_excel("../data/CBM_data/CBM_URL_Diagnostics.xlsx", index=False)


  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:20<00:00,  4.90it/s]


⚠️ Found 0 potentially problematic URLs out of 100.





In [3]:
import pandas as pd

# Load the diagnostics file
df_diag = pd.read_excel("../data/CBM_data/CBM_URL_Diagnostics.xlsx")

# Filter only failed cases
bad_urls = df_diag[df_diag['Likely GPT-Compatible'] == False]

# Check Content-Type and extension
def is_likely_html(row):
    content_type = str(row['Content-Type']).lower()
    url = str(row['URL']).lower()
    return (
        'html' in content_type or
        'text' in content_type or
        '?' in url or
        not url.endswith(('.jpg', '.jpeg', '.png', '.gif', '.svg'))
    )

# Flag suspicious MIME type or URL pattern
bad_urls['Looks like HTML or wrapper'] = bad_urls.apply(is_likely_html, axis=1)

# Show results
print(f"⚠️ Suspicious URLs (non-image content or dynamic endpoints):")
bad_urls[bad_urls['Looks like HTML or wrapper']][['URL', 'Content-Type', 'Redirect']]


⚠️ Suspicious URLs (non-image content or dynamic endpoints):


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bad_urls['Looks like HTML or wrapper'] = bad_urls.apply(is_likely_html, axis=1)


Unnamed: 0,URL,Content-Type,Redirect
0,https://www.mdpi.com/cells/cells-12-00816/arti...,text/html,
2,https://www.mdpi.com/cells/cells-12-00684/arti...,text/html,
10,https://www.mdpi.com/ijms/ijms-22-04081/articl...,text/html,
12,https://www.mdpi.com/jcm/jcm-10-01947/article_...,text/html,
19,https://www.mdpi.com/life/life-12-00285/articl...,text/html,
21,https://www.mdpi.com/viruses/viruses-15-01598/...,text/html,
27,https://www.mdpi.com/ijms/ijms-22-04081/articl...,text/html,
31,https://www.mdpi.com/viruses/viruses-15-00400/...,text/html,
39,https://www.mdpi.com/brainsci/brainsci-13-0013...,text/html,
44,https://www.mdpi.com/brainsci/brainsci-10-0085...,text/html,


In [7]:
import pandas as pd

# Load the file
df = pd.read_excel("../data/CBM_data/Data_CBM.xlsx")

# Add GitHub raw URL column based on Image_number
base_url = "https://raw.githubusercontent.com/Elly0w0/image-based-information-extraction-LLM-main/main/data/CBM_data/images_CBM/"

df['GitHub_Image_URL'] = df['Image_number'].apply(lambda x: f"{base_url}{x}.jpg")

# Save updated file (optional)
df.to_excel("../data/CBM_data/Data_CBM_with_GitHub_URLs.xlsx", index=False)

# Preview
df[['Image_number', 'GitHub_Image_URL']].head()

Unnamed: 0,Image_number,GitHub_Image_URL
0,image_1,https://raw.githubusercontent.com/Elly0w0/imag...
1,image_2,https://raw.githubusercontent.com/Elly0w0/imag...
2,image_3,https://raw.githubusercontent.com/Elly0w0/imag...
3,image_4,https://raw.githubusercontent.com/Elly0w0/imag...
4,image_5,https://raw.githubusercontent.com/Elly0w0/imag...


Adding URLs to full text triples file

In [30]:
import pandas as pd

# Load both files
cbm_df = pd.read_excel("../data/CBM_data/Data_CBM.xlsx")
triples_df = pd.read_excel("../data/triples_output/Triples_From_Full_Text_CBM.xlsx")

# Clean PMIDs: remove commas and convert to string for reliable merge
cbm_df["PMID_clean"] = cbm_df["PMID"].astype(str).str.replace(",", "")
triples_df["PMID_clean"] = triples_df["Article ID"].astype(str)

# Map PMID to Image URL (from 'URL' column)
pmid_to_url = cbm_df.set_index("PMID_clean")["URL"].to_dict()

# Add URL column to triples dataframe
triples_df["Image_URL"] = triples_df["PMID_clean"].map(pmid_to_url)

# Drop helper column if not needed
triples_df.drop(columns=["PMID_clean"], inplace=True)

# Save or display the updated dataframe
triples_df.to_excel("../data/triples_output/Triples_From_Full_Text_CBM_with_URLs.xlsx", index=False)
triples_df.to_csv("../data/triples_output/Triples_From_Full_Text_CBM_with_URLs.csv", index=False)

# Optionally show sample
triples_df.head()


Unnamed: 0,Article ID,Title,Paragraph,Pathophysiological Process,Subject,Predicate,Object,Image_URL
0,36899952,Pathogenesis Underlying Neurological Manifesta...,Relating specifically to neurological symptoms...,Hematogenous_Spread,SARS-CoV-2,invades,choroid_plexus_cells,https://www.mdpi.com/cells/cells-12-00816/arti...
1,36899952,Pathogenesis Underlying Neurological Manifesta...,Relating specifically to neurological symptoms...,Hematogenous_Spread,ACE2_receptors,are_expressed_on,choroid_epithelium,https://www.mdpi.com/cells/cells-12-00816/arti...
2,36899952,Pathogenesis Underlying Neurological Manifesta...,Relating specifically to neurological symptoms...,Hematogenous_Spread,SARS-CoV-2,causes,ependymal_cell_death,https://www.mdpi.com/cells/cells-12-00816/arti...
3,36899952,Pathogenesis Underlying Neurological Manifesta...,Relating specifically to neurological symptoms...,Hematogenous_Spread,SARS-CoV-2,disrupts,blood–CSF_barrier,https://www.mdpi.com/cells/cells-12-00816/arti...
4,36899952,Pathogenesis Underlying Neurological Manifesta...,"In contrast, persistent anosmia is a symptom o...",Viral_Damage_to_Olfactory_Epithelium,SARS-CoV-2,infects,non-neural_cell_types,https://www.mdpi.com/cells/cells-12-00816/arti...
