In [None]:
import requests

def fetch_html(url, output_file):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(response.text)
        print(f"Successfully fetched HTML and saved to {output_file}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")

website_url = "https://snielebock.github.io/mrpua/Sorter.html"
output_html_file = "page_content.html"
fetch_html(website_url, output_html_file )

Successfully fetched HTML and saved to page_content.html


In [24]:
import re
import json

def extract_papers_from_html(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        html_content = f.read()

    # Regex to find the JavaScript array 'objects'
    match = re.search(r"const objects = (.*?);\s*// Function to filter objects", html_content, re.DOTALL)

    papers_data = []
    if match:
        json_str = match.group(1).strip()
        try:
            data = json.loads(json_str)
            for item in data:
                papers_data.append({
                    'id': item.get('pID'),
                    'title': item.get('Title'),
                    'artifact_url': item.get('ArtifactURL'),
                    'doi_url': item.get('DOIURL')
                })
        except json.JSONDecodeError as e:
            print(f"JSON decoding error: {e}")
            # Provide more context for debugging
            start = max(0, e.pos - 50)
            end = e.pos + 50
            print(f"Problematic string part: ...{json_str[start:end]}...")
    else:
        print("Could not find the 'objects' JavaScript array in the HTML file.")

    return papers_data

if __name__ == "__main__":
    html_file = "page_content.html"
    papers = extract_papers_from_html(html_file)
    if papers:
        print(f"Successfully extracted {len(papers)} papers.")
        # Save the extracted data to a JSON file for further processing
        with open("papers_data.json", "w", encoding="utf-8") as f:
            json.dump(papers, f, indent=4)
        print("Saved paper data to papers_data.json")
    else:
        print("Failed to extract papers.")

Successfully extracted 189 papers.
Saved paper data to papers_data.json


In [35]:
def find_and_download_pdf(doi_url, title):
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        resp = requests.get(doi_url, headers=headers, allow_redirects=True, timeout=20)
        # If the final URL is a PDF, or the content-type is PDF
        if resp.url.lower().endswith('.pdf') or 'pdf' in resp.headers.get('content-type', '').lower():
            pdf_url = resp.url
        else:
            # Parse the landing page for PDF links
            soup = BeautifulSoup(resp.text, 'html.parser')
            pdf_url = None
            for a in soup.find_all('a', href=True):
                href = a['href']
                if '.pdf' in href.lower():
                    if href.startswith('/'):
                        base_url = '{uri.scheme}://{uri.netloc}'.format(uri=requests.utils.urlparse(resp.url))
                        href = base_url + href
                    elif not href.startswith('http'):
                        href = resp.url.rstrip('/') + '/' + href
                    pdf_url = href
                    break
        if not pdf_url:
            print(f"❌ No PDF found for {doi_url}")
            return None
        # Download the PDF
        pdf_resp = requests.get(pdf_url, headers=headers, timeout=30)
        if pdf_resp.status_code == 200 and 'pdf' in pdf_resp.headers.get('content-type', '').lower():
            os.makedirs('pdf', exist_ok=True)
            safe_title = "".join(c if c.isalnum() else "_" for c in title)[:100]
            pdf_path = os.path.join('pdf', f"{safe_title}.pdf")
            with open(pdf_path, 'wb') as f:
                f.write(pdf_resp.content)
            print(f"✅ Downloaded: {pdf_path}")
            return pdf_path
        else:
            print(f"❌ Failed to download PDF from: {pdf_url}")
            return None
    except Exception as e:
        print(f"❌ Error for {doi_url}: {e}")
        return None

# Usage example:
papers = []
try:
    with open("papers_data.json", "r", encoding="utf-8") as f:
        papers = json.load(f)
except FileNotFoundError:
    print("Error: papers_data.json not found. Please run the previous steps to generate it.")
except json.JSONDecodeError as e:
    print(f"Error decoding JSON from papers_data.json: {e}")

for paper in papers:
    title = paper.get("title")
    doi_url = paper.get("artifact_url")
    if doi_url:
        find_and_download_pdf(doi_url, title)


❌ No PDF found for https://github.com/wangdeze18/Multilingual-Adapter-for-SE
❌ No PDF found for https://github.com/ZJU-CTAG/CCRep
❌ No PDF found for https://github.com/ReliableCoding/REPEAT
❌ No PDF found for https://github.com/CGCL-codes/JOpFuzzer
❌ No PDF found for https://github.com/lochnagarr/JITFuzz
❌ Failed to download PDF from: https://github.com/CGCL-codes/HistFuzz/blob/main/._ICSE_23_HistFuzz.pdf
❌ No PDF found for https://github.com/youhanmo/DRFuzz
❌ No PDF found for https://github.com/TypeOracle/TypeOracleSrc
❌ Error for https://ﬁgshare.com/articles/software/Reproduction_Package_for_Data_Quality_for_Software_Vulnerability_Datasets/20499924: Failed to parse: https://ﬁgshare.com/articles/software/Reproduction_Package_for_Data_Quality_for_Software_Vulnerability_Datasets/20499924
❌ No PDF found for https://github.com/gems-uff/refactoring-merge
❌ No PDF found for https://github.com/MOB2022/MOB-dataset
❌ No PDF found for https://figshare.com/articles/conference_contribution/Debugg

In [49]:
# prompt: How to import fitz

!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.1


In [50]:
# prompt: Use the pdf file to make a score card that looks at Paper Availability	Is the full paper accessible (open access or behind paywall)?	Open access scores higher; paywalled or unavailable scores lower
# # # Availability of Code and Software	Is the source code provided, accessible, and licensed? Is it version-controlled (e.g., GitHub)?	Presence of public, well-documented, versioned code repository scores higher
# # # Availability of Datasets	Are datasets used in the paper publicly available with clear access instructions?	Public, well-documented datasets score higher; proprietary or unavailable datasets score lower
# # # Computer Requirements	Are the hardware and software requirements clearly specified and reasonable?	Clear, reasonable specs (OS, memory, CPU) score higher; vague or unrealistic specs score lower
# # # GPU Requirements	Are GPU or specialized hardware requirements stated?	Explicit GPU info scores higher; missing or unclear info scores lower
# # # Documentation Quality	Quality and completeness of README, installation guides, usage instructions, API docs	Detailed, clear, and comprehensive docs score higher
# # # Ease of Setup	How easy is it to set up and run the code? Are dependencies and environment management handled?	Use of containers (Docker), environment files, CI/CD pipelines improve score
# # # Reproducibility of Results	Can the results be reproduced using the provided code, data, and instructions?	Verified reproducibility or artifact badges score highest; no verification scores lowest
# # # Overall Rating	Aggregate score reflecting the above criteria	Weighted sum or qualitative rating (Excellent, Good, Fair, Poor)

from bs4 import BeautifulSoup
import os
import json
import fitz  # PyMuPDF
import re

def analyze_pdf_for_scorecard(pdf_path):
    scorecard = {
        "Paper Availability": {"score": 0, "notes": ""},
        "Availability of Code and Software": {"score": 0, "notes": ""},
        "Availability of Datasets": {"score": 0, "notes": ""},
        "Computer Requirements": {"score": 0, "notes": ""},
        "GPU Requirements": {"score": 0, "notes": ""},
        "Documentation Quality": {"score": 0, "notes": ""},
        "Ease of Setup": {"score": 0, "notes": ""},
        "Reproducibility of Results": {"score": 0, "notes": ""},
        "Overall Rating": {"score": "N/A", "notes": "Manual review required"}
    }

    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page_num in range(doc.page_count):
            text += doc.load_page(page_num).get_text()
        doc.close()

        # Analyze Paper Availability (already downloaded, implies availability)
        scorecard["Paper Availability"]["score"] = 1 # Available
        scorecard["Paper Availability"]["notes"] = "PDF was successfully downloaded."
        # Could add logic here to check if it's likely behind a paywall based on content/source if needed

        # Analyze Availability of Code and Software
        # Look for keywords and URLs indicating code availability
        code_keywords = ["github.com", "gitlab.com", "bitbucket.org", "source code", "repository", "code base", "implementation details"]
        found_code_url = False
        for keyword in code_keywords:
            if keyword in text.lower():
                scorecard["Availability of Code and Software"]["score"] = 1 # Found keywords
                scorecard["Availability of Code and Software"]["notes"] = f"Found keyword: {keyword}"
                # Further check for specific URLs
                urls = re.findall(r'https?://(?:github|gitlab|bitbucket)\.com/[\w.-]+/[\w.-]+', text, re.IGNORECASE)
                if urls:
                     scorecard["Availability of Code and Software"]["score"] = 2 # Found specific URL
                     scorecard["Availability of Code and Software"]["notes"] = f"Found potential repository URL(s): {', '.join(urls[:2])}"
                     found_code_url = True
                break
        if not found_code_url and scorecard["Availability of Code and Software"]["score"] == 0:
             scorecard["Availability of Code and Software"]["notes"] = "No obvious code availability keywords/URLs found."


        # Analyze Availability of Datasets
        dataset_keywords = ["dataset", "data set", "publicly available data", "download data", "figshare", "zenodo"]
        found_dataset_url = False
        for keyword in dataset_keywords:
            if keyword in text.lower():
                scorecard["Availability of Datasets"]["score"] = 1 # Found keywords
                scorecard["Availability of Datasets"]["notes"] = f"Found keyword: {keyword}"
                # Could add more specific URL checks for data repositories
                urls = re.findall(r'https?://(?:figshare|zenodo)\.com/[\w./-]+', text, re.IGNORECASE)
                if urls:
                     scorecard["Availability of Datasets"]["score"] = 2 # Found specific URL
                     scorecard["Availability of Datasets"]["notes"] = f"Found potential dataset URL(s): {', '.join(urls[:2])}"
                     found_dataset_url = True
                break
        if not found_dataset_url and scorecard["Availability of Datasets"]["Datsets"] == 0:
            scorecard["Availability of Datasets"]["notes"] = "No obvious dataset availability keywords/URLs found."


        # Analyze Computer Requirements
        requirements_keywords = ["requirements", "operating system", "OS", "memory", "RAM", "CPU", "processor"]
        for keyword in requirements_keywords:
            if keyword in text:
                scorecard["Computer Requirements"]["score"] = 1 # Found some requirement info
                scorecard["Computer Requirements"]["notes"] = f"Found keyword: {keyword}. Manual check needed for clarity and reasonableness."
                break
        if scorecard["Computer Requirements"]["score"] == 0:
             scorecard["Computer Requirements"]["notes"] = "No obvious computer requirement keywords found."


        # Analyze GPU Requirements
        gpu_keywords = ["GPU", "graphics card", "CUDA", "cuDNN", "nVidia", "RTX", "Titan", "A100", "V100"]
        for keyword in gpu_keywords:
            if keyword in text:
                scorecard["GPU Requirements"]["score"] = 1 # Found GPU info
                scorecard["GPU Requirements"]["notes"] = f"Found keyword: {keyword}. Manual check needed for specific requirements."
                break
        if scorecard["GPU Requirements"]["score"] == 0:
             scorecard["GPU Requirements"]["notes"] = "No obvious GPU requirement keywords found."

        # Documentation Quality, Ease of Setup, Reproducibility of Results require deeper analysis
        # This usually involves cloning the repo, checking documentation files (README, INSTALL),
        # attempting to run the code, and checking for reproducibility badges/claims.
        # Automating this from just the PDF text is very limited.
        scorecard["Documentation Quality"]["notes"] = "Requires analysis of associated code repository documentation."
        scorecard["Ease of Setup"]["notes"] = "Requires attempting to set up and run the code."
        scorecard["Reproducibility of Results"]["notes"] = "Requires attempting to reproduce results and checking for badges/claims."


    except fitz.FileDataError:
        scorecard["Paper Availability"]["score"] = -1 # Not accessible
        scorecard["Paper Availability"]["notes"] = "Could not open the downloaded PDF file."
        print(f"Error opening PDF file: {pdf_path}")
    except Exception as e:
        print(f"An error occurred while analyzing {pdf_path}: {e}")

    return scorecard

# --- Main script to process downloaded PDFs and generate scorecards ---

if __name__ == "__main__":
    pdf_directory = 'pdf'
    scorecards = {}

    if not os.path.exists(pdf_directory):
        print(f"PDF directory '{pdf_directory}' not found. Please run the download step first.")
    else:
        pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith('.pdf')]
        if not pdf_files:
            print(f"No PDF files found in '{pdf_directory}'. Please ensure downloads were successful.")
        else:
            print(f"Found {len(pdf_files)} PDF files to analyze.")
            for pdf_file in pdf_files:
                pdf_path = os.path.join(pdf_directory, pdf_file)
                print(f"\nAnalyzing: {pdf_file}")
                scorecard = analyze_pdf_for_scorecard(pdf_path)
                scorecards[pdf_file] = scorecard
                # Print a summary of the automated scores
                print("--- Automated Scorecard Summary ---")
                for criteria, details in scorecard.items():
                    print(f"{criteria}: Score={details['score']}, Notes: {details['notes']}")

            # Optionally, save the scorecards to a JSON file
            with open("scorecards.json", "w", encoding="utf-8") as f:
                json.dump(scorecards, f, indent=4)
            print("\nSaved scorecards to scorecards.json")

            # You can now manually review scorecards.json and perform deeper analysis
            # for criteria like Documentation Quality, Ease of Setup, and Reproducibility.
            # The current automated analysis from PDF text alone is limited.



Found 20 PDF files to analyze.

Analyzing: Learning_Deep_Semantics_for_Test_Completion.pdf
An error occurred while analyzing pdf/Learning_Deep_Semantics_for_Test_Completion.pdf: 'Datsets'
--- Automated Scorecard Summary ---
Paper Availability: Score=1, Notes: PDF was successfully downloaded.
Availability of Code and Software: Score=2, Notes: Found potential repository URL(s): https://github.com/EngineeringSoftware/teco., https://github.com/javaparser/javaparser.
Availability of Datasets: Score=1, Notes: Found keyword: dataset
Computer Requirements: Score=0, Notes: 
GPU Requirements: Score=0, Notes: 
Documentation Quality: Score=0, Notes: 
Ease of Setup: Score=0, Notes: 
Reproducibility of Results: Score=0, Notes: 
Overall Rating: Score=N/A, Notes: Manual review required

Analyzing: Impact_of_Code_Language_Models_on_Automated_Program_Repair.pdf
An error occurred while analyzing pdf/Impact_of_Code_Language_Models_on_Automated_Program_Repair.pdf: 'Datsets'
--- Automated Scorecard Summary 

In [51]:
import requests
from bs4 import BeautifulSoup
import json
import re
import os
from urllib.parse import urljoin, urlparse

def scrape_papers_from_sorter(url):
    """Scrape the JavaScript array of paper metadata from the Sorter.html page."""
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'html.parser')
    script_tag = soup.find('script', string=re.compile(r'const objects = \['))
    if not script_tag:
        print("❌ Could not find the script tag with paper data.")
        return []
    match = re.search(r'const objects = (\[.*?\]);', script_tag.string, re.DOTALL)
    if not match:
        print("❌ Could not extract the objects array.")
        return []
    try:
        papers = json.loads(match.group(1))
        return papers
    except Exception as e:
        print(f"❌ JSON decode error: {e}")
        return []

def find_and_download_pdf(doi_url, title):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        print(f"🔍 Fetching: {doi_url}")
        resp = requests.get(doi_url, headers=headers, allow_redirects=True, timeout=20)
        resp.raise_for_status()
        # If the response is a PDF, save it directly
        if resp.url.lower().endswith('.pdf') or 'application/pdf' in resp.headers.get('content-type', '').lower():
            pdf_url = resp.url
        else:
            # Parse HTML for PDF links
            soup = BeautifulSoup(resp.text, 'html.parser')
            pdf_url = None
            # Try meta tag (common in Zenodo, arXiv)
            meta_pdf = soup.find('meta', {'name': 'citation_pdf_url'})
            if meta_pdf and meta_pdf.get('content'):
                pdf_url = meta_pdf['content']
            # Try <a> links
            if not pdf_url:
                for a_tag in soup.find_all('a', href=True):
                    href = a_tag['href']
                    if '.pdf' in href.lower():
                        pdf_url = urljoin(resp.url, href)
                        break
        if not pdf_url:
            print(f"❌ No PDF found for {doi_url}")
            return None
        # Download the PDF
        pdf_resp = requests.get(pdf_url, headers=headers, timeout=30)
        pdf_resp.raise_for_status()
        if 'application/pdf' not in pdf_resp.headers.get('content-type', '').lower():
            print(f"⚠️ Not a PDF at {pdf_url}. Content-Type: {pdf_resp.headers.get('content-type')}")
            return None
        os.makedirs('pdf', exist_ok=True)
        safe_title = re.sub(r'[^a-zA-Z0-9_\- ]', '', title).replace(' ', '_')[:100].strip()
        if not safe_title:
            safe_title = "downloaded_paper"
        pdf_path = os.path.join('pdf', f"{safe_title}.pdf")
        with open(pdf_path, 'wb') as f:
            f.write(pdf_resp.content)
        print(f"✅ Downloaded: {pdf_path}")
        return pdf_path
    except Exception as e:
        print(f"❌ Error for {doi_url}: {e}")
        return None

def main():
    sorter_url = "https://snielebock.github.io/mrpua/Sorter.html"
    papers = scrape_papers_from_sorter(sorter_url)
    if not papers:
        print("No papers found.")
        return
    print(f"Found {len(papers)} papers. Attempting to download PDFs...")
    downloaded_count = 0
    for paper in papers:
        title = paper.get("title")
        # Prefer DOIURL, then ArtifactURL, then ArtifactURL2
        url = paper.get("doi_url") or paper.get("artifact_url")
        if not url:
            print(f"❌ No valid URL for paper: {title}")
            continue
        pdf_path = find_and_download_pdf(doi_url, title)
        if pdf_path:
            downloaded_count += 1
    print(f"\n--- Download Summary ---")
    print(f"Successfully downloaded {downloaded_count} PDFs to the 'pdf' directory.")


if __name__ == "__main__":
    main()


Found 189 papers. Attempting to download PDFs...
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for paper: None
❌ No valid URL for pa

In [31]:
# prompt: prompt: make a json file score card that grades the pdf based on Paper Availability	Is the full paper accessible (open access or behind paywall)?	Open access scores higher; paywalled or unavailable scores lower
# # Availability of Code and Software	Is the source code provided, accessible, and licensed? Is it version-controlled (e.g., GitHub)?	Presence of public, well-documented, versioned code repository scores higher
# # Availability of Datasets	Are datasets used in the paper publicly available with clear access instructions?	Public, well-documented datasets score higher; proprietary or unavailable datasets score lower
# # Computer Requirements	Are the hardware and software requirements clearly specified and reasonable?	Clear, reasonable specs (OS, memory, CPU) score higher; vague or unrealistic specs score lower
# # GPU Requirements	Are GPU or specialized hardware requirements stated?	Explicit GPU info scores higher; missing or unclear info scores lower
# # Documentation Quality	Quality and completeness of README, installation guides, usage instructions, API docs	Detailed, clear, and comprehensive docs score higher
# # Ease of Setup	How easy is it to set up and run the code? Are dependencies and environment management handled?	Use of containers (Docker), environment files, CI/CD pipelines improve score
# # Reproducibility of Results	Can the results be reproduced using the provided code, data, and instructions?	Verified reproducibility or artifact badges score highest; no verification scores lowest
# # Overall Rating	Aggregate score reflecting the above criteria	Weighted sum or qualitative rating (Excellent, Good, Fair, Poor)

# This section focuses on creating the JSON file from the existing scorecard_data DataFrame
# Assumes scorecard_df has been created in the preceding code block

if 'scorecard_df' in locals() and not scorecard_df:
    # Convert the DataFrame to a list of dictionaries (JSON format)
    scorecard_json_data = scorecard_df.to_dict(orient='records')

    # Define the output JSON file name
    scorecard_json_file = "scorecard.json"

    # Write the data to the JSON file
    try:
        with open(scorecard_json_file, 'w', encoding='utf-8') as f:
            json.dump(scorecard_json_data, f, indent=4)
        print(f"\nScorecard data saved to {scorecard_json_file}")
    except IOError as e:
        print(f"Error writing to {scorecard_json_file}: {e}")
else:
    print("\nScorecard DataFrame is not available or is empty. Cannot create scorecard.json.")




Scorecard DataFrame is not available or is empty. Cannot create scorecard.json.
