In [1]:
import requests

def fetch_html(url, output_file, timeout=30):
    """Download a web page and save its text to a local file.

    Args:
        url: Address of the page to download.
        output_file: Path of the file the response text is written to
            (UTF-8, overwritten if it already exists).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        None. Success or failure is reported on stdout; request errors
        (connection problems, timeouts, HTTP 4xx/5xx) are printed, not raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(response.text)
        print(f"Successfully fetched HTML and saved to {output_file}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")

# Source page and the local file its HTML is saved to.
website_url = "https://snielebock.github.io/mrpua/Sorter.html"
output_html_file = "page_content.html"
fetch_html(url=website_url, output_file=output_html_file)

Successfully fetched HTML and saved to page_content.html


In [2]:
import re
import json

def extract_papers_from_html(file_path):
    """Extract paper records from the ``objects`` array embedded in a saved page.

    The page is expected to contain a JavaScript statement of the form
    ``const objects = <JSON value>``. The value is decoded in place, starting
    right after the ``=``, so extraction does not depend on whatever code or
    comment follows the array in the page.

    Args:
        file_path: Path to the saved HTML file (read as UTF-8).

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``artifact_url`` and
        ``doi_url``, taken from each item's ``pID``, ``Title``,
        ``ArtifactURL`` and ``DOIURL`` (a missing field becomes None).
        The list is empty when the array cannot be found or decoded; the
        reason is printed to stdout.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        html_content = f.read()

    # Find where the array literal starts; whitespace around '=' may vary.
    match = re.search(r"\bconst\s+objects\s*=\s*", html_content)
    if not match:
        print("Could not find the 'objects' JavaScript array in the HTML file.")
        return []

    try:
        # raw_decode parses exactly one JSON value beginning at the given
        # index and ignores everything after it (the trailing ';', other JS).
        data, _ = json.JSONDecoder().raw_decode(html_content, match.end())
    except json.JSONDecodeError as e:
        print(f"JSON decoding error: {e}")
        # Show the text around the failure position to help debugging.
        start = max(0, e.pos - 50)
        end = e.pos + 50
        print(f"Problematic string part: ...{html_content[start:end]}...")
        return []

    return [
        {
            'id': item.get('pID'),
            'title': item.get('Title'),
            'artifact_url': item.get('ArtifactURL'),
            'doi_url': item.get('DOIURL'),
        }
        for item in data
    ]

if __name__ == "__main__":
    # Parse the saved page and, when anything was found, persist the records
    # as JSON so the later cells can load them.
    html_file = "page_content.html"
    papers = extract_papers_from_html(html_file)
    if not papers:
        print("Failed to extract papers.")
    else:
        print(f"Successfully extracted {len(papers)} papers.")
        with open("papers_data.json", "w", encoding="utf-8") as out_file:
            json.dump(papers, out_file, indent=4)
        print("Saved paper data to papers_data.json")

Successfully extracted 189 papers.
Saved paper data to papers_data.json


In [3]:
import json
import re

def score_paper(paper):
    """Assign a URL-based reproducibility score to one paper record.

    Only the artifact URL(s) are inspected; every criterion that cannot be
    decided from a URL is recorded as a note asking for a manual check.

    Points awarded:
        * GitHub URL: 5 (code availability) + 5 (version control)
          + 5 (issue tracking).
        * Zenodo / Figshare / DOI URL: 10 (public dataset links), plus 2 for
          code availability when no GitHub URL is present.

    Args:
        paper: Dict describing a paper. ``artifact_url`` and the optional
            ``artifact_url2`` are read; a missing key or a None value is
            treated as "no URL".

    Returns:
        ``{"score": int, "notes": list[str]}`` with one note per scorecard
        criterion, in scorecard order.
    """
    score = 0
    notes = []

    # A key that is present but null must behave like a missing URL:
    # ``paper.get(key, "")`` would return None and ``.lower()`` would raise.
    artifact_url = (paper.get("artifact_url") or "").lower()
    artifact_url2 = (paper.get("artifact_url2") or "").lower()  # optional second artifact URL

    # Helper to check if any of the URLs contain a keyword
    def url_contains(keyword):
        return keyword in artifact_url or keyword in artifact_url2

    on_github = url_contains("github.com")
    on_data_repo = (
        url_contains("zenodo.org")
        or url_contains("figshare.com")
        or url_contains("doi.org")
    )

    # 1. Code & Environment (40 points)
    # 1.1 Code Availability & Quality
    if on_github:
        score += 5  # Open-source license (assumption: GitHub implies open-source)
        notes.append("Code available on GitHub (assumed open-source license).")
    elif on_data_repo:
        score += 2  # Data/artifact repository, might contain code or links to code
        notes.append("Artifact on data repository (potential for code).")
    else:
        notes.append("Code availability uncertain from URL.")

    # Docker/Containerization (10 points): Requires manual check
    notes.append("Docker/Containerization: Requires manual check of the repository.")

    # Dependency Management (10 points): Requires manual check
    notes.append("Dependency Management: Requires manual check of the repository.")

    # Build Instructions (10 points): Requires manual check
    notes.append("Build Instructions: Requires manual check of the repository README.")

    # Specialized Hardware Support (5 points): Requires manual check
    notes.append("Specialized Hardware Support: Requires manual check of the repository.")

    # 1.2 Reproducibility Testing
    # CI/CD Pipelines (5 points): Cannot be inferred from URL
    notes.append("CI/CD Pipelines: Cannot be inferred from URL. Requires manual check.")

    # Version Control (5 points): Assumed if on GitHub
    if on_github:
        score += 5
        notes.append("Version Control: Assumed via GitHub.")
    else:
        notes.append("Version Control: Uncertain from URL.")

    # 2. Documentation & Transparency (30 points)
    # 2.1 Artifact Documentation
    # Comprehensive README (10 points): Requires manual check
    notes.append("Comprehensive README: Requires manual check of the repository.")

    # API/Data Schema Docs (5 points): Requires manual check
    notes.append("API/Data Schema Docs: Requires manual check of the repository.")

    # Reproducibility Badge (5 points): Cannot be inferred from URL
    notes.append("Reproducibility Badge: Cannot be inferred from URL. Requires manual check.")

    # 2.2 Reproducibility Claims
    # Runtime Instructions (5 points): Requires manual check
    notes.append("Runtime Instructions: Requires manual check of the repository.")

    # Result Validation (5 points): Requires manual check
    notes.append("Result Validation: Requires manual check of the repository.")

    # 3. Data & Model Reuse (20 points)
    # 3.1 Data Accessibility
    # Public Dataset Links (10 points): If Zenodo/Figshare/DOI, assume data is accessible
    if on_data_repo:
        score += 10
        notes.append("Public Dataset Links: Data likely accessible via data repository.")
    else:
        notes.append("Public Dataset Links: Data accessibility uncertain from URL.")

    # Data Preprocessing (5 points): Requires manual check
    notes.append("Data Preprocessing: Requires manual check of the repository.")

    # Model Weights (5 points): Requires manual check
    notes.append("Model Weights: Requires manual check of the repository.")

    # 4. Community Engagement (10 points)
    # Issue Tracking (5 points): Assumed if on GitHub
    if on_github:
        score += 5
        notes.append("Issue Tracking: Assumed via GitHub.")
    else:
        notes.append("Issue Tracking: Uncertain from URL.")

    # Discussion Forum (5 points): Cannot be inferred from URL
    notes.append("Discussion Forum: Cannot be inferred from URL. Requires manual check.")

    return {"score": score, "notes": notes}

if __name__ == "__main__":
    # Read the extracted paper records; when the file is absent, report it
    # and continue with an empty list.
    try:
        with open("papers_data.json", "r", encoding="utf-8") as source_file:
            papers = json.load(source_file)
    except FileNotFoundError:
        print("Error: papers_data.json not found. Please run extract_papers.py first.")
        papers = []

    # Attach the score and notes to each record (the records are updated in place).
    scored_papers = []
    for paper in papers:
        evaluation = score_paper(paper)
        paper.update(
            reproducibility_score=evaluation["score"],
            reproducibility_notes=evaluation["notes"],
        )
        scored_papers.append(paper)

    # Persist the scored records for the reporting cells.
    with open("scored_papers.json", "w", encoding="utf-8") as target_file:
        json.dump(scored_papers, target_file, indent=4)

    print(f"Evaluated {len(scored_papers)} papers. Results saved to scored_papers.json")

    # Show the five highest-scoring papers together with their notes.
    print("\nTop 5 Scored Papers (based on automated scoring):")
    sorted_papers = sorted(
        scored_papers,
        key=lambda entry: entry.get("reproducibility_score", 0),
        reverse=True,
    )
    for rank, paper in enumerate(sorted_papers[:5], start=1):
        print("Paper {}: {} (Score: {})".format(rank, paper.get("title"), paper.get("reproducibility_score")))
        for note in paper.get("reproducibility_notes", []):
            print(f"  - {note}")

Evaluated 189 papers. Results saved to scored_papers.json

Top 5 Scored Papers (based on automated scoring):
Paper 1: One Adapter for All Programming Languages?
Adapter Tuning for Code Search and Summarization (Score: 15)
  - Code available on GitHub (assumed open-source license).
  - Docker/Containerization: Requires manual check of the repository.
  - Dependency Management: Requires manual check of the repository.
  - Build Instructions: Requires manual check of the repository README.
  - Specialized Hardware Support: Requires manual check of the repository.
  - CI/CD Pipelines: Cannot be inferred from URL. Requires manual check.
  - Version Control: Assumed via GitHub.
  - Comprehensive README: Requires manual check of the repository.
  - API/Data Schema Docs: Requires manual check of the repository.
  - Reproducibility Badge: Cannot be inferred from URL. Requires manual check.
  - Runtime Instructions: Requires manual check of the repository.
  - Result Validation: Requires manual 

In [4]:
import json

def generate_report(input_file, output_file):
    """Render the scored papers as a Markdown report.

    The report has three sections: a table of all papers, per-paper
    evaluation notes, and the five highest-scoring papers.

    Args:
        input_file: Path to the JSON file holding the list of scored paper
            dicts (keys read: ``id``, ``title``, ``artifact_url``,
            ``doi_url``, ``reproducibility_score``, ``reproducibility_notes``).
        output_file: Path of the Markdown file to write (UTF-8).

    Returns:
        None. If ``input_file`` does not exist an error is printed and no
        report is written.

    Fields that are missing or null are rendered as ``N/A`` instead of
    raising (a null title used to fail on ``.replace`` and a null URL on
    string concatenation).
    """
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            papers = json.load(f)
    except FileNotFoundError:
        print(f"Error: {input_file} not found.")
        return

    def text(value):
        # Missing and null fields are both shown as "N/A".
        return "N/A" if value is None else str(value)

    def one_line(value):
        # Newlines would break table rows and headings.
        return text(value).replace("\n", " ")

    def score_one_decimal(value):
        # Numeric scores are shown with one decimal (15 -> "15.0"); anything
        # else falls back to plain text so a missing score reads "N/A".
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return "{:.1f}".format(value)
        return text(value)

    def sort_score(paper):
        # Non-numeric or missing scores sort as 0 instead of raising.
        value = paper.get("reproducibility_score")
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return value
        return 0

    # Collect fragments and join once instead of repeated string concatenation.
    parts = [
        "# Reproducibility Scorecard Report\n\n",
        "This report summarizes the reproducibility evaluation of papers scraped from the provided website, based on the defined scorecard criteria.\n\n",
        "## All Scored Papers\n\n",
        "| Paper ID | Title | Score | Artifact URL | DOI URL |\n",
        "|---|---|---|---|---|\n",
    ]
    for paper in papers:
        parts.append("| {} | {} | {} | {} | {} |\n".format(
            text(paper.get("id")),
            one_line(paper.get("title")),
            text(paper.get("reproducibility_score")),
            text(paper.get("artifact_url")),
            text(paper.get("doi_url")),
        ))

    parts.append("\n## Detailed Evaluation Notes\n\n")
    for paper in papers:
        parts.append("### Paper ID: {} - {}\n\n".format(text(paper.get("id")), one_line(paper.get("title"))))
        parts.append("**Score:** {}\n\n".format(text(paper.get("reproducibility_score"))))
        parts.append("**Notes:**\n")
        parts.extend(f"- {note}\n" for note in (paper.get("reproducibility_notes") or []))
        parts.append("\n")

    parts.append("\n## Top 5 Scored Papers (Automated Scoring)\n\n")
    sorted_papers = sorted(papers, key=sort_score, reverse=True)
    for i, paper in enumerate(sorted_papers[:5]):
        parts.append("### {}. {} (Score: {})\n\n".format(
            i + 1,
            one_line(paper.get("title")),
            score_one_decimal(paper.get("reproducibility_score")),
        ))
        parts.append("**Artifact URL:** " + text(paper.get("artifact_url")) + "\n")
        parts.append("**DOI URL:** " + text(paper.get("doi_url")) + "\n\n")
        parts.append("**Notes:**\n")
        parts.extend(f"- {note}\n" for note in (paper.get("reproducibility_notes") or []))
        parts.append("\n")

    with open(output_file, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    print(f"Report generated and saved to {output_file}")

if __name__ == "__main__":
    # Turn the scored paper records into the Markdown report.
    generate_report(
        input_file="scored_papers.json",
        output_file="reproducibility_report.md",
    )

Report generated and saved to reproducibility_report.md


In [5]:
import json

def view_all_paper_scores(file_path="scored_papers.json"):
    """Print every scored paper with its URLs, score and evaluation notes.

    Args:
        file_path: Path to the JSON file holding the list of scored paper
            dicts.

    Returns:
        None. If the file does not exist an error is printed instead.

    Fields that are missing or null are printed as ``N/A``; a null title
    previously raised AttributeError on ``.replace``.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            papers = json.load(f)
    except FileNotFoundError:
        print(f"Error: {file_path} not found. Please ensure the file exists.")
        return

    def text(value):
        # Missing and null fields are both shown as "N/A".
        return "N/A" if value is None else str(value)

    print("\n--- All Paper Scores ---")
    for paper in papers:
        # Titles may contain newlines; flatten them for single-line display.
        title = text(paper.get("title")).replace("\n", " ")

        print("\nPaper ID: {}".format(text(paper.get("id"))))
        print("Title: {}".format(title))
        print("Score: {}".format(text(paper.get("reproducibility_score"))))
        print("Artifact URL: {}".format(text(paper.get("artifact_url"))))
        print("DOI URL: {}".format(text(paper.get("doi_url"))))
        print("Notes:")
        for note in paper.get("reproducibility_notes") or []:
            print(f"  - {note}")
    print("\n------------------------")

if __name__ == "__main__":
    # 'scored_papers.json' must already be present in the working directory
    # (produced by the scoring cell or uploaded manually).
    view_all_paper_scores(file_path="scored_papers.json")


--- All Paper Scores ---

Paper ID: 13
Title: One Adapter for All Programming Languages? Adapter Tuning for Code Search and Summarization
Score: 15
Artifact URL: https://github.com/wangdeze18/Multilingual-Adapter-for-SE
DOI URL: https://doi.org/10.1109/ICSE48619.2023.00013
Notes:
  - Code available on GitHub (assumed open-source license).
  - Docker/Containerization: Requires manual check of the repository.
  - Dependency Management: Requires manual check of the repository.
  - Build Instructions: Requires manual check of the repository README.
  - Specialized Hardware Support: Requires manual check of the repository.
  - CI/CD Pipelines: Cannot be inferred from URL. Requires manual check.
  - Version Control: Assumed via GitHub.
  - Comprehensive README: Requires manual check of the repository.
  - API/Data Schema Docs: Requires manual check of the repository.
  - Reproducibility Badge: Cannot be inferred from URL. Requires manual check.
  - Runtime Instructions: Requires manual chec

In [6]:
import os
import requests

def check_file_exists(repo_path, filename):
    """Return True when ``filename`` is a regular file inside ``repo_path``."""
    candidate = os.path.join(repo_path, filename)
    return os.path.isfile(candidate)

def check_github_ci(repo_path):
    """Return True when ``repo_path`` contains a ``.github/workflows`` directory."""
    workflows_dir = os.path.join(repo_path, '.github', 'workflows')
    return os.path.isdir(workflows_dir)

def check_github_issues(repo_owner, repo_name, timeout=30):
    """Report whether a GitHub repository has its issue tracker enabled.

    Args:
        repo_owner: Account or organisation that owns the repository.
        repo_name: Repository name.
        timeout: Seconds to wait for the GitHub API before giving up; without
            it the request could block indefinitely.

    Returns:
        The ``has_issues`` value from the repository metadata, or False when
        the API answers with an error status, a non-JSON body, or metadata
        without that field.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    url = f"https://api.github.com/repos/{repo_owner}/{repo_name}"
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Unknown repository, rate limiting, etc.: nothing to report.
        return False
    try:
        data = response.json()
    except ValueError:
        # Body was not valid JSON.
        return False
    if not isinstance(data, dict):
        return False
    return data.get("has_issues", False)

# Example: inspect a local clone of a GitHub repository (placeholder path).
repo_path = "/path/to/cloned/repo"
for label, filename in (
    ("README:", "README.md"),
    ("Dockerfile:", "Dockerfile"),
    ("Dependencies:", "requirements.txt"),
):
    print(label, check_file_exists(repo_path, filename))
print("CI/CD:", check_github_ci(repo_path))
print("License:", check_file_exists(repo_path, "LICENSE"))


README: False
Dockerfile: False
Dependencies: False
CI/CD: False
License: False


In [7]:
import json
import requests
from urllib.parse import urlparse

# Files/folders whose presence is checked in each GitHub repository.
# Each entry is passed unchanged as the path argument of github_file_exists,
# i.e. it is looked up relative to the repository root.
CHECK_FILES = [
    "README.md",
    "Dockerfile",
    "requirements.txt",
    "environment.yml",
    "Pipfile",
    ".github/workflows",  # For CI/CD
    "LICENSE",
    "setup.py"
]

def get_github_repo_info(artifact_url):
    """Extract the owner and repository name from a GitHub URL.

    Only URLs whose host is exactly ``github.com`` (or ``www.github.com``,
    compared case-insensitively) are accepted; a substring test would also
    accept hosts such as ``notgithub.com`` or ``gist.github.com``. Extra path
    segments after the repository (``/tree/main/...``) are ignored and a
    trailing ``.git`` on the repository name is removed.

    Args:
        artifact_url: URL string to inspect. Non-string values (e.g. None)
            are treated as "not a GitHub URL".

    Returns:
        ``(owner, repo)`` on success, otherwise ``(None, None)``.
    """
    if not isinstance(artifact_url, str):
        return None, None
    try:
        parsed = urlparse(artifact_url.strip())
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse / hostname reject some malformed URLs.
        return None, None
    if host not in ("github.com", "www.github.com"):
        return None, None

    # Drop empty segments produced by leading, trailing or doubled slashes.
    parts = [segment for segment in parsed.path.split("/") if segment]
    if len(parts) < 2:
        return None, None

    owner, repo = parts[0], parts[1]
    if repo.endswith(".git"):
        repo = repo[:-len(".git")]
    if not repo:
        return None, None
    return owner, repo

def github_file_exists(owner, repo, path, timeout=30):
    """Check whether a file or folder exists in a GitHub repository.

    Uses the GitHub repository-contents API. GitHub rate-limits API requests
    (more strictly when unauthenticated), so a personal access token is sent
    when the ``GITHUB_TOKEN`` environment variable is set.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name.
        path: Path of the file or folder, relative to the repository root.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        True when the API answers 200 for the path, False otherwise. Any
        status other than 200 or 404 (for example a rate-limit 403/429) is
        reported with a printed warning, because in that case False means
        "could not be determined" rather than "the path is absent".

    Raises:
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    import os  # local import: this cell's import list does not include os

    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
    headers = {}
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"

    response = requests.get(api_url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return True
    if response.status_code != 404:
        print(
            f"Warning: GitHub API returned HTTP {response.status_code} for {api_url}; "
            f"treating '{path}' as not found."
        )
    return False

def check_github_files(artifact_url):
    """
    Look up every entry of CHECK_FILES in the GitHub repo behind artifact_url.
    Returns a dict {filename: exists (bool)}; the dict is empty when no
    owner/repo pair can be derived from the URL.
    """
    owner, repo = get_github_repo_info(artifact_url)
    if owner and repo:
        return {name: github_file_exists(owner, repo, name) for name in CHECK_FILES}
    return {}

if __name__ == "__main__":
    # Load paper data
    with open("papers_data.json", "r", encoding="utf-8") as f:
        papers = json.load(f)

    for paper in papers:
        # The URL may be stored as null; treat that like "no URL" so the
        # substring test below does not raise TypeError on None.
        artifact_url = paper.get("artifact_url") or ""
        if "github.com" in artifact_url:
            paper["github_file_check"] = check_github_files(artifact_url)
        else:
            # Non-GitHub artifacts get an empty result so every record has the key.
            paper["github_file_check"] = {}

    # Save results
    with open("papers_file_checked.json", "w", encoding="utf-8") as f:
        json.dump(papers, f, indent=2)

    print("Checked files for all GitHub repositories and saved to papers_file_checked.json")


Checked files for all GitHub repositories and saved to papers_file_checked.json


In [8]:
import json

# Load the per-repository file check results and print them paper by paper.
with open("papers_file_checked.json", "r", encoding="utf-8") as checked_file:
    papers = json.load(checked_file)

separator = "-" * 40
for paper in papers:
    print(f"Paper: {paper.get('title')}")
    file_checks = paper.get("github_file_check", {})
    for fname, exists in file_checks.items():
        status = 'Found' if exists else 'Not found'
        print(f"  {fname}: {status}")
    print(separator)

Paper: One Adapter for All Programming Languages?
Adapter Tuning for Code Search and Summarization
  README.md: Found
  Dockerfile: Not found
  requirements.txt: Not found
  environment.yml: Not found
  Pipfile: Not found
  .github/workflows: Not found
  LICENSE: Not found
  setup.py: Not found
----------------------------------------
Paper: CCRep: Learning Code Change Representations via Pre-Trained Code Model and Query Back
  README.md: Found
  Dockerfile: Not found
  requirements.txt: Found
  environment.yml: Not found
  Pipfile: Not found
  .github/workflows: Not found
  LICENSE: Found
  setup.py: Not found
----------------------------------------
Paper: Keeping Pace with Ever-Increasing Data: Towards
Continual Learning of Code Intelligence Models
  README.md: Found
  Dockerfile: Not found
  requirements.txt: Not found
  environment.yml: Not found
  Pipfile: Not found
  .github/workflows: Not found
  LICENSE: Not found
  setup.py: Not found
----------------------------------------
