In [20]:
import requests
import os
import base64

# GitHub API token, read from the GITHUB_API environment variable.
# os.environ[...] raises KeyError when this cell runs if the variable is unset.
GITHUB_TOKEN = os.environ["GITHUB_API"]

In [28]:
# Check for setup files

# Maps a language label to the build / dependency file names that
# find_setup_files compares (case-insensitively) against repository entries.
SETUP_FILES = {
    "Python": ["requirements.txt", "setup.py"],
    "C/C++": ["CMakeLists.txt", "Makefile"],
    "Java": ["pom.xml", "build.gradle"],
    "R": ["DESCRIPTION", "NAMESPACE"],
    "Julia": ["Project.toml", "Manifest.toml"]
}

# Same shape as SETUP_FILES, but for container files; passed as the
# `setup_files` argument of find_setup_files.
DOCKER_FILES = {
    "Docker": ["Dockerfile", "docker-compose.yml"]
}

# GitHub API headers: token authentication plus the v3 JSON media type.
# Sent by get_repo_files with its request.
HEADERS = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

def get_repo_files(url):
    """Fetch the top-level file listing of a GitHub repository.

    Parameters:
        url (str): Repository URL of the form
            ``https://github.com/<owner>/<repo>[/...]``. A trailing ``.git``
            on the repository name is dropped.

    Returns:
        list: The decoded JSON body of the GitHub "contents" endpoint on
        HTTP 200. An empty list if owner/repo cannot be extracted from the
        URL or the API answers with any other status (the reason is printed).
    """
    # Owner and repo are the 4th and 5th "/"-separated fields of a full URL.
    # Guard the indexing so one malformed link does not abort a whole batch.
    parts = url.split("/")
    owner = parts[3] if len(parts) > 3 else ""
    repo = parts[4] if len(parts) > 4 else ""
    if repo.endswith(".git"):
        repo = repo[:-len(".git")]
    if not owner or not repo:
        print(f"Error: cannot parse owner/repo from URL: {url}")
        return []

    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents"
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(api_url, headers=HEADERS, timeout=30)
    if response.status_code == 200:
        return response.json()

    # Error bodies are normally a JSON object with a "message" key, but an
    # intermediary may answer with HTML or plain text, so do not assume JSON.
    try:
        message = response.json().get("message", "No message")
    except (ValueError, AttributeError):
        message = "No message"
    print(f"Error: {response.status_code}, {message}")
    return []

# To look for container files instead, pass setup_files=DOCKER_FILES.
def find_setup_files(files, setup_files=SETUP_FILES):
    """Report whether any known setup file appears in a repository listing.

    Parameters:
        files (list[dict]): Directory entries, each carrying a "name" key
            (the shape returned by get_repo_files). May be empty.
        setup_files (dict[str, list[str]]): Maps a label to the file names
            to look for. Defaults to SETUP_FILES.

    Returns:
        bool: True if at least one entry's name equals one of the wanted
        file names, compared case-insensitively; False otherwise.
    """
    # Flatten every wanted name into one lower-cased set so each repository
    # entry needs a single membership test.
    wanted = {
        filename.lower()
        for filenames in setup_files.values()
        for filename in filenames
    }
    return any(entry.get("name", "").lower() in wanted for entry in files)

In [22]:
# check for setup in readme

import re

def check_keywords(readme_text, keywords):
    """Find which keywords occur as whole terms in a README.

    Parameters:
        readme_text (str): README contents to search.
        keywords (iterable of str): Terms to look for. Each one is inserted
            into a regular expression as-is, so callers must already have
            escaped any regex metacharacters (for example the plus signs
            in "g++").

    Returns:
        dict: Maps every keyword that was found to the text of its first
        case-insensitive match. Keywords that do not occur are omitted.
    """
    # NOTE: a function with this same name is defined again in a later cell;
    # whichever cell ran last is the one callers actually use.
    found = {}
    for keyword in keywords:
        # Lookarounds rather than \b...\b: \b needs a word character on one
        # side, so a keyword ending in punctuation (g++) could never match
        # before a space or at end of text. For keywords that start and end
        # with word characters the two forms are equivalent.
        pattern = rf"(?<!\w)(?:{keyword})(?!\w)"
        match = re.search(pattern, readme_text, re.IGNORECASE)
        if match:
            found[keyword] = match.group(0)
    return found

def enviroment_setup(readme):
    """
    Check if readme includes how to setup the environment / dependencies

    Parameters:
        readme (str): README text; may be empty or None.

    Returns:
        bool: True if check_keywords finds at least one setup-related
        keyword, False otherwise (including for an empty or missing README).
    """
    # Return a bool on every path; an empty README simply has no setup info.
    if not readme:
        return False

    # Every entry is used by check_keywords as a regex fragment, so regex
    # metacharacters are escaped and those entries are raw strings.
    keywords = [
        # General setup terms
        "requirements", "dependencies", "environment", "install", "setup",
        "virtualenv", "pip", "docker", "build", "configuration",
        r"environment\.yaml", r"requirements\.txt",

        # MATLAB-specific terms
        "MATLAB", "matlabpath", "toolbox", "mex", "matlab script", "matlab install",

        # C/C++-specific terms
        "makefile", "gcc", r"g\+\+", "cmake", "make", "compile", "C compiler", r"C\+\+ compiler",

        # Java-specific terms
        "JDK", "Maven", "Gradle", "java -jar", "javac", "java version", "Java SDK",

        # R-specific terms
        "R package", "CRAN", r"install\.packages", "Rscript", "R environment", "R version"
    ]

    return bool(check_keywords(readme, keywords))


In [31]:
import re

def get_readme_from_github(url):
    """Download and decode a repository's README through the GitHub API.

    Parameters:
        url (str): Repository URL of the form
            ``https://github.com/<owner>/<repo>[/...]``. A trailing ``.git``
            on the repository name is dropped.

    Returns:
        str: The README text. An empty string if owner/repo cannot be
        extracted from the URL or the API does not answer HTTP 200 (the
        reason is printed).
    """
    # Owner and repo are the 4th and 5th "/"-separated fields of a full URL.
    # Guard the indexing so one malformed link does not abort a whole batch.
    parts = url.split("/")
    owner = parts[3] if len(parts) > 3 else ""
    repo = parts[4] if len(parts) > 4 else ""
    if repo.endswith(".git"):
        repo = repo[:-len(".git")]
    if not owner or not repo:
        print(f"Error: cannot parse owner/repo from URL: {url}")
        return ""

    # GitHub API URL for the repository README
    api_url = f"https://api.github.com/repos/{owner}/{repo}/readme"
    headers = {
        "Accept": "application/vnd.github.v3+json",
        "Authorization": f"token {GITHUB_TOKEN}",
    }

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(api_url, headers=headers, timeout=30)

    if response.status_code != 200:
        print(f"Error: Unable to fetch README (status code: {response.status_code}) " + api_url)
        return ""

    readme_content_base64 = response.json().get("content", "")
    # The payload is base64-encoded. Decode leniently so a README that is not
    # valid UTF-8 yields replacement characters instead of raising.
    return base64.b64decode(readme_content_base64).decode("utf-8", errors="replace")

def check_keywords(readme_text, keywords):
    """Find which keywords occur as whole terms in a README.

    Parameters:
        readme_text (str): README contents to search.
        keywords (iterable of str): Terms to look for. Each one is inserted
            into a regular expression as-is, so callers must already have
            escaped any regex metacharacters (for example the plus signs
            in "g++").

    Returns:
        dict: Maps every keyword that was found to the text of its first
        case-insensitive match. Keywords that do not occur are omitted.
    """
    # NOTE: this redefines the check_keywords from the earlier README-setup
    # cell and shadows it once this cell has run; keep the two in sync or
    # delete one of them.
    found = {}
    for keyword in keywords:
        # Lookarounds rather than \b...\b: \b needs a word character on one
        # side, so a keyword ending in punctuation (g++) could never match
        # before a space or at end of text. For keywords that start and end
        # with word characters the two forms are equivalent.
        pattern = rf"(?<!\w)(?:{keyword})(?!\w)"
        match = re.search(pattern, readme_text, re.IGNORECASE)
        if match:
            found[keyword] = match.group(0)
    return found


def shell_commands(readme):
    """
    Check if readme includes commands that can be run to reproduce data

    Parameters:
        readme (str): README text; may be empty or None.

    Returns:
        bool: True if check_keywords finds at least one shell-related
        keyword, False otherwise (including for an empty or missing README).
    """
    # Return a bool on every path; an empty README has no instructions.
    if not readme:
        return False

    keywords = [
        "command line", "terminal", "CLI", "script", "bash", "shell"
    ]

    return bool(check_keywords(readme, keywords))

def notebook_commands(readme):
    """
    Check if readme includes a notebook that can be run to reproduce data

    Parameters:
        readme (str): README text; may be empty or None.

    Returns:
        bool: True if check_keywords finds at least one notebook-related
        keyword, False otherwise (including for an empty or missing README).
    """
    # Return a bool on every path; an empty README mentions no notebook.
    if not readme:
        return False

    keywords = [
        "notebook", "ipynb", "notebooks"
    ]

    return bool(check_keywords(readme, keywords))

def contains_runnable_code(readme_content):
    """
    Tell whether a README names at least one source-code or notebook file.

    A file mention is a run of word characters and/or hyphens, a dot, and one
    of the extensions py, c, cpp, java, r, m, jl or ipynb, ending on a word
    boundary. Matching is case-insensitive.

    Parameters:
        readme_content (str): The content of the README file as a string.

    Returns:
        bool: True if any such file name occurs, False otherwise.
    """
    runnable_file_regex = r'\b[\w-]+\.(py|c|cpp|java|r|m|jl|ipynb)\b'

    # One hit is enough, so stop at the first match instead of collecting all.
    first_hit = re.search(runnable_file_regex, readme_content, re.IGNORECASE)
    return first_hit is not None

In [39]:
import json

def get_title_and_links(json_file_path):
    """Load titles and their GitHub URLs from a JSON file.

    Parameters:
        json_file_path (str or path-like): Path to a JSON file whose top
            level is an iterable of objects.

    Returns:
        list[dict]: One ``{"title": ..., "github_urls": ...}`` dict per entry
        that has both keys, in file order. All other keys are dropped;
        entries that lack either key or are not JSON objects are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read as UTF-8 explicitly rather than relying on the platform's locale
    # encoding, which would garble or reject non-ASCII titles on some systems.
    with open(json_file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # The file handle is no longer needed once the JSON has been parsed.
    return [
        {"title": entry["title"], "github_urls": entry["github_urls"]}
        for entry in data
        if isinstance(entry, dict) and "title" in entry and "github_urls" in entry
    ]

In [40]:
# Location of the input JSON. The default is the original author's local
# path; set the GITHUB_DATA_JSON environment variable to run the notebook on
# another machine without editing this cell.
GITHUB_DATA_JSON = os.environ.get(
    "GITHUB_DATA_JSON",
    "/Users/ruth/Downloads/rr-measure-basic/P1-v1/github_data.json",
)

github_info = get_title_and_links(GITHUB_DATA_JSON)

In [41]:
import pandas as pd

# Output schema: the title first, then one column per collected value.
columns = [
    "Title",
    "Github_urls",
    "setup_files",
    "docker_files",
    "setup_info_in_readme",
    "shell_instructions_in_readme",
    "notebook_instructions_in_readme",
    "code_instructions_in_readme",
]

# Start from an empty frame and pin every column after "Title" to object
# dtype in a single astype call.
object_dtypes = {name: object for name in columns[1:]}
df = pd.DataFrame(columns=columns).astype(object_dtypes)

In [47]:
def _run_repo_checks(link):
    """Run every check against one repository URL.

    Returns a 6-tuple ordered like the last six entries of `columns`:
    setup files, docker files, setup info, shell instructions, notebook
    instructions, code instructions. If neither a README nor a file listing
    could be fetched, every slot holds the string "Repo not found".
    """
    readme = get_readme_from_github(link)
    files = get_repo_files(link)
    if not (readme or files):
        return ("Repo not found",) * 6
    return (
        find_setup_files(files),
        find_setup_files(files, setup_files=DOCKER_FILES),
        enviroment_setup(readme),
        shell_commands(readme),
        notebook_commands(readme),
        contains_runnable_code(readme),
    )


rows = []
for item in github_info:
    links = item["github_urls"]
    per_repo = [_run_repo_checks(link) for link in links]
    # Transpose repo-major results into one list per check. With no links
    # zip() yields nothing, so fall back to six empty lists.
    per_check = [list(values) for values in zip(*per_repo)] or [[] for _ in range(6)]
    rows.append([item["title"], links, *per_check])

# Build the frame once from all rows instead of appending to the existing
# `df` one row at a time: re-running this cell then replaces the table
# rather than duplicating every row.
df = pd.DataFrame(rows, columns=columns)

Error: Unable to fetch README (status code: 404) https://api.github.com/repos/mahi045/JMetal4/readme
Error: 404, Not Found
Error: Unable to fetch README (status code: 404) https://api.github.com/repos/Moccino17/Transport_Mode_Sklearn/readme
Error: 404, Not Found
Error: Unable to fetch README (status code: 404) https://api.github.com/repos/JusteRaimbault/EnergyPriceAt/readme
Error: 404, Not Found
Error: Unable to fetch README (status code: 404) https://api.github.com/repos/jimioke/mitei-prototype-cities/readme
Error: 404, Not Found
Error: Unable to fetch README (status code: 404) https://api.github.com/repos/giovanni-cal/future-transit/readme
Error: Unable to fetch README (status code: 404) https://api.github.com/repos/Sangen-Hu/Bus-stop-Simulation/readme
Error: Unable to fetch README (status code: 404) https://api.github.com/repos/AliMaurenne/Microeconomic-model-of-ridesourcing-/readme
Error: Unable to fetch README (status code: 404) https://api.github.com/repos/yudiaspen/sensor-bias-e

In [48]:
# Write the results table to a CSV file, using a path relative to the current
# working directory. index=False is not passed, so the row index is written
# as well.
df.to_csv("readme_setup_checks.csv")