In [22]:
import requests
import base64
import re
import os

In [None]:
def check_keywords(readme_text, keywords):
    """Check for presence of keywords in README."""

    keyword_matches = {keyword: re.search(rf"\b{keyword}\b", readme_text, re.IGNORECASE) 
                       for keyword in keywords}
    return {kw: match.group(0) for kw, match in keyword_matches.items() if match}

def readme_not_empty(readme):
    """
    Returns true if readme has more than 150 characters (since github default adds title)
    """
    return len(readme) > 150

def data_location(readme):
    """
    Check if readme states where the data can be found
    """

    keywords = [
        "data", "dataset", "data source", 
        "input data", "training data", "data link", "data directory", 
        "data location", "data path", "data folder", "data archive"
    ]

    if not readme:
        return {}
    
    matches = check_keywords(readme, keywords)
    return True if matches else False

def enviroment_setup(readme):
    """
    Check if readme includes how to setup the environment / dependencies
    """
    keywords = [
        # General setup terms
        "requirements", "dependencies", "environment", "install", "setup", 
        "virtualenv", "pip", "docker", "build", "configuration",
        "environment.yaml", "requirements.txt", 

        # MATLAB-specific terms
        "MATLAB", "matlabpath", "toolbox", "mex", "matlab script", "matlab install",

        # C/C++-specific terms
        "makefile", "gcc", "g\+\+", "cmake", "make", "compile", "C compiler", "C\+\+ compiler",

        # Java-specific terms
        "JDK", "Maven", "Gradle", "java -jar", "javac", "java version", "Java SDK",

        # R-specific terms
        "R package", "CRAN", "install.packages", "Rscript", "R environment", "R version"
    ]

    if not readme:
        return {}
    
    matches = check_keywords(readme, keywords)
    return True if matches else False


def commands(readme):
    """
    Check if readme includes commands that can be run to reproduce data
    """
    keywords = [
        "run", "execute", "executable", "command", "reproduce", "steps", "usage", 
        "experiment", "reproduction", "how to run", "command line", 
        "terminal", "CLI", "script", "bash", "shell", "notebook", "ipynb", "notebooks"
    ]

    if not readme:
        return {}
    
    matches = check_keywords(readme, keywords)
    return True if matches else False

def parameters(readme):
    """
    Check if readme includes parameters that can be run to reproduce data

    Checks for 2 things:
    1 - keywords about parameters
    2 - anything of the format --[something]=[something]
    """

    keywords = ["parameters", "hyperparameters", "configuration", "settings", "args", "arguments", "--[a-zA-Z0-9_-]+=[a-zA-Z0-9_-]+"]

    if not readme:
        return {}
    
    matches = check_keywords(readme, keywords)
    return True if matches else False

check_functions = {
    "readme not empty": readme_not_empty,
    "environment setup": enviroment_setup,
    "data location": data_location,
    "commands": commands,
    "parameters": parameters
    # ADD NEW FUNCTIONS HERE
}
    

In [27]:
check_repos = {}

for check in check_functions:
    check_repos[check] = []

# Define the path to the repos library
repos_path = './repos'

valid_readmes = 0

readmes = {}

# Loop over each folder in the repos library
for folder_name in os.listdir(repos_path):
    folder_path = os.path.join(repos_path, folder_name)

    # Ensure we're only looking at directories
    if os.path.isdir(folder_path):
        code_path = os.path.join(folder_path, 'code')

        # Check if the 'code' folder exists within each repo folder
        if os.path.isdir(code_path):
            # Loop over files in the 'code' folder
            for file_name in os.listdir(code_path):
                # Check if the file name contains "readme" (case insensitive)
                if 'readme' in file_name.lower():
                    file_path = os.path.join(code_path, file_name)

                    # Ensure it's a file and read its contents
                    if os.path.isfile(file_path):
                        try:
                            with open(file_path, 'r', encoding='utf-8') as file:
                                contents = file.read()

                                print(folder_name)
                                for check in check_functions:
                                    #print(check)
                                    if check_functions[check](contents):
                                        #print(check, folder_name)
                                        check_repos[check].append(folder_name)
                            valid_readmes+=1
                            readmes[folder_name] = contents
                                
                        except Exception as er:
                            print("OOPS 1")
                            print(er)
                            print("OOPS 2")


capsule-4600160
capsule-2916503
capsule-5496369
capsule-4807644
capsule-5777882
capsule-7156696
capsule-1683542
capsule-0940461
capsule-5367566
capsule-6746514
capsule-0201225
capsule-0220918
OOPS 1
'utf-8' codec can't decode byte 0xb5 in position 11: invalid start byte
OOPS 2
capsule-5286757
capsule-9370340
capsule-4645832
capsule-3497606
capsule-1906954
capsule-3272782
capsule-2011424
capsule-1108125
capsule-0325493
OOPS 1
'utf-8' codec can't decode byte 0xd0 in position 10: invalid continuation byte
OOPS 2
capsule-0238624
capsule-1324693
capsule-9348218
capsule-2061060
capsule-9070543
capsule-3269870
capsule-6460826
capsule-4098236
capsule-7935517


In [31]:
for c in check_repos:
    print(c)
    print(check_repos[c])
    print(len(check_repos[c]))

readme not empty
['capsule-4600160', 'capsule-2916503', 'capsule-5496369', 'capsule-4807644', 'capsule-5777882', 'capsule-7156696', 'capsule-1683542', 'capsule-0940461', 'capsule-5367566', 'capsule-6746514', 'capsule-0201225', 'capsule-0220918', 'capsule-5286757', 'capsule-9370340', 'capsule-4645832', 'capsule-3497606', 'capsule-1906954', 'capsule-3272782', 'capsule-2011424', 'capsule-1108125', 'capsule-0325493', 'capsule-0238624', 'capsule-1324693', 'capsule-9348218', 'capsule-2061060', 'capsule-9070543', 'capsule-3269870', 'capsule-6460826', 'capsule-4098236', 'capsule-7935517']
30
environment setup
['capsule-4600160', 'capsule-2916503', 'capsule-4807644', 'capsule-7156696', 'capsule-0220918', 'capsule-1906954', 'capsule-3272782', 'capsule-2011424', 'capsule-0325493', 'capsule-0238624', 'capsule-9348218', 'capsule-2061060', 'capsule-9070543', 'capsule-4098236', 'capsule-7935517']
15
data location
['capsule-4600160', 'capsule-2916503', 'capsule-4807644', 'capsule-5777882', 'capsule-71