Set up PyGithub

In [1]:
%pip install PyGithub requests


Collecting PyGithub
  Downloading PyGithub-2.5.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from PyGithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Downloading PyGithub-2.5.0-py3-none-any.whl (375 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.9/375.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynacl, PyGithub
Successfully installed PyGithub-2.5.0 pynacl-1.5.0


In [None]:
import os
from github import Github

os.environ["GITHUB_API_KEY"] = "GITHUB_API_KEY_HERE"
# Set up your GitHub API token here
GITHUB_TOKEN = os.getenv('GITHUB_API_KEY')  # Set this environment variable before running the script
if not GITHUB_TOKEN:
    raise EnvironmentError("Please set the GITHUB_TOKEN environment variable with your GitHub API token.")

# Initialize GitHub API client
g = Github(GITHUB_TOKEN)

In [3]:
import requests
# Function to download a file from a GitHub repository
def download_file(url, output_path):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            with open(output_path, 'wb') as file:
                file.write(response.content)
                print(f"Downloaded: {output_path}")
        else:
            print(f"Failed to download {url}. Status code: {response.status_code}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")

In [4]:
import re
# Function to extract short functions from C++ code
def extract_short_functions(file_path):
    short_functions = []
    with open(file_path, 'r') as file:
        code = file.read()
        # Use regex to find function definitions
        functions = re.findall(r'[\w\s\*&]+?\s+\w+\s*\([^)]*\)\s*{[^}]*}', code, re.DOTALL)
        for func in functions:
            # Split into lines and check if it's 10 or fewer lines
            lines = func.strip().splitlines()
            if len(lines) <= 10:
                short_functions.append(func.strip())
    return short_functions

In [11]:
# Function to search for C++ repositories and download the files
def download_cpp_code(search_query, max_repos, max_files_per_repo, output_dir):
    # Search for repositories matching the search query
    repos = g.search_repositories(query=SEARCH_QUERY)
    count = 0

    for repo in repos:
        if count >= MAX_REPOS:
            break

        print(f"Cloning repository: {repo.full_name}")
        repo_dir = os.path.join(OUTPUT_DIR, repo.name)
        os.makedirs(repo_dir, exist_ok=True)

        try:
            # Get all C++ files in the repository
            contents = repo.get_contents("")
            file_count = 0
            while contents and file_count < max_files_per_repo:
                file_content = contents.pop(0)
                if file_content.type == "dir":
                    contents.extend(repo.get_contents(file_content.path))
                elif file_content.name.endswith(('.cpp', '.h', '.hpp', '.cc')):
                    # Download the C++ file
                    file_url = file_content.download_url
                    if file_url:
                        output_path = os.path.join(repo_dir, file_content.name)
                        download_file(file_url, output_path)
                        # Extract and save short functions
                        short_funcs = extract_short_functions(output_path)
                        if short_funcs:
                            with open(os.path.join(repo_dir, f'short_functions_{file_content.name}'), 'w') as f:
                                f.write('\n\n'.join(short_funcs))
                                print(f"Extracted {len(short_funcs)} short functions from {file_content.name}.")
                                file_count += 1
        except Exception as e:
            print(f"Error processing repository {repo.full_name}: {e}")
            continue

        count += 1

In [14]:
# Parameters for the search
SEARCH_QUERY = 'language:cpp stars:>1000'  # Adjust the query to match your requirements
MAX_REPOS = 10  # Number of repositories to download from
MAX_FILES_PER_REPO = 10
OUTPUT_DIR = './cpp_code_1'  # Directory to save the downloaded C++ files

# Create the output directory if it doesn't exist
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [15]:
download_cpp_code(SEARCH_QUERY, MAX_REPOS, MAX_FILES_PER_REPO, OUTPUT_DIR)

Cloning repository: tensorflow/tensorflow
Downloaded: ./cpp_code_1/tensorflow/c_api.cc
Extracted 186 short functions from c_api.cc.
Downloaded: ./cpp_code_1/tensorflow/c_api.h
Downloaded: ./cpp_code_1/tensorflow/c_api_experimental.cc
Extracted 53 short functions from c_api_experimental.cc.
Downloaded: ./cpp_code_1/tensorflow/c_api_experimental.h
Downloaded: ./cpp_code_1/tensorflow/c_api_experimental_test.cc
Extracted 6 short functions from c_api_experimental_test.cc.
Downloaded: ./cpp_code_1/tensorflow/c_api_function.cc
Extracted 15 short functions from c_api_function.cc.
Downloaded: ./cpp_code_1/tensorflow/c_api_function_test.cc
Extracted 27 short functions from c_api_function_test.cc.
Downloaded: ./cpp_code_1/tensorflow/c_api_internal.h
Extracted 2 short functions from c_api_internal.h.
Downloaded: ./cpp_code_1/tensorflow/c_api_macros.h
Downloaded: ./cpp_code_1/tensorflow/c_api_macros_internal.h
Extracted 2 short functions from c_api_macros_internal.h.
Downloaded: ./cpp_code_1/tensor

In [18]:
!zip -r /content/code.zip /content/cpp_code_1
from google.colab import files
files.download("/content/code.zip")

  adding: content/cpp_code_1/ (stored 0%)
  adding: content/cpp_code_1/tensorflow/ (stored 0%)
  adding: content/cpp_code_1/tensorflow/short_functions_c_api_experimental.cc (deflated 75%)
  adding: content/cpp_code_1/tensorflow/short_functions_c_test_util.cc (deflated 81%)
  adding: content/cpp_code_1/tensorflow/short_functions_c_api_macros_internal.h (deflated 67%)
  adding: content/cpp_code_1/tensorflow/c_api_test.cc (deflated 81%)
  adding: content/cpp_code_1/tensorflow/c_api_experimental.h (deflated 70%)
  adding: content/cpp_code_1/tensorflow/short_functions_c_api_function_test.cc (deflated 69%)
  adding: content/cpp_code_1/tensorflow/c_api_macros_internal.h (deflated 62%)
  adding: content/cpp_code_1/tensorflow/short_functions_c_api.cc (deflated 80%)
  adding: content/cpp_code_1/tensorflow/c_api.h (deflated 78%)
  adding: content/cpp_code_1/tensorflow/short_functions_c_test_util.h (deflated 24%)
  adding: content/cpp_code_1/tensorflow/c_api_internal.h (deflated 65%)
  adding: con

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>