## Install dependencies

In [1]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install -q PyGithub nbformat

# Get all repos

Replace `GH_ACCESS_TOKEN` with your GitHub access token
- You can proceed without setting `GH_ACCESS_TOKEN`; in that case the rate limit is 60 requests / hour.
- With `GH_ACCESS_TOKEN` set, the rate limit is 5000 requests / hour.
- [Reference](https://docs.github.com/en/rest/overview/resources-in-the-rest-api?apiVersion=2022-11-28#rate-limits-for-requests-from-personal-accounts)

In [2]:
from github import Github


def get_repos(username, access_token=None, include_fork=False):
    """Fetches repositories for a particular GitHub user.

    Args:
        username: GitHub login whose repositories are listed.
        access_token: Optional GitHub access token. `None` or an empty string
            means no token is handed to the client.
        include_fork: When truthy, forked repositories are returned as well;
            otherwise only non-fork repositories are kept.

    Returns:
        A list with the repository objects yielded by `user.get_repos()`, in
        the order they were yielded.
    """
    # The notebook's placeholder token is "", so normalise any empty value to
    # None instead of passing a blank credential to the client.
    g = Github(access_token or None)
    user = g.get_user(username)

    return [repo for repo in user.get_repos() if include_fork or not repo.fork]

In [3]:
import os

USERNAME = "sayakpaul"  # @param ["sayakpaul", "deep-diver"]
# Read the token from the environment so it never has to be typed into (and
# saved with) the notebook. Falls back to "" when the variable is unset.
GH_ACCESS_TOKEN = os.environ.get("GH_ACCESS_TOKEN", "")

repos = get_repos(USERNAME, GH_ACCESS_TOKEN)

In [4]:
from pprint import pprint

# Number of repositories fetched, followed by the full list.
print(len(repos))
pprint(repos)

138
[Repository(full_name="sayakpaul/A-B-testing-with-Machine-Learning"),
 Repository(full_name="sayakpaul/A-Barebones-Image-Retrieval-System"),
 Repository(full_name="sayakpaul/Action-Recognition-in-TensorFlow"),
 Repository(full_name="sayakpaul/AdaMatch-TF"),
 Repository(full_name="sayakpaul/Adaptive-Gradient-Clipping"),
 Repository(full_name="sayakpaul/Adventures-in-TensorFlow-Lite"),
 Repository(full_name="sayakpaul/Adversarial-Examples-in-Deep-Learning"),
 Repository(full_name="sayakpaul/Aerial-Cactus-Identification"),
 Repository(full_name="sayakpaul/Age-Detection-of-Indian-Actors"),
 Repository(full_name="sayakpaul/Age-Detector-API"),
 Repository(full_name="sayakpaul/Age-Detector-Web-App"),
 Repository(full_name="sayakpaul/Anagram"),
 Repository(full_name="sayakpaul/Analysis-of-college-database-of-2017-passouts"),
 Repository(full_name="sayakpaul/Analytics-Vidhya-Game-of-Deep-Learning-Hackathon"),
 Repository(full_name="sayakpaul/Applied-Data-Science-w-Python-Specialization"),
 

# Extract source code and save it as CSV

The code below works as follows:

1. Get the list of files (`*.py` and `*.ipynb`) in the target repositories (`target_repos`)
2. Fetch the content of each file and decode it with `base64`
  - for `*.py`, the plain text is extracted
  - for `*.ipynb`, the contents of all the code cells are extracted and merged into a single string (a heuristic filters out code cells involving shell commands and the like)
3. Create a `pd.DataFrame` with the columns `["reponame", "filepath", "content"]` to store the repository, the file path, and the extracted content
4. Repeat steps 1–3 for all target repositories, appending each record to `df`, which contains all records

In [5]:
# Repositories to extract source code from, keyed by GitHub username. Only the
# list belonging to the selected USERNAME is kept; a username that is missing
# from the mapping raises KeyError.
target_repos = {
    "sayakpaul": [
        "Dual-Deployments-on-Vertex-AI",
        "CI-CD-for-Model-Training",
    ],
    "deep-diver": [
        "Continuous-Adaptation-for-Machine-Learning-System-to-Data-Changes",
        "semantic-segmentation-ml-pipeline",
        "mlops-hf-tf-vision-models",
    ],
}[USERNAME]

In [6]:
import base64
import pandas as pd
from nbformat import reads, NO_CONVERT
from github import GithubException

# Empty frame with one column per stored field; the extraction loop below
# appends one row per source file to it.
df = pd.DataFrame(
    columns=[
        "reponame",
        "filepath",
        "content",
    ]
)


def get_py_files(repo, file_list, path="."):
    """Fetches Python scripts and the notebooks for a given repo.

    Walks the repository tree breadth-first, starting at `path`, and appends
    every entry whose name ends with `.py` or `.ipynb` to `file_list`.

    Args:
        repo: Repository object exposing `get_contents(path)`.
        file_list: List that is extended in place with the matching entries.
        path: Directory to start from. "." (the default) and "" both mean the
            repository root.

    Returns:
        None. Results are delivered through `file_list`.
    """
    from collections import deque

    start = "" if path in (".", "") else path
    contents = repo.get_contents(start)
    # A single entry (rather than a list) is tolerated so that `path` may also
    # point at one file.
    if not isinstance(contents, list):
        contents = [contents]

    pending = deque(contents)
    while pending:
        entry = pending.popleft()
        if entry.type == "dir":
            pending.extend(repo.get_contents(entry.path))
        # Match on the full extension so that names merely ending in "py"
        # (without the dot) are not collected.
        elif entry.name.endswith((".py", ".ipynb")):
            file_list.append(entry)


def filter_code_cell(cell):
    """Decides whether a code cell should be kept.

    A cell is rejected when its source starts with "!" or contains
    "%%capture" anywhere; every other cell is kept.

    Args:
        cell: Mapping with a string under the "source" key.

    Returns:
        True if the cell should be kept, False otherwise.
    """
    source = cell["source"]
    if source.startswith("!"):
        return False
    return "%%capture" not in source


def _extract_notebook_code(notebook_json):
    """Returns the code of a notebook (given as JSON text) as a single string.

    Only code cells accepted by `filter_code_cell` are kept.
    """
    notebook = reads(notebook_json, NO_CONVERT)
    sources = [
        cell["source"]
        for cell in notebook["cells"]
        if cell["cell_type"] == "code" and filter_code_cell(cell)
    ]
    # Cells are separated by a newline; plain concatenation would glue the last
    # line of one cell to the first line of the next whenever a cell's source
    # does not end with a newline.
    return "\n".join(sources)


def _extract_content(file):
    """Returns the text to store for `file`, or None when there is nothing to store."""
    encoded = file.content
    # Skip entries without a payload before decoding/parsing: an empty string
    # is not valid notebook JSON and would make `reads` fail.
    # NOTE(review): presumably this is what the contents API returns for large
    # files - confirm.
    if not encoded:
        return None

    text = base64.b64decode(encoded).decode("utf-8")
    if file.name.endswith(".ipynb"):
        return _extract_notebook_code(text)
    if file.name.endswith(".py"):
        return text
    return None


# One single-row frame per extracted file, concatenated once after the loop
# instead of re-copying `df` for every row.
row_frames = []

for repo in repos:
    if repo.name not in target_repos:
        continue

    file_list = []
    get_py_files(repo, file_list)

    for file in file_list:
        content_str = _extract_content(file)
        # Nothing to record (no payload, or a notebook without usable code).
        if not content_str:
            continue

        row_frames.append(
            pd.DataFrame.from_dict(
                [
                    {
                        "reponame": f"{USERNAME}/{repo.name}",
                        "filepath": file.path,
                        "content": content_str,
                    }
                ]
            )
        )

# The row labels of the single-row frames are kept as they are (not reset):
# the later cell that removes the `__index_level_0__` column relies on the
# index being carried over into the dataset.
df = pd.concat([df, *row_frames])

  validate(nb)


In [7]:
# Preview the first rows of the collected frame.
df.head()

Unnamed: 0,reponame,filepath,content
0,sayakpaul/CI-CD-for-Model-Training,cloud_build_tfx.ipynb,from google.colab import auth\nauth.authentica...
0,sayakpaul/CI-CD-for-Model-Training,cloud_function_trigger.ipynb,from google.colab import auth\nauth.authentica...
0,sayakpaul/CI-CD-for-Model-Training,cloud_scheduler_trigger.ipynb,# only need if you are using Colab\nfrom googl...
0,sayakpaul/CI-CD-for-Model-Training,build/compile_pipeline.py,import argparse\n\nfrom absl import logging\nf...
0,sayakpaul/CI-CD-for-Model-Training,build/create_pipeline.py,from tfx.orchestration import data_types\nfrom...


### Save the resulting `DataFrame` to CSV

In [8]:
# The row labels carry no information (every row is labelled 0, as the preview
# above shows), so they are not written to the file.
df.to_csv(f"{USERNAME}.csv", index=False)

## Prepare a 🤗 Datasets compatible dataset

In [9]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install -q datasets

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m474.6/474.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m110.5/110.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m212.5/212.5 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m134.3/134.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.0/1.0 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[2K     [

In [10]:
from datasets import Dataset

# Convert the frame into a Dataset. The frame's index is carried over as the
# extra `__index_level_0__` column (see the features listed in the output).
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['reponame', 'filepath', 'content', '__index_level_0__'],
    num_rows: 18
})

In [12]:
# Drop the index column carried over from the DataFrame. The guard keeps the
# cell re-runnable: once the column is gone, removing it again would raise.
if "__index_level_0__" in dataset.column_names:
    dataset = dataset.remove_columns(["__index_level_0__"])
dataset

Dataset({
    features: ['reponame', 'filepath', 'content'],
    num_rows: 18
})

In [13]:
# Spot-check a single record (row 13).
dataset[13]

{'reponame': 'sayakpaul/Dual-Deployments-on-Vertex-AI',
 'filepath': 'custom_components/vertex_uploader.py',
 'content': '"""\nCustom TFX component for importing a model into Vertex AI.\nAuthor: Sayak Paul\nReference: https://github.com/GoogleCloudPlatform/mlops-with-vertex-ai/blob/main/src/tfx_pipelines/components.py#L74\n"""\n\nimport os\nimport tensorflow as tf\n\nfrom tfx.dsl.component.experimental.decorators import component\nfrom tfx.dsl.component.experimental.annotations import Parameter\nfrom tfx.types.standard_artifacts import String\nfrom google.cloud import aiplatform as vertex_ai\nfrom tfx import v1 as tfx\nfrom absl import logging\n\n\n@component\ndef VertexUploader(\n    project: Parameter[str],\n    region: Parameter[str],\n    model_display_name: Parameter[str],\n    pushed_model_location: Parameter[str],\n    serving_image_uri: Parameter[str],\n    uploaded_model: tfx.dsl.components.OutputArtifact[String],\n):\n\n    vertex_ai.init(project=project, location=region)\n\n

In [None]:
# Log in through the Hugging Face CLI.
!huggingface-cli login

In [15]:
# Upload the dataset to the Hub. The repo id is fixed; it does not depend on
# USERNAME.
dataset.push_to_hub("coding-assistant-custom/dummy-dataset-sayak")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]