In [1]:
!pip install datasets
!pip install transformers
!pip install zstandard



## Getting the data


In [2]:
import requests

url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)

In [3]:
response.status_code

200

In [4]:
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/6237',
  'repository_url': 'https://api.github.com/repos/huggingface/datasets',
  'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/6237/labels{/name}',
  'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/6237/comments',
  'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/6237/events',
  'html_url': 'https://github.com/huggingface/datasets/issues/6237',
  'id': 1893822321,
  'node_id': 'I_kwDODunzps5w4W9x',
  'number': 6237,
  'title': 'Tokenization with multiple workers is too slow',
  'user': {'login': 'macabdul9',
   'id': 25720695,
   'node_id': 'MDQ6VXNlcjI1NzIwNjk1',
   'avatar_url': 'https://avatars.githubusercontent.com/u/25720695?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/macabdul9',
   'html_url': 'https://github.com/macabdul9',
   'followers_url': 'https://api.github.com/users/macabdul9/followers',
   'following_url':

In [5]:
GITHUB_TOKEN = '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'
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

In [6]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm

def fetch_issues(owner="huggingface", repo="datasets", num_issues=10000, rate_limit=5000, issues_path=Path(".")):
  if not issues_path.is_dir():
    issues_path.mkdir(exist_ok=True)

  batch = []
  all_issues = []
  per_page = 100 # Number of issues to return per page
  num_pages = math.ceil(num_issues / per_page)
  base_url = "https://api.github.com/repos"

  for page in tqdm(range(num_pages)):
    # Query with state=all to get both open and closed issues
    query = f"issues?page={page}&per_page={per_page}&state=all"
    issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
    batch.extend(issues.json())

    if len(batch)  > rate_limit and len(all_issues) < num_issues:
      all_issues.extend(batch)
      batch = [] # Flush batch for next time period
      print(f"Reached GitHub rate limit. Sleeping for one hour ...")
      time.sleep(60 * 60 + 1)

  all_issues.extend(batch)
  df = pd.DataFrame.from_records(all_issues)
  df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
  print(
      f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
  )

In [None]:
fetch_issues()

  0%|          | 0/100 [00:00<?, ?it/s]

Reached GitHub rate limit. Sleeping for one hour ...


In [None]:
from datasets import load_dataset

issues_dataset = load_dataset("json", data_files="datasets-issues.jsonl", split="train")
issues_dataset

## Cleaning up the data


In [None]:
sample = issues_dataset.shuffle(seed=666).select(range(3))

# Print out the URL and pull request entries
for url, pr in zip(sample["html_url"], sample["pull_request"]):
    print(f">> URL: {url}")
    print(f">> Pull request: {pr}\n")

In [None]:
issues_dataset = issues_dataset.map(
    lambda x: {"is_pull_request": False if x["pull_request"] is None else True}
)

## Augmenting the dataset


In [None]:
issue_number = 2792
url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
response.json()

In [None]:
def get_comments(issue_number):
    url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    response = requests.get(url, headers=headers)
    return [r["body"] for r in response.json()]


# Test our function works as expected
get_comments(2792)

In [None]:
issues_with_comments_dataset = issues_dataset.map(
    lambda x: {"comments": get_comments(x["number"])}
)