# CREATING CUSTOM DATASET & FAISS

In [5]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [1]:
import requests

# issues from huggingface repo
url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)

In [3]:
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/7438',
  'repository_url': 'https://api.github.com/repos/huggingface/datasets',
  'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/7438/labels{/name}',
  'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/7438/comments',
  'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/7438/events',
  'html_url': 'https://github.com/huggingface/datasets/pull/7438',
  'id': 2899209484,
  'node_id': 'PR_kwDODunzps6Nk37h',
  'number': 7438,
  'title': 'Allow dataset row indexing with np.int types (#7423)',
  'user': {'login': 'DavidRConnell',
   'id': 35470740,
   'node_id': 'MDQ6VXNlcjM1NDcwNzQw',
   'avatar_url': 'https://avatars.githubusercontent.com/u/35470740?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/DavidRConnell',
   'html_url': 'https://github.com/DavidRConnell',
   'followers_url': 'https://api.github.com/users/DavidRConnell/followers'

In [None]:
# there is a limit of 60 requests per hour from unauthenticated sources
# so setting up github token

github_token = "Token"
headers = {"Authorization": f"token {github_token}"}

In [5]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,       # number of issues to fetch by default
    rate_limit=5_000,       # rate_limit on authorized token
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [6]:
# let's just get the first 5000 issues
fetch_issues(num_issues=5000)

  0%|          | 0/50 [00:00<?, ?it/s]

Downloaded all the issues for datasets! Dataset stored at ./datasets-issues.jsonl


In [6]:
# df = pd.read_json("datasets-issues.jsonl", lines=True)

In [66]:
from datasets import Dataset

data = Dataset.from_pandas(df)
data

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'sub_issues_summary', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason'],
    num_rows: 5000
})

In [67]:
# difficult to load directly into huggingface datasets due to some missing values in timestamp

In [68]:
# In data we have a pull_request column which has PRs, which needs to be cleaned
sample = data.shuffle(seed=42).select(range(5))

# Print out the URL and pull request entries
for url, pr in zip(sample["html_url"], sample["pull_request"]):
    print(f">> URL: {url}")
    print(f">> Pull request: {pr}\n")

>> URL: https://github.com/huggingface/datasets/pull/5944
>> Pull request: {'diff_url': 'https://github.com/huggingface/datasets/pull/5944.diff', 'html_url': 'https://github.com/huggingface/datasets/pull/5944', 'merged_at': '2023-06-13T17:29:01Z', 'patch_url': 'https://github.com/huggingface/datasets/pull/5944.patch', 'url': 'https://api.github.com/repos/huggingface/datasets/pulls/5944'}

>> URL: https://github.com/huggingface/datasets/pull/6312
>> Pull request: {'diff_url': 'https://github.com/huggingface/datasets/pull/6312.diff', 'html_url': 'https://github.com/huggingface/datasets/pull/6312', 'merged_at': '2023-10-19T16:23:07Z', 'patch_url': 'https://github.com/huggingface/datasets/pull/6312.patch', 'url': 'https://api.github.com/repos/huggingface/datasets/pulls/6312'}

>> URL: https://github.com/huggingface/datasets/issues/4051
>> Pull request: None

>> URL: https://github.com/huggingface/datasets/issues/3599
>> Pull request: None

>> URL: https://github.com/huggingface/datasets/is

In [69]:
# instead of dropping, let us add an extra column: keeping dataset as raw as possible
issues_dataset = data.map(lambda x: {"is_pull_request" : True if x["pull_request"] is not None else False} )

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [70]:
sample = issues_dataset.shuffle(1337).select(range(3))

for issue_number in sample["number"]:
    print(f">> Comments for issue #{issue_number}:")

>> Comments for issue #6560:
>> Comments for issue #4764:
>> Comments for issue #2857:


In [1]:
def get_comments(issue_number):
    url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    response = requests.get(url, headers=headers)
    return [r["body"] for r in response.json()]


# Test our function works as expected
# get_comments(2857)

In [74]:
# adding a new column with comments
# issues_dataset = issues_dataset.map(
#     lambda x: {"comments": get_comments(x["number"])}
# )

In [54]:
issues_dataset.to_json("github_issues_with_comments.jsonl", orient="records", lines=True)

Creating json from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

41448148

### Now upload the dataset to the HuggingFace hub

## FAISS (Facebook AI Semantic Search)

In Transformer-based language models, each word or token in a piece of text is represented as an embedding vector—a numerical representation that captures its meaning and context. However, if we want to analyze entire sentences, paragraphs, or documents, we need a way to create a single vector that represents a larger span of text. This process is called pooling.

- Mean Pooling
- Max Pooling
- [CLS] Token Representation: Some models (like BERT) use a special [CLS] token at the beginning of the input and treat its embedding as the representation for the whole sequence.

Once we have vector representations of sentences or documents, we can compare them mathematically:
- Cosine similarity
- Euclidean Distance

We can see that there are a lot of columns in our dataset, most of which we don’t need to build our search engine. From a search perspective, the most informative columns are title, body, and comments, while html_url provides us with a link back to the source issue


We will learn something about **explode()** function in pandas
```bash
DataFrame.explode(column, ignore_index=True)
```
This will transform a column with a list of values into multiple rows of the dataframe
It returns a new data-frame

Here we need to explode the comments sections

In [22]:
# FAISS is not available presenly in pip
# use conda
# https://huggingface.co/learn/nlp-course/en/chapter5/6?fw=pt        -- read docs to understand faiss