In [None]:
import os
import time
from datetime import datetime, timedelta

import pandas as pd
import requests

## Scrape GitHub Issues
Looking for ChatGPT share links inside issue threads across all of GitHub, from January 2023 through April 2024.

In [None]:
def _issue_to_record(issue):
    """Flatten one search-result item into the dict of fields kept for analysis."""
    # `user` may be null in the API payload; treat that as "no login".
    user = issue.get("user") or {}
    return {
        "title": issue["title"],
        "html_url": issue["html_url"],
        "api_url": issue["url"],
        "repository_url": issue["repository_url"],
        "comments_url": issue["comments_url"],
        "state_reason": issue.get("state_reason"),
        "state": issue["state"],
        "number_of_comments": issue["comments"],
        "body": issue["body"],
        "user": user.get("login"),
        "labels": [label["name"] for label in issue.get("labels") or []],
    }


def _rate_limit_wait_seconds(response):
    """Return how long to sleep before retrying a throttled response, or None if unknown."""
    retry_after = response.headers.get("Retry-After")
    if retry_after is not None:
        try:
            return max(float(retry_after), 0.0) + 1
        except ValueError:
            return None
    if response.headers.get("X-RateLimit-Remaining") == "0":
        try:
            reset_epoch = float(response.headers.get("X-RateLimit-Reset"))
        except (TypeError, ValueError):
            return None
        # One extra second of slack so the retry lands after the reset.
        return max(reset_epoch - time.time(), 0.0) + 1
    return None


def search_github_issues(search_string, token, start_date, end_date):
    """Search GitHub issues containing `search_string` created in a date range.

    Pages through the GitHub issue-search endpoint (100 results per page) and
    collects a flat record for every hit.

    Parameters
    ----------
    search_string : str
        Text to search for; placed verbatim at the start of the `q` parameter.
    token : str
        GitHub personal access token sent in the Authorization header.
    start_date, end_date : str
        Bounds used in the `created:{start_date}..{end_date}` qualifier.

    Returns
    -------
    list[dict] or str
        A list of issue records (keys: title, html_url, api_url,
        repository_url, comments_url, state_reason, state,
        number_of_comments, body, user, labels), or the string 'Error' when a
        request fails or returns a non-200 status that cannot be retried.

    Notes
    -----
    The search API serves at most the first 1000 results of a query, so paging
    stops there instead of requesting a page the API would reject. A warning
    is printed when the reported total exceeds that cap; narrow the date range
    to retrieve everything.
    """
    url = "https://api.github.com/search/issues"
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json"
    }
    per_page = 100
    max_results = 1000           # hard cap on results served per search query
    max_rate_limit_retries = 5   # bounded so a persistent 403 cannot loop forever
    timeout_seconds = 30

    issues = []
    page = 1
    rate_limit_retries = 0
    while True:
        params = {
            "q": f"{search_string} created:{start_date}..{end_date}",
            "page": page,
            "per_page": per_page
        }
        try:
            response = requests.get(url, headers=headers, params=params, timeout=timeout_seconds)
        except requests.RequestException as exc:
            print("Error occurred:", exc)
            return 'Error'

        # Throttled: wait for the window to reset and retry the same page.
        if response.status_code in (403, 429) and rate_limit_retries < max_rate_limit_retries:
            wait_seconds = _rate_limit_wait_seconds(response)
            if wait_seconds is not None:
                rate_limit_retries += 1
                print(f"Rate limited; retrying in {wait_seconds:.0f}s")
                time.sleep(wait_seconds)
                continue

        if response.status_code != 200:
            try:
                detail = response.json()
            except ValueError:
                # Error bodies are not guaranteed to be JSON.
                detail = response.text
            print("Error occurred:", detail)
            return 'Error'
        rate_limit_retries = 0

        results = response.json()
        if not results or results.get("items") is None:
            break

        total_count = results.get("total_count")
        if page == 1 and isinstance(total_count, int) and total_count > max_results:
            print(f"Warning: {total_count} matches for {start_date}..{end_date}; "
                  f"only the first {max_results} can be retrieved")

        page_items = results["items"]
        issues.extend(_issue_to_record(issue) for issue in page_items)

        # Stop on a short (last) page, or once the result cap is reached.
        if len(page_items) < per_page or page * per_page >= max_results:
            break

        page += 1

    return issues

In [None]:
# Search string
search_string_gpt = "chat.openai.com/share"

# Your GitHub personal access token.
# Read from the GITHUB_TOKEN environment variable so the secret is never saved
# in the notebook; "#" remains the placeholder when the variable is unset.
token = os.environ.get("GITHUB_TOKEN", "#")
github_token = token

# Date ranges to split the search
start_date = datetime(2023, 1, 1)
end_date = datetime(2024, 4, 30)
delta = timedelta(days=1)  # Width of each search window (must be at least one day)
one_day = timedelta(days=1)

trend_of_sharing_gpt = {}
trend_of_sharing_stackoverflow = {}

# Retrieve the issues for each date range
issues = []
window_start = start_date
while window_start <= end_date:
    # The `created:A..B` range is queried as inclusive on both ends, so a
    # window ends the day before the next one starts. Otherwise the boundary
    # day is fetched twice and its issues are duplicated.
    window_end = min(window_start + delta - one_day, end_date)

    # Format dates in ISO format
    start_date_str = window_start.strftime("%Y-%m-%d")
    window_end_str = window_end.strftime("%Y-%m-%d")
    window_key = start_date_str + ' ' + window_end_str

    # Search for issues within the current date range
    print(window_key)
    issues_in_range = search_github_issues(search_string_gpt, github_token, start_date_str, window_end_str)
    if issues_in_range == 'Error':
        # Report where collection stopped; results gathered so far are kept.
        print(window_start)
        break

    trend_of_sharing_gpt[window_key] = len(issues_in_range)
    # NOTE(review): the original stored these GPT counts only in
    # trend_of_sharing_stackoverflow, which looks like a copy-paste slip. The
    # value is mirrored here in case a later cell still reads it; remove this
    # line once that is confirmed.
    trend_of_sharing_stackoverflow[window_key] = len(issues_in_range)
    issues.extend(issues_in_range)

    # Always advance past the window just queried, so the loop terminates even
    # when the window was clamped to end_date.
    window_start = window_end + one_day