In [1]:
import time
from urllib.request import urlopen
from urllib.request import Request
import urllib.error
import json
import sys

def get_last_recorded_repo(file_path):
    """从文件中读取最后一行的仓库数据"""
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
            if lines:
                count = len(lines)
                last_line = lines[-1].strip()
                repo_data = last_line.split(',')
                repo_full_name = repo_data[1]
                
                # 使用 /repos/{owner}/{repo} 获取仓库信息
                url = f'https://api.github.com/repos/{repo_full_name}'
                req = Request(url)
                response = urlopen(req).read()
                result = json.loads(response.decode())
                min_stars = result['stargazers_count']
                
                search = f'stars:<{min_stars}'
                return count + 1, search
            else:
                return 1, 'stars:>100'  # 初始状态，未找到记录
    except FileNotFoundError:
        return 1, 'stars:>100'  # 文件不存在时从初始状态开始

    
def get_results(search, headers, page):
    url = f'https://api.github.com/search/repositories?q={search}&page={page}&per_page=100&sort=stars&order=desc'
    req = Request(url, headers=headers)
    response = urlopen(req).read()
    result = json.loads(response.decode())
    return result


def get_rate_limit(headers):
    url = 'https://api.github.com/rate_limit'
    req = Request(url, headers=headers)
    response = urlopen(req).read()
    result = json.loads(response.decode())

    remaining_core = result['resources']['core']['remaining']
    reset_core = result['resources']['core']['reset']
    remaining_search = result['resources']['search']['remaining']
    reset_search = result['resources']['search']['reset']

    return remaining_core, reset_core, remaining_search, reset_search

def update_count_display(count):
    """在控制台同一行实时打印count"""
    sys.stdout.write(f'\rCurrent count: {count}')  # 使用 \r 覆盖当前行
    sys.stdout.flush()  # 立即刷新输出，不要等到换行符

if __name__ == '__main__':
    search = 'stars:>100'  # 初始条件
    min_stars = 1000000  # 假设初始最大 stars 数值
    total_repos = 1000000  # 设置需要获取的仓库总数

    # 修改 GitHub token 值
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Authorization': 'token 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',
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }

    count, search = get_last_recorded_repo("Repos.txt")

    while count <= total_repos:
        repos_list = []
        stars_list = []

        for page in range(1, 11):
            remaining_core, reset_core, remaining_search, reset_search = get_rate_limit(headers)

            if remaining_search < 1:
                sleep_time = reset_search - time.time() + 1
                print(f'Search rate limit exceeded. Sleeping for {sleep_time} seconds.')
                time.sleep(sleep_time)

            if remaining_core < 1:
                sleep_time = reset_core - time.time() + 1
                print(f'Core rate limit exceeded. Sleeping for {sleep_time} seconds.')
                time.sleep(sleep_time)

            try:
                results = get_results(search, headers, page)
                for item in results['items']:
                    repos_list.append([count, item["full_name"], item["clone_url"]])
                    stars_list.append(item["stargazers_count"])  # 记录 stars 数量
                    count += 1
                    if count > total_repos:
                        break

                update_count_display(count)
                if count > total_repos:
                    break
            except urllib.error.HTTPError as e:
                if e.code == 403:
                    print(f'Error fetching results on page {page}: Rate limit reached (HTTP 403).')
                    sleep_time = reset_search - time.time() + 1
                    print(f'Sleeping for {sleep_time} seconds until rate limit resets.')
                    time.sleep(sleep_time)
                elif e.code == 422:
                    print(f'Error fetching results on page {page}: Unprocessable Entity (HTTP 422). Check the search query.')
                    break
                else:
                    print(f'Error fetching results on page {page}: {e}')
                    break


        min_stars = stars_list[-1]  # 取该批次中最小的 stars 值

        # 更新搜索条件，确保获取的是 stars 小于当前批次最小 stars 的仓库
        search = f'stars:<{min_stars}'
        print(f'Next iteration will fetch repositories with stars less than {min_stars}')

        with open("Repos.txt", "a", encoding="utf-8") as f:
            for repo in repos_list:
                f.write(f'{repo[0]},{repo[1]},{repo[2]}\n')
                f.flush()  # 确保数据立即写入磁盘


        # 避免触发速率限制
        time.sleep(0.1)



Current count: 3001Error fetching results on page 5: Rate limit reached (HTTP 403).
Sleeping for 42.25113010406494 seconds until rate limit resets.
Current count: 3401Error fetching results on page 10: Rate limit reached (HTTP 403).
Sleeping for 44.22914481163025 seconds until rate limit resets.
Next iteration will fetch repositories with stars less than 1671
Current count: 3801Error fetching results on page 5: Rate limit reached (HTTP 403).
Sleeping for 43.15170073509216 seconds until rate limit resets.
Current count: 4201Error fetching results on page 10: Rate limit reached (HTTP 403).
Sleeping for 43.31967544555664 seconds until rate limit resets.
Next iteration will fetch repositories with stars less than 1430
Current count: 4601Error fetching results on page 5: Rate limit reached (HTTP 403).
Sleeping for 43.500046491622925 seconds until rate limit resets.
Current count: 5001Error fetching results on page 10: Rate limit reached (HTTP 403).
Sleeping for 42.376211404800415 seconds un

IndexError: list index out of range