第一步 获取 GitHub 仓库信息（API入门）

In [None]:
import requests

# Repository metadata endpoint of the GitHub REST API.
repo_api_url = 'https://api.github.com/repos/pandas-dev/pandas'

# TLS certificate verification is left at the requests default (enabled), and
# the timeout bounds how long a stalled connection can block the notebook.
response = requests.get(repo_api_url, timeout=10)
# Stop here on an HTTP error (e.g. rate limiting) instead of failing later
# with a KeyError when the error payload lacks the expected fields.
response.raise_for_status()
repo_data = response.json()

# Print a few fields of the repository record.
print("仓库名：", repo_data["name"])
print("所有者：", repo_data["owner"]["login"])
print("描述：", repo_data["description"])
print("主页：", repo_data["homepage"])
print("分叉数：", repo_data["forks_count"])
print("关注者数：", repo_data["stargazers_count"])

第二步 获取拉取请求（PR）信息

In [None]:
# Pull-request list endpoint of the GitHub REST API.
pulls_api_url = 'https://api.github.com/repos/pandas-dev/pandas/pulls'

# The timeout keeps a stalled connection from blocking the notebook forever.
response = requests.get(pulls_api_url, timeout=10)
# An error response is a JSON object rather than a list, so fail here with the
# HTTP error instead of on the slice below.
response.raise_for_status()
pulls_data = response.json()

# Print title, author, state and creation time of the first three PRs.
for pr in pulls_data[:3]:
    print("PR标题：", pr["title"])
    print("作者：", pr["user"]["login"])
    print("状态：", pr["state"])
    print("创建时间：", pr["created_at"])
    print("-" * 30)

第三步 获取PR详细信息（如提交次数、改动文件数）

In [None]:
# Use the number of the first PR returned by the list request in step 2.
pr_number = pulls_data[0]["number"]
pr_detail_url = f'https://api.github.com/repos/pandas-dev/pandas/pulls/{pr_number}'

# Bound the wait and surface HTTP errors (e.g. rate limiting) right away
# instead of as a KeyError on the fields read below.
pr_response = requests.get(pr_detail_url, timeout=10)
pr_response.raise_for_status()
pr_detail = pr_response.json()

# Size statistics reported by the single-PR endpoint.
print("提交次数：", pr_detail["commits"])
print("改动文件数：", pr_detail["changed_files"])
print("添加行数：", pr_detail["additions"])
print("删除行数：", pr_detail["deletions"])

第四步 获取用户信息

In [None]:
# Look up the author of the first PR returned by the list request in step 2.
user_login = pulls_data[0]["user"]["login"]
user_url = f'https://api.github.com/users/{user_login}'

# Bound the wait and surface HTTP errors (e.g. rate limiting) right away
# instead of as a KeyError on the fields read below.
user_response = requests.get(user_url, timeout=10)
user_response.raise_for_status()
user_data = user_response.json()

# Public profile counters of that account.
print("用户名：", user_data["login"])
print("仓库数：", user_data["public_repos"])
print("粉丝数：", user_data["followers"])
print("关注数：", user_data["following"])

第五步 面向对象组织数据（Class设计）

In [5]:
class Repository:
    """Snapshot of one repository's metadata.

    Attributes:
        name: Repository name.
        owner: Login of the owning account.
        description: Free-text description (may be None).
        homepage: Homepage URL (may be None).
        forks: Fork count.
        stars: Star count.
        license: License name, or an empty string when there is none.
        date_of_collection: Date on which the snapshot was taken.
        pull_requests: List that starts empty; intended to hold the
            repository's pull-request records.
    """

    def __init__(self, name, owner, description, homepage, forks, stars, license, date_of_collection):
        """Store the given fields and start with an empty pull-request list."""
        self.name = name
        self.owner = owner
        self.description = description
        self.homepage = homepage
        self.forks = forks
        self.stars = stars
        self.license = license
        self.date_of_collection = date_of_collection
        self.pull_requests = []

    def to_csv(self):
        """Return the repository as a single CSV row, without a line ending.

        Column order: owner, name, description, homepage, license, forks,
        stars, date_of_collection.

        Fields containing a comma, double quote or newline are quoted by the
        csv module, so free text such as the description cannot shift the
        columns. None values are written as empty fields.
        """
        # Imported locally so the method does not depend on a later cell
        # having imported these modules.
        import csv
        import io

        buffer = io.StringIO()
        csv.writer(buffer).writerow([
            self.owner,
            self.name,
            self.description,
            self.homepage,
            self.license,
            self.forks,
            self.stars,
            self.date_of_collection,
        ])
        # The writer appends a line terminator; callers expect the bare row.
        return buffer.getvalue().rstrip("\r\n")

class PullRequest:
    """Record of a single pull request.

    Attributes:
        title: PR title.
        number: PR number.
        body: PR description text (stored, but not part of the CSV row).
        state: PR state string.
        created_at: Creation timestamp.
        closed_at: Closing timestamp, or None.
        user: Author of the PR.
        commits: Number of commits.
        additions: Number of added lines.
        deletions: Number of deleted lines.
        changed_files: Number of changed files.
    """

    def __init__(self, title, number, body, state, created_at, closed_at, user, commits, additions, deletions, changed_files):
        """Store the given fields unchanged."""
        self.title = title
        self.number = number
        self.body = body
        self.state = state
        self.created_at = created_at
        self.closed_at = closed_at
        self.user = user
        self.commits = commits
        self.additions = additions
        self.deletions = deletions
        self.changed_files = changed_files

    def to_csv(self):
        """Return the pull request as a single CSV row, without a line ending.

        Column order: number, title, state, created_at, closed_at, user,
        commits, additions, deletions, changed_files. The body is not
        included.

        Fields containing a comma, double quote or newline are quoted by the
        csv module, so a title with commas cannot shift the columns. None
        values (e.g. closed_at) are written as empty fields.
        """
        # Imported locally so the method does not depend on a later cell
        # having imported these modules.
        import csv
        import io

        buffer = io.StringIO()
        csv.writer(buffer).writerow([
            self.number,
            self.title,
            self.state,
            self.created_at,
            self.closed_at,
            self.user,
            self.commits,
            self.additions,
            self.deletions,
            self.changed_files,
        ])
        # The writer appends a line terminator; callers expect the bare row.
        return buffer.getvalue().rstrip("\r\n")

class User:
    """Summary of one user account.

    Attributes:
        login: Account login name.
        num_prs: Number of pull requests attributed to the user.
        public_repos: Number of public repositories.
        followers: Number of followers.
        following: Number of accounts the user follows.
    """

    def __init__(self, login, num_prs, public_repos, followers, following):
        """Store the given fields unchanged."""
        self.login, self.num_prs = login, num_prs
        self.public_repos = public_repos
        self.followers, self.following = followers, following

    def to_csv(self):
        """Return the fields joined by commas, in constructor order."""
        columns = (
            self.login,
            self.num_prs,
            self.public_repos,
            self.followers,
            self.following,
        )
        return ",".join(f"{value}" for value in columns)

第六步 保存数据到CSV文件

In [6]:
import csv
from datetime import datetime

# Build the Repository record from the API response fetched in step 1.
repo_obj = Repository(
    name=repo_data["name"],
    owner=repo_data["owner"]["login"],
    description=repo_data["description"],
    homepage=repo_data["homepage"],
    forks=repo_data["forks_count"],
    stars=repo_data["stargazers_count"],
    # The license entry can be empty/None; fall back to an empty name then.
    license=repo_data["license"]["name"] if repo_data["license"] else "",
    date_of_collection=datetime.today().strftime('%Y-%m-%d')
)

# Append one row to the CSV file. The field values go to the writer directly
# (same column order as Repository.to_csv) so the csv module can quote a
# description that contains commas; splitting a pre-joined string on ","
# would break such a field into extra columns. None values are written as
# empty fields.
with open('projects.csv', 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow([
        repo_obj.owner,
        repo_obj.name,
        repo_obj.description,
        repo_obj.homepage,
        repo_obj.license,
        repo_obj.forks,
        repo_obj.stars,
        repo_obj.date_of_collection,
    ])

第七步 数据分析和可视化

In [None]:
import matplotlib
import matplotlib.pyplot as plt

matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # font used to render the Chinese labels
matplotlib.rcParams['axes.unicode_minus'] = False    # keep the minus sign displayable with that font

commits = []
headers = {"User-Agent": "Mozilla/5.0"}      # sent with every detail request

# One detail request is issued per PR, so share a single Session to reuse the
# HTTPS connection, and bound each request so one stalled call cannot hang
# the whole loop.
with requests.Session() as session:
    for pr in pulls_data:
        pr_number = pr["number"]
        pr_detail_url = f"https://api.github.com/repos/pandas-dev/pandas/pulls/{pr_number}"
        pr_detail = session.get(pr_detail_url, headers=headers, timeout=10).json()
        # Best effort: a response without a "commits" field (e.g. an error
        # payload) is skipped rather than aborting the collection.
        if "commits" in pr_detail:
            commits.append(pr_detail["commits"])
print(commits)

# Histogram of the commit counts collected above.
plt.hist(commits, bins=10)
plt.title('PR提交次数分布')
plt.xlabel('提交次数')
plt.ylabel('PR数量')
plt.show()