In [29]:
import requests
import pandas as pd
import os

# GitHub personal access token. Read it from the GITHUB_TOKEN environment
# variable so the secret never has to be pasted into the notebook; an empty
# value means the requests are sent unauthenticated.
access_token = os.environ.get('GITHUB_TOKEN', '')
# Attach the Authorization header only when a token is configured: sending
# "token " with nothing after it is a malformed credential.
headers = {'Authorization': f'token {access_token}'} if access_token else {}
repo_info = {
    "Repository Name": [],
}
# GitHub repository-search endpoint
url = 'https://api.github.com/search/repositories'
# 10 pages x 100 results = 1000 repositories (the search API caps a query at
# 1000 results, see the GitHub search documentation).
for page in range(1, 11):
    # Search parameters, ordered by star count (highest first)
    params = {
        'q': 'stars:<476',  # repositories with fewer than 476 stars
        'sort': 'stars',
        'order': 'desc',
        'per_page': 100,  # results per page (100 is the maximum)
        'page': page  # page number
    }

    # Issue the GET request; the timeout keeps a stalled connection from
    # hanging the cell forever (requests has no timeout by default).
    response = requests.get(url, headers=headers, params=params, timeout=30)

    # Only a 200 response carries a result page
    if response.status_code == 200:
        repositories = response.json().get('items', [])

        # Record and echo the full name ("owner/repo") of every hit
        for repo in repositories:
            repo_info["Repository Name"].append(repo['full_name'])
            print(f"Name: {repo['full_name']}")
    else:
        print(f"Failed to fetch repositories: {response.status_code}")

# Turn the collected names into a one-column DataFrame
df = pd.DataFrame(repo_info)

# Persist the names for the next step of the notebook
df.to_csv('repo_name10.csv', index=False)

Name: wycats/moneta
Name: mashaal/wild-cherry
Name: clayallsopp/routable-android
Name: jayrambhia/CropperNoCropper
Name: robrix/RXCollections
Name: horizon3ai/vcenter_saml_login
Name: keymetrics/docker-pm2
Name: tinystacks/precloud
Name: dxxzst/OfflineMap
Name: bryandlee/FreezeG
Name: skydoves/ColorPickerPreference
Name: oxc-project/javascript-parser-in-rust
Name: nslogx/weapp-poem
Name: grepplabs/kafka-proxy
Name: forscht/ddrive
Name: Kirilllive/tuesday-js
Name: ibireme/YYDispatchQueuePool
Name: allynbauer/statuspanic
Name: nessos/LinqOptimizer
Name: jonataslaw/get_server
Name: alvyxaz/barebones-masterserver
Name: nwutils/nw-updater
Name: rashevskyv/kefir
Name: wangdongdut/Online-Visual-Tracking-SOTA
Name: XifengGuo/DEC-keras
Name: Lyfhael/DeleteTweets
Name: gucong3000/mirror-config-china
Name: mcthesw/game-save-manager
Name: egoist/vue-client-only
Name: keremciu/font-bundles
Name: bradfitz/embiggen-disk
Name: hongyangAndroid/ColorfulStatusBar
Name: lionsoul2014/friso
Name: eclipse-ar

In [30]:
from github import Github
import pandas as pd

import os

# GitHub personal access token, taken from the GITHUB_TOKEN environment
# variable so the secret is not stored in the notebook.
ACCESS_TOKEN = os.environ.get('GITHUB_TOKEN', '')

# Create the PyGithub client. An empty token is passed as None so the client
# runs anonymously instead of sending an empty credential.
g = Github(ACCESS_TOKEN or None)

# Repository names to process, read from the "Repository Name" column
df = pd.read_csv('repo_name10.csv')
REPO_LIST = df['Repository Name'].tolist()

# List reserved for the collected repository records
repo_data = []

# Fetch the repository description
def get_repo_description(repo):
    """Return the ``description`` attribute of ``repo`` unchanged."""
    description = repo.description
    return description

# List the path of every file in the repository
def get_repo_contents(repo, path=""):
    """Walk the repository tree from ``path`` and list every file path found.

    Args:
        repo: Object exposing ``get_contents(path)``; in this notebook a
            PyGithub repository.
        path: Directory or file to start from. ``""`` is the repository root.

    Returns:
        str: One ``"File: <path>"`` line per non-directory entry, joined with
        newlines, in breadth-first order. Empty string when nothing is found.
    """
    root = repo.get_contents(path)
    # get_contents yields a single entry instead of a list when ``path`` names
    # a file, so normalise both shapes into a work queue.
    pending = list(root) if isinstance(root, (list, tuple)) else [root]
    code_contents = []
    # Advance an index instead of pop(0): same visiting order, but each step
    # is O(1) rather than shifting the whole list.
    position = 0
    while position < len(pending):
        file_content = pending[position]
        position += 1
        if file_content.type == "dir":
            # Queue the directory's children behind everything already pending
            pending.extend(repo.get_contents(file_content.path))
        else:
            code_contents.append(f"File: {file_content.path}")
    return "\n".join(code_contents)

# Summarise the first `limit` commits of the repository (3 by default)
def get_commit_history(repo, limit=3):
    """Format author, date and message of the first ``limit`` commits.

    Args:
        repo: Object exposing ``get_commits()`` whose result supports slicing.
        limit: Number of commits to include, taken from the start of the
            sequence returned by ``get_commits()``.

    Returns:
        str: Three lines per commit (``Author:``, ``Date:``, ``Message:``),
        joined with newlines.
    """
    lines = []
    for entry in repo.get_commits()[:limit]:
        lines.extend((
            f"Author: {entry.commit.author.name}",
            f"Date: {entry.commit.author.date}",
            f"Message: {entry.commit.message}",
        ))
    return "\n".join(lines)

# Fetch the README text
def get_readme(repo):
    """Return the README of ``repo`` as text.

    Args:
        repo: Object exposing ``get_readme()`` whose result has a bytes
            ``decoded_content`` attribute.

    Returns:
        str: The README decoded as UTF-8 (undecodable bytes become U+FFFD),
        or ``"No README found"`` when the README cannot be fetched.
    """
    try:
        raw = repo.get_readme().decoded_content
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) still stops a long crawl.
        return "No README found"
    # A README with stray non-UTF-8 bytes exists; keep its text instead of
    # reporting it as missing.
    return raw.decode("utf-8", errors="replace")

# Describe the first tag listed for the repository
def get_tags(repo):
    """Describe the first tag returned by ``repo.get_tags()``.

    Args:
        repo: Object exposing ``get_tags()``, ``get_git_ref(ref)`` and
            ``get_git_tag(sha)``; in this notebook a PyGithub repository.

    Returns:
        str: ``"No tags found"`` when the repository has no tags. Otherwise a
        ``Tag: <name>`` line followed either by ``Message:``, ``Tagger:`` and
        ``Date:`` lines (annotated tag) or by
        ``No additional tag information`` (lightweight tag, or the lookup
        failed).
    """
    tags = repo.get_tags()
    if tags.totalCount == 0:
        return "No tags found"
    tag = tags[0]
    tag_info = [f"Tag: {tag.name}"]
    details = []
    try:
        # get_git_tag needs the SHA of the annotated tag object, while
        # tag.commit.sha is the commit the tag points at. Resolve the tag's
        # ref to find out what it really references.
        target = repo.get_git_ref(f"tags/{tag.name}").object
        if target.type == "tag":
            tag_ref = repo.get_git_tag(target.sha)
            details = [
                f"Message: {tag_ref.message}",
                f"Tagger: {tag_ref.tagger.name}",
                f"Date: {tag_ref.tagger.date}",
            ]
    except Exception:
        # Best effort: a failed lookup degrades to the fallback line below
        details = []
    tag_info.extend(details or ["No additional tag information"])
    return "\n".join(tag_info)

# Fetch the repository topics (at most ten)
def get_topics(repo):
    """Return up to the first ten topics of ``repo``.

    Args:
        repo: Object exposing ``get_topics()`` returning a list of topics.

    Returns:
        list | str: The first ten topics, or the string ``'No topics'`` when
        the repository has none.
    """
    # Fetch once and reuse the result: calling get_topics() a second time
    # would repeat the lookup.
    topics = repo.get_topics()
    if topics:
        return topics[0:10]
    return 'No topics'

# Process every repository and append its record to repos_info.csv
for repo_name in REPO_LIST:
    print(f"begin getting {repo_name}")
    try:
        repo = g.get_repo(repo_name)
        repo_info = {
            "Repository Name": repo_name,
            "Description": get_repo_description(repo),
            "Code Content": get_repo_contents(repo),
            "Commit History": get_commit_history(repo),
            "README Content": get_readme(repo),
            "Tags": get_tags(repo),
            "Topics": get_topics(repo)
        }
    except Exception as exc:
        # One repository that cannot be read must not abort the whole crawl;
        # report it and move on.
        print(f"failed getting {repo_name}: {exc}")
        continue

    # Wrap every value in a list so the record becomes a one-row DataFrame
    new_data_list = {key: [value] for key, value in repo_info.items()}
    df = pd.DataFrame(new_data_list)
    # Append row by row so finished work survives an interrupted run. The
    # header is written only while the file is still empty, so the CSV can be
    # read back with its column names.
    with open('repos_info.csv', 'a', newline='', encoding='utf-8') as csv_file:
        df.to_csv(csv_file, header=csv_file.tell() == 0, index=False)

print("Information saved to repos_info.csv")


begin getting wycats/moneta
begin getting mashaal/wild-cherry
begin getting clayallsopp/routable-android
begin getting jayrambhia/CropperNoCropper
begin getting robrix/RXCollections
begin getting horizon3ai/vcenter_saml_login
begin getting keymetrics/docker-pm2
begin getting tinystacks/precloud
begin getting dxxzst/OfflineMap
begin getting bryandlee/FreezeG
begin getting skydoves/ColorPickerPreference
begin getting oxc-project/javascript-parser-in-rust
begin getting nslogx/weapp-poem
begin getting grepplabs/kafka-proxy
begin getting forscht/ddrive
begin getting Kirilllive/tuesday-js
begin getting ibireme/YYDispatchQueuePool
begin getting allynbauer/statuspanic
begin getting nessos/LinqOptimizer
begin getting jonataslaw/get_server
begin getting alvyxaz/barebones-masterserver
begin getting nwutils/nw-updater
begin getting rashevskyv/kefir
begin getting wangdongdut/Online-Visual-Tracking-SOTA
begin getting XifengGuo/DEC-keras
begin getting Lyfhael/DeleteTweets
begin getting gucong3000/mir

Request GET /repos/Grokmoo/sulis/contents/data/sounds/sfx/sfx_100 failed with 403: Forbidden
Setting next backoff to 94.212673s


begin getting spacemonkeygo/openssl
begin getting kulpreetchilana/Custom-iOS-Keyboards
begin getting sachinkesiraju/SKSplashView
begin getting humanwhocodes/humanfs
begin getting kendo-labs/angular-kendo
begin getting akanazawa/cmr
begin getting PerseveranceZ/vue-develop-template
begin getting bcrusco/Forward-Plus-Renderer
begin getting wsl2ls/WKWebView
begin getting troyzhxu/okhttps
begin getting wiidev/usbloadergx
begin getting jefflai108/Contrastive-Predictive-Coding-PyTorch
begin getting mwgg/Airports
begin getting quantsbin/Quantsbin
begin getting Reginer/aosp-android-jar


Request GET /repos/Reginer/aosp-android-jar/contents/android-32/src/com/android/server/display/whitebalance failed with 403: Forbidden
Setting next backoff to 220.550838s


In [2]:
import pandas as pd
# Read the previously collected repository table and pull out the names
df = pd.read_csv('repo_allbut1.csv')
REPO_LIST = df['Repository Name'].to_list()

In [3]:
# Show the repository names held in REPO_LIST
REPO_LIST

['hiroi-sora/Umi-OCR',
 'MostlyAdequate/mostly-adequate-guide',
 'StreisandEffect/streisand',
 'youzan/vant',
 'gfwlist/gfwlist',
 'electron-react-boilerplate/electron-react-boilerplate',
 'kovidgoyal/kitty',
 'gocolly/colly',
 'responsively-org/responsively-app',
 'HeroTransitions/Hero',
 'Swordfish90/cool-retro-term',
 'opentofu/opentofu',
 'dylanaraps/neofetch',
 'pugjs/pug',
 'dandavison/delta',
 'julycoding/The-Art-Of-Programming-By-July-2nd',
 'typicode/lowdb',
 'ColorlibHQ/gentelella',
 'duckdb/duckdb',
 'pubkey/rxdb',
 'FelisCatus/SwitchyOmega',
 'emilk/egui',
 'JohnCoates/Aerial',
 'ajeetdsouza/zoxide',
 'petkaantonov/bluebird',
 'keepassxreboot/keepassxc',
 'graphql/graphql-js',
 'Bin-Huang/chatbox',
 'ungoogled-software/ungoogled-chromium',
 'SBoudrias/Inquirer.js',
 'zellij-org/zellij',
 'facebookarchive/pop',
 'thangchung/awesome-dotnet-core',
 'karpathy/minGPT',
 'timqian/chinese-independent-blogs',
 'vueuse/vueuse',
 'tobiasahlin/SpinKit',
 'wekan/wekan',
 'airbnb/visx',

In [8]:
from github import Github
def get_topics(repo):
    """Return up to the first ten topics of ``repo``.

    Args:
        repo: Object exposing ``get_topics()`` returning a list of topics.

    Returns:
        list | str: The first ten topics, or the string ``'No topics'`` when
        the repository has none.
    """
    # Fetch once and reuse the result: calling get_topics() a second time
    # would repeat the lookup.
    topics = repo.get_topics()
    if topics:
        return topics[0:10]
    return 'No topics'
    
import os

# GitHub personal access token, taken from the GITHUB_TOKEN environment
# variable so the secret is not stored in the notebook.
ACCESS_TOKEN = os.environ.get('GITHUB_TOKEN', '')

# Create the PyGithub client. An empty token is passed as None so the client
# runs anonymously instead of sending an empty credential.
g = Github(ACCESS_TOKEN or None)

# Fetch the topics of every repository and append them to the topics CSV
for repo_name in REPO_LIST:
    print(f"begin getting {repo_name}")
    try:
        repo = g.get_repo(repo_name)
        repo_info = {
            "Repository Name": repo_name,
            "Topics": get_topics(repo)
        }
    except Exception as exc:
        # One repository that cannot be read must not abort the whole crawl;
        # report it and move on.
        print(f"failed getting {repo_name}: {exc}")
        continue

    # Wrap every value in a list so the record becomes a one-row DataFrame
    new_data_list = {key: [value] for key, value in repo_info.items()}
    df = pd.DataFrame(new_data_list)
    # The header is written only while the file is still empty: the CSV is
    # later read back and joined on its "Repository Name" column, which needs
    # the column names to be present.
    with open('repos_info_new_topic.csv', 'a', newline='', encoding='utf-8') as csv_file:
        df.to_csv(csv_file, header=csv_file.tell() == 0, index=False)

begin getting hiroi-sora/Umi-OCR
begin getting MostlyAdequate/mostly-adequate-guide
begin getting StreisandEffect/streisand
begin getting youzan/vant
begin getting gfwlist/gfwlist
begin getting electron-react-boilerplate/electron-react-boilerplate
begin getting kovidgoyal/kitty
begin getting gocolly/colly
begin getting responsively-org/responsively-app
begin getting HeroTransitions/Hero
begin getting Swordfish90/cool-retro-term
begin getting opentofu/opentofu
begin getting dylanaraps/neofetch
begin getting pugjs/pug
begin getting dandavison/delta
begin getting julycoding/The-Art-Of-Programming-By-July-2nd
begin getting typicode/lowdb
begin getting ColorlibHQ/gentelella
begin getting duckdb/duckdb
begin getting pubkey/rxdb
begin getting FelisCatus/SwitchyOmega
begin getting emilk/egui
begin getting JohnCoates/Aerial
begin getting ajeetdsouza/zoxide
begin getting petkaantonov/bluebird
begin getting keepassxreboot/keepassxc
begin getting graphql/graphql-js
begin getting Bin-Huang/chatbox


In [10]:
# Base table of repository information
df1 = pd.read_csv('repo_allbut1.csv')
# repos_info_new_topic.csv is filled in append mode, so a re-run can leave
# several rows for one repository. Keep the most recent row per name so the
# join below cannot multiply rows of df1.
df2 = pd.read_csv('repos_info_new_topic.csv').drop_duplicates(
    subset='Repository Name', keep='last'
)
# Left join keeps every row of df1. Both frames have a "Topics" column, so the
# result carries Topics_x (from df1) and Topics_y (from df2).
result_df = pd.merge(df1, df2, on='Repository Name', how='left')
result_df.to_csv('repos_info_merge.csv', index=False)

In [11]:
# Display the merged DataFrame as the cell output
result_df

Unnamed: 0,Repository Name,Description,Code Content,Commit History,README Content,Tags,Topics_x,Topics_y
0,hiroi-sora/Umi-OCR,"OCR software, free and offline. 开源、免费的离线OCR软件。...",File: .gitignore\nFile: CHANGE_LOG.md\nFile: L...,Author: hiroi-sora\nDate: 2024-07-19 08:39:10+...,"<p align=""left"">\n <span>\n <b>中文</b...",Tag: v2.1.3.beta.1\nNo additional tag information,paddleocr,"['paddleocr', 'ocr', 'ocr-python', 'umi-ocr', ..."
1,MostlyAdequate/mostly-adequate-guide,Mostly adequate guide to FP (in javascript),File: .editorconfig\nFile: .generate-summary.p...,Author: KtorZ\nDate: 2023-12-08 10:37:50+00:00...,[![cover](images/cover.png)](SUMMARY.md)\n\n##...,Tag: e5aa315\nNo additional tag information,javascript,"['javascript', 'functional-programming', 'tuto..."
2,StreisandEffect/streisand,Streisand sets up a new server running your ch...,File: .gitignore\nFile: Advanced installation....,Author: Patrick Robertson\nDate: 2020-04-12 01...,"# Streisand\n\n<p align=""center"">\n<img src=""h...",No tags found,vpn,"['vpn', 'ansible', 'openvpn', 'wireguard', 'op..."
3,youzan/vant,"A lightweight, customizable Vue UI library for...",File: .browserslistrc\nFile: .editorconfig\nFi...,Author: inottn\nDate: 2024-07-18 13:56:29+00:0...,"<p align=""center"">\n <img alt=""logo"" src=""h...",Tag: v4.9.2\nNo additional tag information,vue,"['vue', 'mobile', 'components', 'vant', 'vue3'..."
4,gfwlist/gfwlist,The one and only one gfwlist here,File: .gitignore\nFile: COPYING.txt\nFile: REA...,Author: Christopher Meng\nDate: 2024-06-07 03:...,"<a href=""http://info.flagcounter.com/T2RV""><im...",No tags found,china,"['china', 'censorship-circumvention', 'anticen..."
...,...,...,...,...,...,...,...,...
3354,mrmartineau/SublimeTextSetupWiki,Enable people to get started with Sublime Text...,File: README.md\nFile: User/Default (OSX).subl...,Author: Zander Martineau\nDate: 2015-04-13 05:...,# Sublime Text Setup Wiki\n#### This project's...,No tags found,No topics,No topics
3355,exinnet/tclip,智能图片裁剪,File: .gitignore\nFile: README.md\nFile: demo_...,Author: exinnet\nDate: 2017-07-03 13:41:17+00:...,# tclip\r\n\r\n## 名字说明\r\n\r\n* `T`开头，代表是头像识别。...,Tag: 1.0.0\nNo additional tag information,No topics,No topics
3356,exinnet/tclip,智能图片裁剪,File: .gitignore\nFile: README.md\nFile: demo_...,Author: exinnet\nDate: 2017-07-03 13:41:17+00:...,# tclip\r\n\r\n## 名字说明\r\n\r\n* `T`开头，代表是头像识别。...,Tag: 1.0.0\nNo additional tag information,No topics,No topics
3357,euvl/vue-js-popover,:dango: Vue.js 2 library for dropdowns / popov...,File: .babelrc\nFile: .gitignore\nFile: .npmig...,Author: Yev Vlasenko\nDate: 2020-05-10 19:17:0...,[![npm version](https://badge.fury.io/js/vue-j...,Tag: v1.1.8\nNo additional tag information,No topics,No topics
