In [2]:
import json
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from tqdm.auto import tqdm


  from .autonotebook import tqdm as notebook_tqdm


# 函数

In [3]:
def calculate_readme_size(readme):
    if isinstance(readme, str):
        return len(readme)
    else:
        return 0


def calculate_lines_of_codes(languages):
    if isinstance(languages, dict):
        return sum(languages.values())
    return 0


def calculate_popularity_score_1(stars, forks, pull_requests):
    return stars + forks + pull_requests * pull_requests


def calculate_popularity_score_2(watchers, pull_requests):
    return watchers + pull_requests * pull_requests


def calculate_popularity_score_3(stars, forks):
    return stars + forks

def classify_topics(topics):
    if topics is None or topics is [] or len(topics) == 0:
        return "unknown"
    model = SentenceTransformer('all-MiniLM-L6-v2')
    candidate_labels = ["operating system", "networking", "cybersecurity", "software development", "web", "AI", "database", "documentation", "programming language"]
    topic_embeddings = model.encode(topics)
    label_embeddings = model.encode(candidate_labels)

    # 计算 topic 向量与 candidate label 向量的余弦相似度
    similarities = util.pytorch_cos_sim(topic_embeddings, label_embeddings)

    # 找到每个 topic 的最佳匹配类别
    best_labels = [candidate_labels[idx] for idx in similarities.argmax(dim=1)]

    # 选择出现最多的分类作为最终分类
    return max(set(best_labels), key=best_labels.count)


# 获取所有文件名

In [4]:
tqdm.pandas()
file_paths = []
for f_name in os.listdir('data'):
    if f_name.endswith('json'):
        file_paths.append("data/" + f_name)

print(f"Total files detected: {len(file_paths)}")

Total files detected: 185876


# 读取文件并创建Pandas Dataframe

In [5]:
error_count = 0
rows = []

for file_path in file_paths:
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
            pull_requests = data.get("pull_requests", 0)  # 提取 pull_requests

            for url, details in data.items():
                if url != "pull_requests":  # 排除 pull_requests 键
                    row = {"url": url, "name": url.split('/')[-1], "owner": url.split('/')[-2]}
                    row.update(details)
                    row["pull_requests"] = pull_requests  # 添加 pull_requests 到每行数据中
                    rows.append(row)
    except Exception as e:
        error_count += 1

df = pd.DataFrame(rows)
print(f"Total errors: {error_count}")
print(f"Raw data size {len(df)}")

Total errors: 0
Raw data size 185876


# 新增readme_size，lines_of_codes，popularity_score_1(star + fork + pr^2)，popularity_score_2(watcher + pr^2)和popularity_score_3(star + fork)标签

In [6]:
df["readme_size"] = df["readme"].apply(calculate_readme_size)
df["lines_of_codes"] = df["languages"].apply(calculate_lines_of_codes)
df["popularity_score_1"] = df.apply(lambda row: calculate_popularity_score_1(row["stars"], row["forks"], row["pull_requests"]), axis=1)
df["popularity_score_2"] = df.apply(lambda row: calculate_popularity_score_2(row["watchers"], row["pull_requests"]), axis=1)
df["popularity_score_3"] = df.apply(lambda row: calculate_popularity_score_3(row["stars"], row["forks"]), axis=1)
print(f"Column names are: {list(df.columns)}")

Column names are: ['url', 'name', 'owner', 'forks', 'watchers', 'stars', 'languages', 'commits', 'creation_date', 'contributors', 'topics', 'subscribers', 'readme', 'releases', 'pull_requests', 'readme_size', 'lines_of_codes', 'popularity_score_1', 'popularity_score_2', 'popularity_score_3']


# 统计语言种类

In [7]:
languages = []
for repo_languages in df["languages"]:
    if isinstance(repo_languages, dict):  # 确保 repo_languages 是字典类型
        for repo_language in repo_languages:
            if repo_language not in languages:
                languages.append(repo_language)
print(f"There are total {len(languages)} languages")

There are total 541 languages


# 为topic分类

In [8]:
tqdm.pandas()
df.iterrows()[0]

TypeError: 'generator' object is not subscriptable

Unnamed: 0,url,name,owner,forks,watchers,stars,languages,commits,creation_date,contributors,topics,subscribers,readme,releases,pull_requests,readme_size,lines_of_codes,popularity_score_1,popularity_score_2,popularity_score_3
0,https://api.github.com/repos/0-1-0/lightblue-0.4,lightblue-0.4,0-1-0,91,95,95,"{'Python': 415988, 'Objective-C': 169036, 'C':...","[2020-10-18T21:26:07Z, 2015-02-11T14:38:50Z, 2...",2012-09-24T02:58:25Z,4,[],14,LightBlue\r\n\r\nLightBlue is a cross-platform...,[],6,3582,686294,222,131,186
1,https://api.github.com/repos/0-8-4/miui-auto-t...,miui-auto-tasks,0-8-4,180,991,991,"{'Python': 61598, 'Shell': 1087, 'Dockerfile':...","[2025-01-27T05:35:37Z, 2025-01-25T14:24:35Z, 2...",2021-08-16T10:04:15Z,17,"[docker, miui, python, xiaomi]",7,# MIUI Task\n一个适用于 社区 4.0 模拟网络功能请求的脚本\n\n[![99...,"[{'name': 'v1.8.2-hotfix1', 'date': '2025-01-2...",113,3901,63304,13940,13760,1171
2,https://api.github.com/repos/00-Evan/shattered...,shattered-pixel-dungeon,00-Evan,1158,4936,4936,"{'Java': 6219833, 'Shell': 1977}","[2024-10-09T16:48:17Z, 2024-10-08T18:41:17Z, 2...",2014-07-31T21:56:31Z,2,"[android, game, game-development, ios, java, l...",106,# Shattered Pixel Dungeon\n\n[Shattered Pixel ...,"[{'name': 'v3.0.0-BETA-2.1', 'date': '2024-12-...",46,2467,6221810,8210,7052,6094
3,https://api.github.com/repos/0015/ThatProject,ThatProject,0015,947,1921,1921,"{'C': 41821057, 'C++': 1327597, 'Dart': 510279...","[2024-07-10T06:52:50Z, 2024-06-09T07:27:22Z, 2...",2019-07-04T08:16:56Z,2,"[camera, embedded-project, esp32, esp32-cam, e...",129,# [That Project](https://youtube.com/@ThatProj...,[],34,27483,43896590,4024,3077,2868
4,https://api.github.com/repos/007revad/Synology...,Synology_HDD_db,007revad,195,2913,2913,{'Shell': 86699},"[2025-01-23T06:12:05Z, 2025-01-23T06:11:28Z, 2...",2023-02-26T02:59:13Z,11,"[diskstation, dsm, rackstation, synology, syno...",51,"# Synology HDD db\n\n<a href=""https://github.c...","[{'name': 'v3.6.110', 'date': '2024-12-31T10:1...",226,16443,86699,54184,53989,3108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185871,https://api.github.com/repos/zzzprojects/Syste...,System.Linq.Dynamic.Core,zzzprojects,230,1597,1597,"{'C#': 1751766, 'HTML': 38859, 'CSS': 26919, '...","[2025-01-19T07:29:42Z, 2024-12-25T08:10:24Z, 2...",2016-04-08T16:41:51Z,30,"[core, dynamic, efcore, entity-framework, enti...",52,### Library Powered By\n\nThis library is powe...,"[{'name': 'v.1.5.1', 'date': '2024-12-22T10:28...",318,6902,1819025,102951,102721,1827
185872,https://api.github.com/repos/zzzprojects/Syste...,System.Linq.Dynamic,zzzprojects,194,424,424,{},"[2024-03-20T20:53:54Z, 2021-04-06T14:35:00Z, 2...",2011-12-12T19:01:13Z,11,"[efcore, entity-framework-core]",57,### Dynamic LINQ is Powered By\n\nThis library...,[],41,2342,0,2299,2105,618
185873,https://api.github.com/repos/zzzprojects/Z.Ext...,Z.ExtensionMethods,zzzprojects,327,1623,1623,"{'C#': 2614721, 'Visual Basic .NET': 1411898}","[2024-03-20T20:50:19Z, 2023-07-05T14:46:07Z, 2...",2015-03-03T14:49:23Z,3,"[csharp, dotnet, dotnet-core, extension-method...",108,## Library Powered By\n\nThis library is power...,"[{'name': 'v2.1.1', 'date': '2019-01-12T18:26:...",15,3996,4026619,2175,1848,1950
185874,https://api.github.com/repos/zzzzbw/Fame,Fame,zzzzbw,72,201,201,"{'Java': 215298, 'Vue': 121055, 'JavaScript': ...","[2022-12-20T14:34:51Z, 2022-12-14T12:38:56Z, 2...",2017-08-07T13:44:22Z,3,"[docker, docker-compose, fame, java, maven, my...",14,"<p align=""center"">\n <img align=""center"" src=...","[{'name': '', 'date': '2022-12-20T10:31:45Z'},...",14,2431,391989,469,397,273
