In [25]:
from tqdm import tqdm
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from langdetect import detect
from deep_translator import GoogleTranslator
import torch
import ast

In [3]:
df = pd.read_csv('C:/Users/29400/Desktop/data/original_data.csv')
df["topics"] = df["topics"].apply(ast.literal_eval)
model = SentenceTransformer("all-MiniLM-L6-v2")

In [19]:
tqdm.pandas()
predefined_topics = [
    "artificial-intelligence",
    "computer-vision",
    "web-development",
    "cybersecurity",
    "cloud-computing",
    "blockchain",
    "data-science",
    "education",
    "system"
]
predefined_embeddings = model.encode(predefined_topics, convert_to_tensor=True)
def match_topic(repo_topics):
    if not repo_topics:  # 如果 topic 为空，返回 "Unknown"
        return "Unknown"

    # 计算所有 `repo_topics` 的 embedding 并取平均值
    repo_embeddings = model.encode(repo_topics, convert_to_tensor=True)
    avg_repo_embedding = torch.mean(repo_embeddings, dim=0, keepdim=True)  # 计算平均 embedding

    # 计算 repo embedding 与 predefined topics 之间的相似度
    similarities = util.pytorch_cos_sim(avg_repo_embedding, predefined_embeddings)

    # 选择相似度最高的 predefined topic
    best_match_idx = torch.argmax(similarities)
    best_match = predefined_topics[best_match_idx]

    # print(f"{repo_topics} -> {best_match} (Similarity: {similarities[0][best_match_idx]:.4f})")
    return best_match

df["classified_topic"] = df["topics"].progress_apply(match_topic)

100%|██████████| 185876/185876 [07:06<00:00, 435.32it/s]


In [28]:
# 替换模型为多语言 BERT
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
def match_topic_with_readme(row):
    repo_topics = row["topics"]
    repo_readme = row["readme"]

    # 处理空值，确保 readme 不是 NaN
    if pd.isna(repo_readme):
        repo_readme = ""

    if not repo_topics and not repo_readme:
        return "Unknown"

    embeddings = []

    # 计算 repo_topics embedding
    if repo_topics:
        topic_embeddings = model.encode(repo_topics, convert_to_tensor=True)
        avg_topic_embedding = torch.mean(topic_embeddings, dim=0, keepdim=True)
        embeddings.append(avg_topic_embedding)

    # 计算 readme embedding（确保是字符串）
    if isinstance(repo_readme, str) and repo_readme.strip():
        readme_embedding = model.encode(repo_readme, convert_to_tensor=True).unsqueeze(0)
        embeddings.append(readme_embedding)

    # 计算整体 embedding（如果 topics 和 readme 都有，则取平均）
    if embeddings:
        avg_embedding = torch.mean(torch.cat(embeddings), dim=0, keepdim=True)
    else:
        return "Unknown"

    # 计算与 predefined_topics 的相似度
    similarities = util.pytorch_cos_sim(avg_embedding, predefined_embeddings)

    # 选择相似度最高的 topic
    best_match_idx = torch.argmax(similarities)
    best_match = predefined_topics[best_match_idx]

    return best_match

df["classified_topic_readme"] = df.progress_apply(match_topic_with_readme, axis=1)

100%|██████████| 185876/185876 [44:14<00:00, 70.01it/s]  


In [32]:
category = {"artificial-intelligence": 0, "computer-vision": 0, "web-development": 0, "cybersecurity": 0, "system": 0, "cloud-computing": 0, "blockchain": 0, "data-science": 0, "education": 0, "Unknown": 0}
for _, row in df.iterrows():
    classified_topic = row["classified_topic"]
    category[classified_topic] += 1
print(category)

{'artificial-intelligence': 13712, 'computer-vision': 10345, 'web-development': 8621, 'cybersecurity': 6872, 'system': 30351, 'cloud-computing': 5135, 'blockchain': 5479, 'data-science': 6964, 'education': 2690, 'Unknown': 95707}


In [33]:
category = {"artificial-intelligence": 0, "computer-vision": 0, "web-development": 0, "cybersecurity": 0, "system": 0, "cloud-computing": 0, "blockchain": 0, "data-science": 0, "education": 0, "Unknown": 0}
for _, row in df.iterrows():
    classified_topic = row["classified_topic_readme"]
    category[classified_topic] += 1
print(category)

{'artificial-intelligence': 42229, 'computer-vision': 21268, 'web-development': 48254, 'cybersecurity': 16122, 'system': 19400, 'cloud-computing': 4823, 'blockchain': 8112, 'data-science': 8990, 'education': 14698, 'Unknown': 1980}


In [31]:
df.to_csv("C:/Users/29400/Desktop/data/classified_data.csv", index=False)

In [34]:
df = pd.read_csv("C:/Users/29400/Desktop/data/classified_data.csv")

In [39]:

for _, row in df.iterrows():
    if row['classified_topic'] == "artificial-intelligence":
        print(row["topics"])


['captcha-solving', 'gan', 'generative-adversarial-network', 'keras', 'neural-network', 'simgan']
['python', 'reverse-engineering']
['discord', 'discord-rpc', 'rich-presence', 'wine']
['random', 'thrift-store']
['api-hooking', 'cpp', 'pentesting-windows', 'redteaming']
['cnn', 'deep-learning', 'handwriting-recognition', 'machine-learning', 'rnn', 'tensorflow']
['bot', 'bot-api', 'bot-framework', 'nest', 'nestjs', 'nodejs', 'telegraf', 'telegram', 'telegram-bot', 'telegram-bot-api', 'typescript']
['android', 'frida', 'ios', 'mobile', 'reverse-engineering']
['active-directory', 'brute-force', 'information-gathering', 'metasploit-framework', 'penetration-testing']
['amm', 'awesome-list']
['awesome-list', 'derivatives']
['enumeration', 'persistence', 'privelage-escalation']
['artificial-neural-networks', 'blas', 'c', 'c99', 'cblas', 'classification', 'continuous-integration', 'efficient', 'embedded', 'feedforward-neural-network', 'header-only', 'machine-learning', 'matrix', 'neural-network