# 步骤一：读取爬虫得到的数据

In [124]:
import pandas as pd
data_path = "repos_info_merge.csv"

data_file = pd.read_csv(data_path)

# 步骤二：数据预处理

In [125]:
import ast

#读取到的topic是列表形式的数据，将“No topics”的仓库数据改成空列表
data_file['Topics_y'] = data_file['Topics_y'].apply(lambda x: [] if x == 'No topics' else x)

def safe_literal_eval(topic_entry):
    try:
        return ast.literal_eval(topic_entry)
    except:
        return topic_entry  

data_file['Topics_y'] = data_file['Topics_y'].apply(safe_literal_eval)

data_file['Topics_y'].apply(type).value_counts(), data_file.head()
print(data_file)


                           Repository Name  \
0                       hiroi-sora/Umi-OCR   
1     MostlyAdequate/mostly-adequate-guide   
2                StreisandEffect/streisand   
3                              youzan/vant   
4                          gfwlist/gfwlist   
...                                    ...   
3354      mrmartineau/SublimeTextSetupWiki   
3355                         exinnet/tclip   
3356                         exinnet/tclip   
3357                   euvl/vue-js-popover   
3358            BastiaanJansen/toast-swift   

                                            Description  \
0     OCR software, free and offline. 开源、免费的离线OCR软件。...   
1           Mostly adequate guide to FP (in javascript)   
2     Streisand sets up a new server running your ch...   
3     A lightweight, customizable Vue UI library for...   
4                     The one and only one gfwlist here   
...                                                 ...   
3354  Enable people to get started

In [126]:
#统计数据中，不同 topic 出现的频率
all_topics_safe_parse = set(topic for sublist in data_file['Topics_y'] for topic in sublist)

topic_counts_safe_parse = {topic: 0 for topic in all_topics_safe_parse}
for topic_list in data_file['Topics_y']:
    for topic in topic_list:
        topic_counts_safe_parse[topic] += 1

sorted_topics_safe_parse = sorted(topic_counts_safe_parse.items(), key=lambda x: x[1], reverse=True)

print(sorted_topics_safe_parse)


[('javascript', 100), ('python', 81), ('react', 70), ('android', 69), ('golang', 46), ('nodejs', 44), ('ios', 44), ('hacktoberfest', 41), ('deep-learning', 41), ('go', 39), ('linux', 37), ('typescript', 36), ('swift', 35), ('java', 34), ('php', 33), ('rust', 32), ('vue', 31), ('security', 29), ('awesome', 27), ('macos', 25), ('pytorch', 25), ('machine-learning', 24), ('react-native', 22), ('database', 22), ('redux', 22), ('ruby', 21), ('json', 21), ('cli', 20), ('awesome-list', 20), ('tensorflow', 19), ('webpack', 19), ('animation', 19), ('android-library', 18), ('css', 17), ('windows', 16), ('c', 16), ('library', 16), ('angular', 15), ('api', 15), ('computer-vision', 15), ('video', 14), ('npm', 14), ('cpp', 14), ('frontend', 14), ('cross-platform', 14), ('graphql', 14), ('vuejs', 13), ('reactjs', 13), ('nlp', 13), ('visualization', 13), ('docker', 13), ('tutorial', 13), ('unity', 12), ('rpc', 12), ('ssh', 12), ('electron', 12), ('html', 12), ('rails', 12), ('git', 12), ('devops', 11),

In [127]:
# 定义一个无意义主题的列表，排除这个列表中出现的主题，主要是编程语言
notImportant_topics = ['javascript', 'python', 'react', 'android', 'golang', 'nodejs', 'ios', 'go', 'typescript', 'swift', 'java', 'php', 'rust','hacktoberfest','awesome','awesome-list','json','css','c']

filtered_topics = [topic for topic, count in sorted_topics_safe_parse if topic not in notImportant_topics]

# 取前20个出现频率最高的主题，作为之后分类的类别
selected_topics_extended = filtered_topics[:20]
selected_topics_final =  selected_topics_extended

# 打印查看这20个主题
selected_topics_final

['deep-learning',
 'linux',
 'vue',
 'security',
 'macos',
 'pytorch',
 'machine-learning',
 'react-native',
 'database',
 'redux',
 'ruby',
 'cli',
 'tensorflow',
 'webpack',
 'animation',
 'android-library',
 'windows',
 'library',
 'angular',
 'api']

In [128]:
import numpy as np

# 创建一个topic矩阵，将topic列表数值化，属于上述对应类别的赋值1
topic_matrix = np.zeros((len(data_file), len(selected_topics_final)), dtype=int)

for i, topics in enumerate(data_file['Topics_y']):
    for topic in topics:
        if topic in selected_topics_final:
            topic_matrix[i, selected_topics_final.index(topic)] = 1

topic_matrix[:10]  # 展示处理后的结果


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]])

In [129]:
# 创建一个标签列表，将上述矩阵中的数值结果进一步转化
labels = []

# 保存 topic 的类别编号
for i, row in enumerate(topic_matrix):
    label_list = []
    for topic_index, topic_value in enumerate(row):
        if topic_value == 1:
            label_list.append(topic_index)
    labels.append(label_list)


#如果不属于这20个类别中的任一类别，赋值20，代表其他类别
for index, sublist in enumerate(labels):
    if not sublist:
        labels[index] = [20]

labels[:10]



[[20], [20], [20], [2], [20], [13], [20], [20], [9], [14]]

In [130]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# 转换成多标签分类形式
mlb = MultiLabelBinarizer()
multilabel_data = mlb.fit_transform(labels)


# 分割训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(topic_matrix, multilabel_data, test_size=0.2, random_state=42)

multilabel_data.shape, X_train.shape, X_test.shape, y_train.shape


((3359, 21), (2687, 20), (672, 20), (2687, 21))

In [131]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(topic_matrix, multilabel_data, test_size=0.2, random_state=42)

# 标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled.shape, X_test_scaled.shape,y_train.shape

# 保存数据
np.savetxt('data/X_train_scaled.csv', X_train_scaled, delimiter=',')
np.savetxt('data/X_test_scaled.csv', X_test_scaled, delimiter=',')
np.savetxt('data/y_train.csv', y_train, delimiter=',')
np.savetxt('data/y_test.csv', y_test, delimiter=',')


# 步骤三：使用 MLP 训练

In [132]:
#MLP
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 初始化一个3层的 mlp 模型
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=1000, random_state=42)

mlp_classifier.fit(X_train_scaled, y_train)

y_pred = mlp_classifier.predict(X_test_scaled)

# 计算混淆矩阵，生成分类报告
conf_matrix = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))
class_report = classification_report(y_test, y_pred)

conf_matrix, class_report
print(class_report)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         6
           4       1.00      1.00      1.00         6
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00         2
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         9
          10       1.00      1.00      1.00         4
          11       1.00      1.00      1.00         5
          12       1.00      1.00      1.00         1
          13       1.00      0.67      0.80         3
          14       1.00      1.00      1.00         2
          15       1.00      1.00      1.00         1
          16       1.00      1.00      1.00         7
          17       1.00    

# 步骤四：使用逻辑回归模型训练

In [133]:
#LR
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.multioutput import MultiOutputClassifier

multi_label_model = MultiOutputClassifier(LogisticRegression())

multi_label_model.fit(X_train_scaled, y_train)

y_pred_log_reg = multi_label_model.predict(X_test_scaled)


conf_matrix_log_reg = confusion_matrix(y_test.argmax(axis=1), y_pred_log_reg.argmax(axis=1))
class_report_log_reg = classification_report(y_test, y_pred_log_reg)

conf_matrix_log_reg, class_report_log_reg
print(class_report_log_reg)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         6
           4       1.00      1.00      1.00         6
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00         2
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         9
          10       1.00      1.00      1.00         4
          11       1.00      1.00      1.00         5
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         3
          14       1.00      1.00      1.00         2
          15       1.00      1.00      1.00         1
          16       1.00      1.00      1.00         7
          17       1.00    

# 步骤五：使用 SVM 模型训练

In [134]:
#svm
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.multioutput import MultiOutputClassifier

svm_model = SVC(kernel='linear')  

multi_label_svm = MultiOutputClassifier(svm_model)

multi_label_svm.fit(X_train_scaled, y_train)

y_pred_svm = multi_label_svm.predict(X_test_scaled)


conf_matrix_svm = confusion_matrix(y_test.argmax(axis=1), y_pred_svm.argmax(axis=1))
class_report_svm = classification_report(y_test, y_pred_svm)

conf_matrix_svm, class_report_svm
print(class_report_svm)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         6
           4       1.00      1.00      1.00         6
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00         2
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         9
          10       1.00      1.00      1.00         4
          11       1.00      1.00      1.00         5
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         3
          14       1.00      1.00      1.00         2
          15       1.00      1.00      1.00         1
          16       1.00      1.00      1.00         7
          17       1.00    

# 步骤六： 使用随机森林训练

In [135]:
#RF
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.multioutput import MultiOutputClassifier

rf_model = RandomForestClassifier(n_estimators=100)  

multi_label_rf = MultiOutputClassifier(rf_model)

multi_label_rf.fit(X_train_scaled, y_train)

y_pred_rf = multi_label_rf.predict(X_test_scaled)

conf_matrix_rf = confusion_matrix(y_test.argmax(axis=1), y_pred_rf.argmax(axis=1))
class_report_rf = classification_report(y_test, y_pred_rf)

conf_matrix_rf, class_report_rf
print(class_report_rf)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         6
           4       1.00      1.00      1.00         6
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00         2
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         9
          10       1.00      1.00      1.00         4
          11       1.00      1.00      1.00         5
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         3
          14       1.00      1.00      1.00         2
          15       1.00      1.00      1.00         1
          16       1.00      1.00      1.00         7
          17       1.00    

# 步骤七：使用 lstm 模型训练

In [142]:
#lstm
import torch
from sklearn.metrics import classification_report


# 将数据转换成张量
X_train_tensor = torch.tensor(X_train_scaled).float()
X_test_tensor = torch.tensor(X_test_scaled).float()

y_train_tensor = torch.tensor(y_train).float()
y_test_tensor = torch.tensor(y_test).float()

y_train_class_indices = y_train_tensor.argmax(axis=1)
y_test_class_indices = y_test_tensor.argmax(axis=1)

unique_classes = y_train_class_indices.unique()


input_dim = X_train_tensor.shape[1]  
hidden_dim = 128  
layer_dim = 1  
output_dim = len(unique_classes)  
batch_size = 32  
dropout_prob = 0.2  

# 定义 dataloader
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_class_indices)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_class_indices)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

class LSTMModel(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        
        self.lstm = torch.nn.LSTM(input_dim, hidden_dim, layer_dim,
                                  batch_first=True, dropout=dropout_prob)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()
        
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()
        
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        
        out = self.fc(out[:, -1, :]) 
        return out

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim, dropout_prob)


def train_model_3d(model, train_loader, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        for inputs, targets in train_loader:
            inputs = inputs.view(-1, 1, inputs.shape[1])
            
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")


# 定义 loss 函数和学习率
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 模型训练
train_model_3d(model, train_loader, optimizer)


with torch.no_grad():
    model.eval()
    y_pred = model(X_test_tensor.view(-1, 1, X_test_tensor.shape[1]))
    y_pred_class_indices = y_pred.argmax(axis=1)

# 在测试集上计算准确率
accuracy = (y_pred_class_indices == y_test_class_indices).sum().item() / len(y_test_class_indices)

print(accuracy)


Epoch 1/10, Loss: 1.064189076423645
Epoch 2/10, Loss: 0.2998838722705841
Epoch 3/10, Loss: 0.03685936704277992
Epoch 4/10, Loss: 0.03438034653663635
Epoch 5/10, Loss: 0.015675978735089302
Epoch 6/10, Loss: 0.012775944545865059
Epoch 7/10, Loss: 0.0063379607163369656
Epoch 8/10, Loss: 0.003052197629585862
Epoch 9/10, Loss: 0.013210666365921497
Epoch 10/10, Loss: 0.003683245973661542
0.9985119047619048
