In [1]:
from joblib import dump
from datasets import Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
seed=64

## 0. 原始数据

In [3]:
train_data = Dataset.load_from_disk("/Users/xiaoen/Documents/科研/论文/GraphRAG/Code/LinkPrediction/TrainData")
test_data = Dataset.load_from_disk("/Users/xiaoen/Documents/科研/论文/GraphRAG/Code/LinkPrediction/TestData")

In [4]:
train_x = train_data['feature']
train_y = [y for y in train_data['label']]

In [5]:
test_x = test_data['feature']
test_y = [[y] for y in test_data['label']]

In [6]:
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

In [7]:
dump(scaler, 'model/scaler.joblib')

['model/scaler.joblib']

## 1. 准备模型

In [8]:
svm_model = SVC(kernel='linear', random_state=seed, probability=True, max_iter=100)

In [9]:
dt_model = DecisionTreeClassifier(random_state=seed)

In [10]:
rf_model = RandomForestClassifier(n_estimators=50, random_state=seed, n_jobs=-1)

In [11]:
gb_model = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=seed)

In [12]:
lr_model = LogisticRegression(random_state=seed, max_iter=100)

In [13]:
knn_model = KNeighborsClassifier(n_neighbors=2, n_jobs=-1)

In [14]:
nb_model = GaussianNB()

In [15]:
xgb_model = XGBClassifier(random_state=seed, n_jobs=-1)

In [16]:
lgb_model = LGBMClassifier(random_state=seed)

## 2. 训练模型
### 2.0 计算指标

In [17]:
def get_metrics(out, label):
    index = sorted(range(len(out)), key=lambda k: out[k], reverse=True)[:int(len(out)/2)]
    pred = [0]*len(out)
    for i in index:
        pred[i] = 1
    return accuracy_score(label, pred), roc_auc_score(label, out)

### 2.1 SVM

In [18]:
svm_model.fit(train_x, train_y)



In [19]:
y_pred = svm_model.predict(test_x)
print(classification_report(test_y, y_pred, digits=8))

              precision    recall  f1-score   support

           0  0.06557377 0.00002240 0.00004479    357074
           1  0.49992577 0.99968074 0.66652974    357074

    accuracy                      0.49985157    714148
   macro avg  0.28274977 0.49985157 0.33328726    714148
weighted avg  0.28274977 0.49985157 0.33328726    714148


In [20]:
y_pred = svm_model.predict_proba(test_x)[:,0]
print(get_metrics(y_pred, test_y))

(0.8034132980838705, 0.8258111509457647)


In [21]:
dump(svm_model, 'model/svm_model.joblib')

['model/svm_model.joblib']

### 2.2 Decision Tree

In [22]:
dt_model.fit(train_x, train_y)

In [23]:
y_pred = dt_model.predict(test_x)
print(classification_report(test_y, y_pred, digits=8))

              precision    recall  f1-score   support

           0  0.76154132 0.90145460 0.82561228    357074
           1  0.87927444 0.71773078 0.79033221    357074

    accuracy                      0.80959269    714148
   macro avg  0.82040788 0.80959269 0.80797224    714148
weighted avg  0.82040788 0.80959269 0.80797224    714148


In [24]:
y_pred = dt_model.predict_proba(test_x)[:,1]
print(get_metrics(y_pred, test_y))

(0.8522939222682133, 0.8327478849627905)


In [25]:
dump(dt_model, 'model/dt_model.joblib')

['model/dt_model.joblib']

### 2.3 Random Forest

In [26]:
rf_model.fit(train_x, train_y)

In [27]:
y_pred = rf_model.predict(test_x)
print(classification_report(test_y, y_pred, digits=8))

              precision    recall  f1-score   support

           0  0.79362414 0.92111999 0.85263223    357074
           1  0.90602249 0.76046982 0.82688981    357074

    accuracy                      0.84079491    714148
   macro avg  0.84982331 0.84079491 0.83976102    714148
weighted avg  0.84982331 0.84079491 0.83976102    714148


In [28]:
y_pred = rf_model.predict_proba(test_x)[:,1]
print(get_metrics(y_pred, test_y))

(0.8743229694685135, 0.915705506264997)


In [29]:
dump(rf_model, 'model/rf_model.joblib')

['model/rf_model.joblib']

### 2.4 Gradient Boosting

In [30]:
gb_model.fit(train_x, train_y)

In [31]:
y_pred = gb_model.predict(test_x)
print(classification_report(test_y, y_pred, digits=8))

              precision    recall  f1-score   support

           0  0.80332460 0.91895237 0.85725706    357074
           1  0.90532521 0.77501582 0.83511781    357074

    accuracy                      0.84698410    714148
   macro avg  0.85432491 0.84698410 0.84618744    714148
weighted avg  0.85432491 0.84698410 0.84618744    714148


In [32]:
y_pred = gb_model.predict_proba(test_x)[:,1]
print(get_metrics(y_pred, test_y))

(0.8782801324095285, 0.9227320626709974)


In [33]:
dump(gb_model, 'model/gb_model.joblib')

['model/gb_model.joblib']

### 2.5 Logistic Regression

In [34]:
lr_model.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
y_pred = lr_model.predict(test_x)
print(classification_report(test_y, y_pred, digits=8))

              precision    recall  f1-score   support

           0  0.78652804 0.92670147 0.85088038    357074
           1  0.91080539 0.74848351 0.82170478    357074

    accuracy                      0.83759249    714148
   macro avg  0.84866672 0.83759249 0.83629258    714148
weighted avg  0.84866672 0.83759249 0.83629258    714148


In [36]:
y_pred = lr_model.predict_proba(test_x)[:,1]
print(get_metrics(y_pred, test_y))

(0.8101149901701048, 0.9007084525058959)


In [37]:
dump(lr_model, 'model/lr_model.joblib')

['model/lr_model.joblib']

### 2.6 KNN

In [38]:
knn_model.fit(train_x, train_y)

In [39]:
y_pred = knn_model.predict(test_x)
print(classification_report(test_y, y_pred, digits=8))

              precision    recall  f1-score   support

           0  0.75784736 0.49940068 0.60206018    357074
           1  0.62670462 0.84042803 0.71799933    357074

    accuracy                      0.66991436    714148
   macro avg  0.69227599 0.66991436 0.66002975    714148
weighted avg  0.69227599 0.66991436 0.66002975    714148


In [40]:
y_pred = knn_model.predict_proba(test_x)[:,1]
print(get_metrics(y_pred, test_y))

(0.8404280345250564, 0.6866151013628988)


In [41]:
dump(knn_model, 'model/knn_model.joblib')

['model/knn_model.joblib']

### 2.7 Naive Bayes

In [42]:
nb_model.fit(train_x, train_y)

In [43]:
y_pred = nb_model.predict(test_x)
print(classification_report(test_y, y_pred, digits=8))

              precision    recall  f1-score   support

           0  0.74106616 0.95948179 0.83624750    357074
           1  0.94254923 0.66475016 0.77964287    357074

    accuracy                      0.81211598    714148
   macro avg  0.84180769 0.81211598 0.80794518    714148
weighted avg  0.84180769 0.81211598 0.80794518    714148


In [44]:
y_pred = nb_model.predict_proba(test_x)[:,1]
print(get_metrics(y_pred, test_y))

(0.8012764861065214, 0.887270723245158)


In [45]:
dump(nb_model, 'model/nb_model.joblib')

['model/nb_model.joblib']

### 2.8 XGBoost

In [46]:
xgb_model.fit(train_x, train_y)

In [47]:
y_pred = xgb_model.predict(test_x)
print(classification_report(test_y, y_pred, digits=8))

              precision    recall  f1-score   support

           0  0.79960737 0.92850782 0.85925024    357074
           1  0.91476802 0.76730314 0.83457156    357074

    accuracy                      0.84790548    714148
   macro avg  0.85718769 0.84790548 0.84691090    714148
weighted avg  0.85718769 0.84790548 0.84691090    714148


In [48]:
y_pred = xgb_model.predict_proba(test_x)[:,1]
print(get_metrics(y_pred, test_y))

(0.8940387706749862, 0.924169672931195)


In [49]:
dump(xgb_model, 'model/xgb_model.joblib')

['model/xgb_model.joblib']

### 2.9 LightGBM

In [50]:
lgb_model.fit(train_x, train_y)

[LightGBM] [Info] Number of positive: 366648, number of negative: 366648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 733296, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [51]:
y_pred = lgb_model.predict(test_x)
print(classification_report(test_y, y_pred, digits=8))

              precision    recall  f1-score   support

           0  0.80135814 0.92668747 0.85947794    357074
           1  0.91309602 0.77029131 0.83563648    357074

    accuracy                      0.84848939    714148
   macro avg  0.85722708 0.84848939 0.84755721    714148
weighted avg  0.85722708 0.84848939 0.84755721    714148


In [52]:
y_pred = lgb_model.predict_proba(test_x)[:,1]
print(get_metrics(y_pred, test_y))

(0.8910645972543506, 0.924807254463814)


In [53]:
dump(lgb_model, 'model/lgb_model.joblib')

['model/lgb_model.joblib']