In [None]:
!pip install keras numpy pandas tensorflow transformers scikit-learn



In [None]:
import json

import keras
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import transformers
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

In [None]:
# Load the pretrained CodeBERT tokenizer and encoder from the same checkpoint.
CODEBERT_CHECKPOINT = "microsoft/codebert-base"
codebert_tokenizer = transformers.AutoTokenizer.from_pretrained(CODEBERT_CHECKPOINT)
codebert_model = transformers.AutoModel.from_pretrained(CODEBERT_CHECKPOINT)

In [None]:
# Helper that turns source code into a CodeBERT embedding.
def create_embedding(code):
    """Embed source code with CodeBERT by mean-pooling the last hidden states.

    Args:
        code: Source text passed to ``codebert_tokenizer``. Input longer than
            the tokenizer's limit is truncated (``truncation=True``).

    Returns:
        A NumPy array containing the mean of ``last_hidden_state`` over
        axis 1, i.e. one pooled vector per input sequence.
        NOTE(review): for a single string this is presumably shape (1, 768);
        confirm against the cells that consume the embedding.
    """
    inputs = codebert_tokenizer(
        code, return_tensors="pt", truncation=True, padding=True
    )
    # Inference only: disable autograd so no computation graph is built or
    # kept alive for each embedded row.
    with torch.no_grad():
        outputs = codebert_model(**inputs)
    # NOTE(review): the mean runs over every position, so if a batch of
    # several strings is ever passed, padding positions would be averaged in.
    embedding = outputs.last_hidden_state.mean(axis=1).detach().numpy()
    return embedding

In [None]:
# Path to the JSON dataset file.
json_file = "/content/dataset.rust.json"

# Read the dataset from the JSON file.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Build a DataFrame from the loaded records.
df = pd.DataFrame(data)

In [None]:
# Run create_embedding on every entry of the "func" column and store the
# result in a new "embedding" column.
source_snippets = df["func"]
df["embedding"] = source_snippets.apply(create_embedding)

In [None]:
# Split the data into a training set and a test set.
# shuffle=False keeps the original row order, so the last 20% of the rows
# form the test set.
# NOTE(review): confirm the dataset is not ordered by label; otherwise the
# two sets would have different class balances.
X_train, X_test, y_train, y_test = train_test_split(
    df["embedding"].values, df["target"].values, test_size=0.2, shuffle=False
)

# Convert to dense NumPy arrays: each stored embedding is indexed at [0] to
# drop its leading axis, leaving one feature vector per sample.
X_train = np.array([x[0] for x in X_train])
X_test = np.array([x[0] for x in X_test])

EMBEDDING_DIM = 768  # CodeBERT embedding size
assert X_train.ndim == 2 and X_train.shape[1] == EMBEDDING_DIM, (
    f"expected training features of shape (n, {EMBEDDING_DIM}), "
    f"got {X_train.shape}"
)

# Seed the Python, NumPy and backend random generators so weight
# initialisation and batch shuffling are repeatable across fresh runs.
SEED = 42
keras.utils.set_random_seed(SEED)

# Build the LSTM model.
model = keras.Sequential(
    [
        keras.layers.Input(shape=(EMBEDDING_DIM,)),
        # Reshape to a length-1 sequence so the vector fits the LSTM input.
        keras.layers.Reshape((1, EMBEDDING_DIM)),
        keras.layers.LSTM(128),
        keras.layers.Dense(1, activation="sigmoid"),  # Binary classification
    ]
)

# Compile the model.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Train the model.
# NOTE: fit() continues from the model's current weights, so re-running this
# cell without first re-running the model-building cell resumes training
# instead of starting over.
# The returned History object is kept so the per-epoch metrics stay available
# and its repr is not echoed as the cell output.
history = model.fit(X_train, y_train, epochs=100, batch_size=8)

Epoch 1/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0133
Epoch 2/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0120
Epoch 3/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0085
Epoch 4/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 1.0000 - loss: 0.0136
Epoch 5/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0087
Epoch 6/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0157
Epoch 7/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9653 - loss: 0.0601
Epoch 8/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8635 - loss: 0.3175
Epoch 9/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x79f06dff6cb0>

In [None]:
# Evaluate the model on the test set.
# evaluate() returns the loss followed by the compiled accuracy metric; only
# the loss is kept here.
loss, _ = model.evaluate(x=X_test, y=y_test, verbose=0)
# Raw model outputs: predicted probabilities.
y_pred_prob = model.predict(x=X_test)
# Threshold the probabilities at 0.5 to obtain binary 0/1 predictions.
y_pred = np.where(y_pred_prob > 0.5, 1, 0)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 


In [None]:
# Metrics computed from the thresholded 0/1 predictions.
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# Metrics computed from the predicted probabilities.
roc_auc = roc_auc_score(y_test, y_pred_prob)
pr_auc = average_precision_score(y_test, y_pred_prob)

# Print every metric, one per line, as "<label> <value>".
report = [
    ("Loss:", loss),
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
    ("ROC AUC:", roc_auc),
    ("Precision-Recall AUC:", pr_auc),
    ("MCC:", mcc),
    ("Error Rate:", error_rate),
]
for label, value in report:
    print(label, value)


Loss: 1.3984566926956177
Accuracy: 0.8035714285714286
Precision: 0.8076923076923077
Recall: 0.7777777777777778
F1-score: 0.7924528301886793
ROC AUC: 0.8837803320561941
Precision-Recall AUC: 0.8284139987006222
MCC: 0.606527028944757
Error Rate: 0.1964285714285714
