In [None]:
!pip install keras numpy pandas tensorflow transformers scikit-learn gensim



In [None]:
import json

import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import transformers
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

In [None]:
# Load the pretrained CodeBERT tokenizer and encoder from the same checkpoint
codebert_tokenizer = transformers.AutoTokenizer.from_pretrained("microsoft/codebert-base")
codebert_model = transformers.AutoModel.from_pretrained(
    "microsoft/codebert-base"
)

In [None]:
# Helper that turns source code into a CodeBERT embedding
def create_embedding(code):
    """Return a mean-pooled CodeBERT embedding for ``code``.

    The input is tokenized with truncation and padding enabled, passed through
    ``codebert_model``, and the last hidden states are averaged over the token
    axis. Positions whose attention mask is 0 (padding) are left out of the
    average, so inputs of different lengths in one batch are pooled correctly.
    When no position is masked the result equals the plain mean over all
    tokens.

    Args:
        code: Source text to embed; it is handed to ``codebert_tokenizer``
            unchanged.

    Returns:
        A NumPy array with one pooled row per tokenized sequence, i.e. shape
        ``(batch, hidden)``.
    """
    inputs = codebert_tokenizer(
        code, return_tensors="pt", truncation=True, padding=True
    )
    # Inference only: disable autograd so no computation graph is built and
    # kept alive for every embedded snippet.
    with torch.no_grad():
        outputs = codebert_model(**inputs)
    hidden = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension and match the float dtype.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    # clamp guards against division by zero for a fully masked row.
    token_count = mask.sum(dim=1).clamp(min=1)
    embedding = (token_sum / token_count).detach().cpu().numpy()
    return embedding

In [None]:
# Path to the JSON dataset
json_file = "/content/dataset.rust.json"

# Read the dataset. The encoding is given explicitly so that decoding does not
# depend on the platform's default locale.
with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Build a DataFrame from the loaded data
df = pd.DataFrame(data)

# Fail early, with a clear message, if the columns this notebook reads
# ("func" for the source text, "target" for the label) are not present.
missing_columns = {"func", "target"} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"{json_file} is missing required columns: {sorted(missing_columns)}"
    )

In [None]:
# Run create_embedding on every entry of the "func" column and store the
# results in a new "embedding" column
func_embeddings = df["func"].apply(create_embedding)
df["embedding"] = func_embeddings

In [None]:
# Split into training and test sets: 20% of the rows are held out, and because
# shuffling is disabled the original row order is kept
X_train, X_test, y_train, y_test = train_test_split(
    df["embedding"].values,
    df["target"].values,
    shuffle=False,
    test_size=0.2,
)

# Take element [0] of every stored embedding and gather them into NumPy arrays
X_test = np.array([embedding[0] for embedding in X_test])
X_train = np.array([embedding[0] for embedding in X_train])

# Logistic Regression classifier capped at 100 solver iterations
# (raise max_iter if the solver does not converge)
model = LogisticRegression(max_iter=100)

In [None]:
# Fit the logistic-regression classifier on the training features and labels
model.fit(X_train, y_train)

In [None]:
# Score the held-out set: keep column 1 of the probability matrix
# (probabilities for class 1) alongside the hard label predictions
class_probabilities = model.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]
y_pred = model.predict(X_test)

In [None]:
# Report how many solver iterations the fitted model recorded
iterations_taken = model.n_iter_
print("Iterations taken:", iterations_taken)

# Metrics derived from the hard label predictions
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# Metrics derived from the predicted class-1 probabilities
roc_auc = roc_auc_score(y_test, y_pred_prob)
pr_auc = average_precision_score(y_test, y_pred_prob)

# Print every metric, one per line
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
    ("ROC AUC:", roc_auc),
    ("Precision-Recall AUC:", pr_auc),
    ("MCC:", mcc),
    ("Error Rate:", error_rate),
):
    print(label, value)


Iterations taken: [69]
Accuracy: 0.8392857142857143
Precision: 0.8214285714285714
Recall: 0.8518518518518519
F1-score: 0.8363636363636363
ROC AUC: 0.89272030651341
Precision-Recall AUC: 0.8983631885913974
MCC: 0.6790046053972701
Error Rate: 0.1607142857142857
