In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Reload the dataset
file_path = './dataset/git_commit_history_malicious.csv'
df = pd.read_csv(file_path)

df['is_malicious'] = df['is_malicious'].map({'Yes': 1, 'No': 0})
X = df['full_commit_message_markdown']
y = df['is_malicious']

tfidf_vectorizer = TfidfVectorizer(max_features=500)
X_tfidf = tfidf_vectorizer.fit_transform(X).toarray()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42, stratify=y)

# Models to evaluate
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Support Vector Machine (SVM)": SVC(kernel='linear', random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

results = {}

for model_name, model in models.items():

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=1)
    recall = recall_score(y_test, y_pred, zero_division=1)
    f1 = f1_score(y_test, y_pred, zero_division=1)

    results[model_name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

# Convert results to a DataFrame for better visualization
results_df = pd.DataFrame(results).transpose()

print(results_df)

                              Accuracy  Precision    Recall  F1 Score
Random Forest                 1.000000        1.0  1.000000       1.0
Logistic Regression           0.983333        1.0  0.666667       0.8
Support Vector Machine (SVM)  1.000000        1.0  1.000000       1.0
Decision Tree                 1.000000        1.0  1.000000       1.0
