In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [3]:
# Loading data
attack_df = pd.read_csv("../processed_data/attack/attack_dataset.csv")
training_df = pd.read_csv("../processed_data/training/training_dataset.csv")
validation_df = pd.read_csv("../processed_data/validation/validation_dataset.csv")

In [5]:
# Labeling
attack_df["label"] = 1  # Attack
training_df["label"] = 0  # Normal
validation_df["label"] = 0  # Normal

# Merge columns (witout attack)
attack_df = attack_df.drop(columns=["attack_type"], errors="ignore")

# Concatenate
full_df = pd.concat([attack_df, training_df, validation_df], ignore_index=True)

# Split features and labels
X = full_df[["syscall_id", "frequency"]]
y = full_df["label"]

# Split data in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Training model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Save model
joblib.dump(model, "../models/model.pkl")
print("\nModelo guardado en 'models/model.pkl'")

Confusion Matrix:
 [[58 26]
 [34 79]]
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.69      0.66        84
           1       0.75      0.70      0.72       113

    accuracy                           0.70       197
   macro avg       0.69      0.69      0.69       197
weighted avg       0.70      0.70      0.70       197


Modelo guardado en 'models/model.pkl'
