#### Week Four: This script tests various models on the dataset that you have selected. This one is a bit faster than the B Script.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import warnings

# Suppress every warning from this point on.
# NOTE(review): this also hides warnings that may point at real problems
# (deprecations, failed fits) - confirm that silencing all of them is intended.
warnings.filterwarnings("ignore")

# Step 1: Load the dataset
df = pd.read_csv('extracted_honeypot_logs.csv')

# Step 2: Preprocess the dataset
# Assuming the last column is the target and the rest are features.
# The split is done BEFORE any date features are derived: new columns are
# appended on the right, so deriving them on `df` first would make 'second'
# the last column and therefore the target, while the real label would leak
# into the feature matrix.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Handle date/time columns (if any)
# Set DATE_COLUMN to the name of your date/time column.
DATE_COLUMN = 'date_column'
if DATE_COLUMN in X.columns:
    timestamps = pd.to_datetime(X[DATE_COLUMN])
    # Replace the raw timestamp with its numeric components. drop() and
    # assign() return new frames, so `df` itself is left exactly as loaded.
    X = X.drop(columns=[DATE_COLUMN]).assign(
        year=timestamps.dt.year,
        month=timestamps.dt.month,
        day=timestamps.dt.day,
        hour=timestamps.dt.hour,
        minute=timestamps.dt.minute,
        second=timestamps.dt.second,
    )

# Encode the target as integer class ids 0..n_classes-1.
# This is done for every non-float target, not only for string/categorical
# ones: integer labels such as 1/2 or -1/1 would otherwise be passed through
# unchanged, and XGBClassifier expects class ids that start at 0.
# Float targets are left untouched so that a continuous (regression-style)
# target still fails loudly instead of being turned into thousands of classes.
if not pd.api.types.is_float_dtype(y):
    le = LabelEncoder()
    y = le.fit_transform(y)

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify and convert any remaining categorical features to numerical
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Align the training and testing data (in case get_dummies created different columns).
# join='left' keeps exactly the training columns: dummy columns that only occur
# in the test split are dropped, and ones missing from it are filled with 0.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Standardize numerical features.
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics are used during fitting.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 3: Define the models to test
# Both models are seeded so that repeated runs give the same scores and the
# same "best model"; the train/test split is already seeded the same way.
RANDOM_STATE = 42
models = {
    # n_jobs=-1 builds the trees on all available cores; with a fixed seed the
    # fitted forest does not depend on the number of workers.
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
    # NOTE(review): use_label_encoder is reported as deprecated by recent
    # XGBoost releases - confirm against the installed version before removing.
    "XGBoost": XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=RANDOM_STATE,
    ),
}

# Step 4: Train each model and evaluate accuracy
# Mean score over 5-fold cross-validation on the training split only; the
# held-out test split is not touched until a winner has been chosen.
accuracy_results = {
    model_name: cross_val_score(model, X_train, y_train, cv=5).mean()
    for model_name, model in models.items()
}

# Step 5: Determine the best model based on accuracy
best_model_name = max(accuracy_results, key=accuracy_results.get)
best_accuracy = accuracy_results[best_model_name]

# Step 6: Print the results
print("Model Comparison Results:")
for model_name, accuracy in accuracy_results.items():
    print(f"{model_name}: {accuracy:.4f}")

print(f"\nBest Model: {best_model_name} with accuracy {best_accuracy:.4f}")

# Optional: Fit the best model to the entire training set and test on the test set
# (fitted explicitly here on the full training split before predicting).
best_model = models[best_model_name]
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Set Accuracy of the Best Model ({best_model_name}): {test_accuracy:.4f}")


KeyboardInterrupt: 