<a href="https://colab.research.google.com/github/Varsh999/MLprojectssss/blob/main/kfoldproject1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# -------------------------------
# 1. Load dataset
# -------------------------------
csv_path = "D:/network_logs_1000.csv"
df = pd.read_csv(csv_path)

# Print the header names so the feature list below can be compared
# against what the file actually contains.
column_names = list(df.columns)
print("Dataset columns:", column_names)

# -------------------------------
# 2. Features and target
# -------------------------------
features = [
    "bytes_in", "bytes_out", "creation_time", "end_time",
    "src_ip", "src_ip_country_code", "protocol",
    "response.code", "dst_port", "dst_ip"
]
target = "label"

# The dataset header (see the printed column list) spells the status column
# "response_code", while the feature list and the example log further down
# use "response.code". Rename the column to the spelling used in this script
# so the feature is actually trained on instead of being silently filtered
# out by the existence check below.
column_aliases = {"response_code": "response.code"}
df = df.rename(columns={
    alias: name
    for alias, name in column_aliases.items()
    if alias in df.columns and name not in df.columns
})

# Report requested features the dataset does not provide rather than
# dropping them without a trace, then keep only the ones that exist.
missing_features = [col for col in features if col not in df.columns]
if missing_features:
    print("Warning: features missing from dataset, skipped:", missing_features)
features = [col for col in features if col in df.columns]

# Copy so the encoding step can overwrite columns without touching `df`.
X = df[features].copy()
y = df[target]

# -------------------------------
# 3. Encode categorical features
# -------------------------------
# Text columns can be stored either as `object` or as pandas' dedicated
# string dtype depending on the pandas version and options, so check for
# both; comparing the dtype to "object" alone would let string-dtype columns
# reach the model un-encoded.
categorical_cols = [
    col for col in features
    if pd.api.types.is_object_dtype(X[col])
    or pd.api.types.is_string_dtype(X[col])
]

# One fitted encoder per column, kept so that new rows can later be mapped
# to the same integer codes.
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Cast to str so mixed-type / missing entries are encoded uniformly.
    X[col] = le.fit_transform(X[col].astype(str))
    encoders[col] = le

# -------------------------------
# 4. K-Fold Training
# -------------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = RandomForestClassifier(random_state=42)

accuracies = []

# Each fold refits `model` on that fold's training rows and scores it on the
# held-out rows; the per-fold accuracies are averaged afterwards.
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    accuracies.append(acc)
    print(f"Fold {fold} Accuracy: {acc:.4f}")

print("\nAverage Accuracy:", sum(accuracies) / len(accuracies))

# At this point `model` has only been fitted on the last fold's training
# split. Refit on the complete dataset so that any prediction made after
# cross-validation uses all available rows.
model.fit(X, y)

# -------------------------------
# 5. Example Prediction
# -------------------------------
new_log = {
    "bytes_in": 5602,
    "bytes_out": 12990,
    "creation_time": "2024-04-25T23:00:00Z",
    "end_time": "2024-04-25T23:10:00Z",
    "src_ip": "147.161.161.82",
    "src_ip_country_code": "AE",
    "protocol": "HTTPS",
    "response.code": 200,
    "dst_port": 443,
    "dst_ip": "10.138.69.9"
}

# keep only features used in training
new_log = {k: v for k, v in new_log.items() if k in features}

# Fail with a clear message if the example lacks a trained feature, instead
# of an opaque KeyError below or a NaN column in the model input.
absent = [col for col in features if col not in new_log]
if absent:
    raise KeyError(f"new_log is missing trained features: {absent}")

# encode categorical columns
for col in categorical_cols:
    encoder = encoders[col]
    # The encoders were fitted on str-cast values, so look the value up in
    # the same form.
    value = str(new_log[col])
    if value in encoder.classes_:
        new_log[col] = encoder.transform([value])[0]
    else:
        new_log[col] = -1  # unseen value

# `columns=features` pins the column order to the one used for training,
# independent of the key order of the dict above.
df_log = pd.DataFrame([new_log], columns=features)
prediction = model.predict(df_log)[0]

print("\nPredicted Label:", prediction)


Dataset columns: ['bytes_in', 'bytes_out', 'creation_time', 'end_time', 'src_ip', 'src_ip_country_code', 'protocol', 'response_code', 'dst_port', 'dst_ip', 'label']
Fold 1 Accuracy: 1.0000
Fold 2 Accuracy: 1.0000
Fold 3 Accuracy: 1.0000
Fold 4 Accuracy: 1.0000
Fold 5 Accuracy: 1.0000

Average Accuracy: 1.0

Predicted Label: Normal
