In [4]:
# === Data preparation: clean and export UNSW-NB15 splits ===
# Label-encode the categorical features and export cleaned train and test CSVs for later sampling

import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder

# 1. Locate the raw training and testing CSV files
data_dir = Path(r"C:\Users\Raghav Singla\Desktop\PBL-4")
train_fp = data_dir.joinpath("UNSW_NB15_training-set.csv")
test_fp = data_dir.joinpath("UNSW_NB15_testing-set.csv")

# 2. Read both splits into DataFrames
df_train = pd.read_csv(train_fp)
df_test = pd.read_csv(test_fp)

# 3. Split each frame into target ("label") and features, discarding the
#    columns that are not kept in the export
drop_cols = ["id", "attack_cat"]
y_train = df_train["label"]
X_train = df_train.drop(columns=[*drop_cols, "label"])
y_test = df_test["label"]
X_test = df_test.drop(columns=[*drop_cols, "label"])

# 4. Integer-encode the categorical features. Each encoder is fitted on the
#    train and test values together so both splits share one mapping and
#    transform() never meets a category it has not seen.
cat_cols = ["proto", "service", "state"]
for col in cat_cols:
    combined = pd.concat([X_train[col], X_test[col]]).astype(str)
    le = LabelEncoder().fit(combined)
    for split in (X_train, X_test):
        split[col] = le.transform(split[col].astype(str))

# 5. Build the export frames: encoded features plus the target as last column
clean_train = X_train.assign(label=y_train)
clean_test = X_test.assign(label=y_test)

# 6. Write the cleaned frames into the data directory, omitting the index
train_out = data_dir.joinpath("UNSW_NB15_training_cleaned.csv")
test_out = data_dir.joinpath("UNSW_NB15_testing_cleaned.csv")
for frame, out_path in ((clean_train, train_out), (clean_test, test_out)):
    frame.to_csv(out_path, index=False)

# Report the name and shape of each exported file
print("✔ Exported:")
print(f"  • {train_out.name}: {clean_train.shape[0]} rows × {clean_train.shape[1]} cols")
print(f"  • {test_out.name}:  {clean_test.shape[0]} rows × {clean_test.shape[1]} cols")


✔ Exported:
  • UNSW_NB15_training_cleaned.csv: 175341 rows × 43 cols
  • UNSW_NB15_testing_cleaned.csv:  82332 rows × 43 cols
