In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [3]:
# Location of the cleaned, labeled log file that this notebook reads.
LOGS_CSV = "/content/cleaned_labeled_logs.csv"

# Load the file into a DataFrame, report its (rows, columns) shape, and
# leave a preview of the first rows as the cell's displayed result.
data = pd.read_csv(LOGS_CSV)
print("Dataset shape:", data.shape)
data.head()


Dataset shape: (445, 3)


Unnamed: 0,block_id,clean_log,label
0,blk_-1055254430948037872,info dfs.fsnamesystem: block* namesystem.ad...,Normal
1,blk_-1094025072435547068,info dfs.datanode$packetresponder: received...,Normal
2,blk_-1110880362999960078,info dfs.fsnamesystem: block* namesystem.de...,Normal
3,blk_-1111344660779690782,info dfs.fsdataset: deleting block blk_- fi...,Normal
4,blk_-1148756466197125339,info dfs.datanode$packetresponder: packetre...,Normal


In [4]:
# Count how many rows carry each label so the class balance is visible
# before any modelling decisions are made.
label_counts = data["label"].value_counts()
label_counts


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Normal,428
Anomaly,17


In [5]:
# Bag-of-words encoder: caps the vocabulary at 500 terms and drops
# scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(
    max_features=500,
    stop_words='english'
)

# A log whose cleaned text is empty is read back from CSV as NaN, and
# CountVectorizer refuses NaN documents. Restore such rows to "" (they become
# all-zero feature rows) and make sure every document is a plain string.
log_text = data["clean_log"].fillna("").astype(str)

# NOTE(review): the vocabulary is learned from every row, including rows that
# a later cell assigns to the test split. Fit on the training rows only if
# strict train/test isolation is required.
X = vectorizer.fit_transform(log_text)
y = data["label"]


In [6]:
# Report the size of the vectorized matrix: one row per log entry and one
# column per vocabulary term kept by the vectorizer.
n_documents, n_terms = X.shape
print("Feature matrix shape:", (n_documents, n_terms))


Feature matrix shape: (445, 46)


In [7]:
# Hold out 20% of the rows for testing. The split is seeded so it is
# repeatable, and stratified on the labels so both partitions keep the same
# class proportions as the full label column.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [9]:
import joblib
import os

# Create the output folders up front; exist_ok makes re-running the cell safe.
for folder in ('data/processed', 'models'):
    os.makedirs(folder, exist_ok=True)

# Each entry pairs an in-memory object with the file it is serialized to.
# The order matches the order in which the files are written.
artifacts = [
    (X_train, "data/processed/X_train.pkl"),
    (X_test, "data/processed/X_test.pkl"),
    (y_train, "data/processed/y_train.pkl"),
    (y_test, "data/processed/y_test.pkl"),
    (vectorizer, "models/vectorizer.pkl"),
]
for obj, destination in artifacts:
    joblib.dump(obj, destination)

print("✅ Feature engineering completed and saved.")

✅ Feature engineering completed and saved.
