# Step 1: Dataset Loading and Inspection

The dataset is loaded and basic structural analysis is performed.
This includes checking dimensions, data types, and missing values.



In [53]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
df = pd.read_csv("/content/drive/MyDrive/Log_Anamoly_Detection/dataset/balanced_log_dataset.csv")

In [55]:
df.head()

Unnamed: 0,BlockId,Label,Type,Features,TimeInterval,Latency
0,blk_8462687553742484299,Fail,5.0,"[E5,E5,E5,E22,E11,E9,E11,E9,E11,E9,E26,E26,E26...","[0.0, 0.0, 0.0, 70.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",7156
1,blk_9151689281583792216,Fail,1.0,"[E22,E5,E5,E5,E26,E26,E11,E9,E11,E9,E11,E9,E27...","[0.0, 0.0, 0.0, 18.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",39300
2,blk_5446956458314793726,Success,,"[E5,E5,E22,E5,E11,E9,E11,E9,E11,E9,E26,E26,E26]","[0.0, 0.0, 1.0, 47.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",48
3,blk_6632164064180342420,Success,,"[E5,E5,E22,E5,E11,E9,E11,E9,E26,E26,E26,E11,E9]","[0.0, 0.0, 1.0, 46.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",48
4,blk_-427522761434588674,Fail,4.0,"[E5,E5,E22,E5,E9,E11,E9,E11,E9,E26,E26,E11,E26...","[0.0, 0.0, 3.0, 35.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",48073


In [56]:
df.shape


(33676, 6)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33676 entries, 0 to 33675
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   BlockId       33676 non-null  object 
 1   Label         33676 non-null  object 
 2   Type          16838 non-null  float64
 3   Features      33676 non-null  object 
 4   TimeInterval  33676 non-null  object 
 5   Latency       33676 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 1.5+ MB


In [58]:
df.describe()

Unnamed: 0,Type,Latency
count,16838.0,33676.0
mean,9.375638,15338.970483
std,11.34426,18068.625415
min,0.0,0.0
25%,3.0,48.0
50%,5.0,6086.0
75%,8.0,32466.75
max,31.0,53924.0


# Step 2: Data Cleaning and Encoding
Missing values handled.
Categorical labels encoded.
Event sequences parsed into structured format.
Class distribution examined.



In [59]:
df.isnull().sum()

Unnamed: 0,0
BlockId,0
Label,0
Type,16838
Features,0
TimeInterval,0
Latency,0


In [60]:
df.fillna(0, inplace=True)

In [61]:
df["Label"] = df["Label"].map({"Success": 0, "Fail": 1})

In [62]:
df["Features"] = df["Features"].apply(
    lambda x: x.strip("[]").split(",")
)

In [64]:
df["Label"].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Label,Unnamed: 1_level_1
1,0.5
0,0.5


# Step 3: Feature Engineering

Event-based features were extracted using CountVectorizer.
Statistical features were derived from time-based log values.



In [65]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    tokenizer=lambda x: x,
    lowercase=False
)

X_events = vectorizer.fit_transform(df["Features"])



In [66]:
df["TimeInterval"] = df["TimeInterval"].apply(
    lambda x: list(map(float, x.strip("[]").split(",")))
)

df["time_mean"] = df["TimeInterval"].apply(np.mean)
df["time_max"] = df["TimeInterval"].apply(np.max)
df["time_sum"] = df["TimeInterval"].apply(np.sum)
df["time_std"] = df["TimeInterval"].apply(np.std)
df["time_nonzero"] = df["TimeInterval"].apply(lambda x: np.count_nonzero(x))

In [67]:
X_numeric = df[[
    "Type",
    "Latency",
    "time_mean",
    "time_max",
    "time_sum",
    "time_std",
    "time_nonzero"
]]
X_numeric.isna().sum()

Unnamed: 0,0
Type,0
Latency,0
time_mean,0
time_max,0
time_sum,0
time_std,0
time_nonzero,0


In [68]:
X_numeric.isna().sum()
X_numeric = X_numeric.fillna(0)

# Step 4: Feature Integration and Dataset Splitting

Sparse event vectors were combined with statistical features.
The dataset was split into training and testing sets using stratified sampling.



In [None]:
from scipy.sparse import hstack
X = hstack([X_events, X_numeric.values])

y = df["Label"]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
