In [1]:
from pathlib import Path
import sys, tarfile
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

# Resolve the project root relative to the notebook's working directory
# (the notebook is expected to be run from a direct subfolder of the project).
project_root = Path('..').resolve()
print('Project root:', project_root)

# Make the project's `src` directory importable without adding it twice on re-run.
src_path = project_root / 'src'
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

tar_path = project_root / 'data' / 'raw' / 'behavior' / 'r4.2.tar.bz2'
extracted_csv = project_root / 'data' / 'raw' / 'behavior' / 'r4.2' / 'file.csv'

if extracted_csv.exists():
    # Fast path: the CSV has already been extracted next to the archive.
    df_beh_raw = pd.read_csv(extracted_csv)
    print('Loaded:', extracted_csv)
else:
    # Fallback: read the same table straight out of the archive.
    # The member is matched on its base name so that both branches load
    # `file.csv`; taking the first `*.csv` member could pick up a different
    # table if the archive holds more than one CSV.
    wanted_name = extracted_csv.name.lower()
    with tarfile.open(tar_path, 'r:bz2') as tf:
        member = next(
            (
                m for m in tf.getmembers()
                if m.isfile() and m.name.rsplit('/', 1)[-1].lower() == wanted_name
            ),
            None,
        )
        if member is None:
            raise FileNotFoundError(
                f"No '{extracted_csv.name}' member found in archive {tar_path}"
            )
        print('Reading from tar member:', member.name)
        # `m.isfile()` above guarantees a regular file, so extractfile()
        # returns a readable file object rather than None.
        with tf.extractfile(member) as f:
            df_beh_raw = pd.read_csv(f)

print('Raw shape:', df_beh_raw.shape)
df_beh_raw.head()


Project root: /Users/pratik_n/Desktop/MyComputer/universal-anomaly-intelligence
Loaded: /Users/pratik_n/Desktop/MyComputer/universal-anomaly-intelligence/data/raw/behavior/r4.2/file.csv
Raw shape: (445581, 6)
Loaded: /Users/pratik_n/Desktop/MyComputer/universal-anomaly-intelligence/data/raw/behavior/r4.2/file.csv
Raw shape: (445581, 6)


Unnamed: 0,id,date,user,pc,filename,content
0,{L9G8-J9QE34VM-2834VDPB},01/02/2010 07:23:14,MOH0273,PC-6699,EYPC9Y08.doc,D0-CF-11-E0-A1-B1-1A-E1 during difficulty over...
1,{H0W6-L4FG38XG-9897XTEN},01/02/2010 07:26:19,MOH0273,PC-6699,N3LTSU3O.pdf,25-50-44-46-2D carpenters 25 landed strait dis...
2,{M3Z0-O2KK89OX-5716MBIM},01/02/2010 08:12:03,HPH0075,PC-2417,D3D3WC9W.doc,D0-CF-11-E0-A1-B1-1A-E1 union 24 declined impo...
3,{E1I4-S4QS61TG-3652YHKR},01/02/2010 08:17:00,HPH0075,PC-2417,QCSW62YS.doc,D0-CF-11-E0-A1-B1-1A-E1 becoming period begin ...
4,{D4R7-E7JL45UX-0067XALT},01/02/2010 08:24:57,HSB0196,PC-8001,AU75JV6U.jpg,FF-D8


## Data cleaning
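Strip whitespace from column names, drop all-empty columns and duplicate rows, then fill missing values (median for numeric columns, most frequent value for all other columns).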

In [2]:
# --- Column names: trim surrounding whitespace --------------------------------
df_beh_raw.columns = [name.strip() for name in df_beh_raw.columns]

# --- Columns that contain nothing but missing values --------------------------
empty_cols = [name for name in df_beh_raw.columns if df_beh_raw[name].isna().all()]
if empty_cols:
    print('Dropping empty columns:', empty_cols)
    df_beh_raw = df_beh_raw.drop(columns=empty_cols)

# --- Exact duplicate rows -----------------------------------------------------
before = len(df_beh_raw)
df_beh_raw = df_beh_raw.drop_duplicates()
df_beh_raw = df_beh_raw.reset_index(drop=True)
print(f"Dropped {before - len(df_beh_raw)} duplicates")

# --- Missing values -----------------------------------------------------------
# Numeric columns are filled with their median, every other column with its
# most frequent value.
num_cols = df_beh_raw.select_dtypes(include=['number']).columns.tolist()
cat_cols = df_beh_raw.select_dtypes(exclude=['number']).columns.tolist()

if num_cols:
    numeric_part = df_beh_raw[num_cols]
    df_beh_raw[num_cols] = numeric_part.fillna(numeric_part.median())

for col in cat_cols:
    column = df_beh_raw[col]
    if not column.isna().any():
        continue  # nothing to fill in this column
    most_frequent = column.mode().iloc[0]
    df_beh_raw[col] = column.fillna(most_frequent)

# --- Summary ------------------------------------------------------------------
print('Cleaned behavior shape:', df_beh_raw.shape)
print('Null counts after cleaning (top 10):')
null_counts = df_beh_raw.isna().sum()
print(null_counts.sort_values(ascending=False).head(10))


Dropped 0 duplicates
Cleaned behavior shape: (445581, 6)
Null counts after cleaning (top 10):
id          0
date        0
user        0
pc          0
filename    0
content     0
dtype: int64


In [3]:
# Work on a copy so feature engineering never touches the cleaned frame.
df = df_beh_raw.copy()

# Off-hours flag: 1 when the event hour is 0-5 or 22-23, else 0.
# Without a `date` column the flag defaults to 0 for every row.
if 'date' not in df.columns:
    df['is_off_hours'] = 0
else:
    df['timestamp'] = pd.to_datetime(df['date'], errors='coerce')
    df['hour'] = df['timestamp'].dt.hour
    # Unparseable dates give a NaN hour; both comparisons are False for NaN,
    # so those rows are flagged 0.
    outside_day = (df['hour'] < 6) | (df['hour'] >= 22)
    df['is_off_hours'] = outside_day.astype(int)

# Length of the stringified `content` field (0 when the column is absent).
if 'content' in df.columns:
    df['content_len'] = df['content'].astype(str).str.len()
else:
    df['content_len'] = 0

# One row per user: event volume, number of distinct PCs, mean content length,
# and the share of events flagged as off-hours.
per_user = df.groupby('user').agg(
    events=('id', 'count'),
    unique_pc=('pc', pd.Series.nunique),
    avg_content_len=('content_len', 'mean'),
    off_hours_ratio=('is_off_hours', 'mean'),
)
features = per_user.reset_index().fillna(0)

print('Feature shape:', features.shape)
features.head()


Feature shape: (264, 5)


Unnamed: 0,user,events,unique_pc,avg_content_len,off_hours_ratio
0,AAF0535,357,1,337.826331,0.0
1,AAM0658,31,1,396.967742,1.0
2,ABC0174,589,1,364.078098,0.0
3,AHD0848,199,1,367.472362,0.0
4,AHM0410,2198,1,359.571884,0.0


In [4]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

# Select the model inputs by name instead of "everything except `user`".
# This cell writes its score columns back into `features`; with a blanket
# drop, re-running the cell would feed the previous run's scores into the
# models as if they were input features.
feature_cols = ['events', 'unique_pc', 'avg_content_len', 'off_hours_ratio']
X = features[feature_cols]

# Standardize so that no single column dominates because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Both raw scores are negated so that larger values sort first in the
# "top anomalies" listing below.
iso = IsolationForest(contamination=0.05, random_state=42).fit(X_scaled)
iso_scores = -iso.score_samples(X_scaled)

ocsvm = OneClassSVM(nu=0.05, kernel='rbf', gamma='scale').fit(X_scaled)
ocsvm_scores = -ocsvm.decision_function(X_scaled).ravel()

# Min-max rescale each score to [0, 1]; the small epsilon avoids a division
# by zero when all scores are identical.
features['iso_score_norm'] = (iso_scores - iso_scores.min()) / (np.ptp(iso_scores) + 1e-9)
features['ocsvm_score_norm'] = (ocsvm_scores - ocsvm_scores.min()) / (np.ptp(ocsvm_scores) + 1e-9)

print('Top anomalies (ISO):')
print(features.sort_values('iso_score_norm', ascending=False).head())


Top anomalies (ISO):
        user  events  unique_pc  avg_content_len  off_hours_ratio  \
8    AJF0370   11053        417       368.255134         0.121777   
185  MPM0220    6689        244       351.635073         0.116759   
149  JTM0223       1          1       639.000000         0.000000   
20   BAL0044    2900        379       341.261379         0.391379   
108  HJB0742       5          1       267.200000         1.000000   

     iso_score_norm  ocsvm_score_norm  
8          1.000000          0.748800  
185        0.800712          0.384903  
149        0.771454          1.000000  
20         0.740368          0.386338  
108        0.727534          0.478723  


In [5]:
# Write the per-user normalized scores to the behavior experiments folder,
# creating the folder (and any missing parents) first.
out_dir = project_root / 'experiments' / 'behavior'
out_dir.mkdir(parents=True, exist_ok=True)

scores_path = out_dir / 'scores_cert_unsupervised.csv'
score_columns = ['user', 'iso_score_norm', 'ocsvm_score_norm']
scores_out = features[score_columns]
scores_out.to_csv(scores_path, index=False)

print('Saved scores to', scores_path)


Saved scores to /Users/pratik_n/Desktop/MyComputer/universal-anomaly-intelligence/experiments/behavior/scores_cert_unsupervised.csv
