In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

In [16]:
# Load the engineered per-user feature table.
features_df = pd.read_csv("user_features.csv")

# Preview the first rows, then summarize column dtypes and non-null counts.
print(features_df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
features_df.info()

      user  total_logins  unique_days  after_hours  unique_machines  \
0  AAE0190            37           19            0                1   
1  AAF0535            37           19            0                1   
2  AAF0791            37           19            0                1   
3  AAL0706            37           19           19                1   
4  AAM0658            37           19           18                1   

   usb_count  usb_days          employee_name  user_id   O   C   E   A   N  
0        0.0       0.0   August Armando Evans  AAE0190  36  30  14  50  29  
1        0.0       0.0  Athena Amelia Foreman  AAF0535  17  21  36  33  31  
2        0.0       0.0  Aladdin Abraham Foley  AAF0791  14  40  40  50  34  
3        0.0       0.0       April Alika Levy  AAL0706  37  14  28  13  25  
4        0.0       0.0       Abel Adam Morton  AAM0658  43  35  37  36  22  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   

In [19]:
# Prepare data for modeling: strip identifier columns from `features_df`, keep
# the numeric features, and standardize them.
# Produces: features_clean, numeric_columns, X (unscaled features),
# scaler (fitted StandardScaler), X_scaled (standardized array).
print("Original columns:", features_df.columns.tolist())
print("Data types:")
print(features_df.dtypes)

# Drop common identifier/string columns first (so downstream selection is numeric-only).
# Only names actually present in the frame are kept in the list.
drop_candidates = [c for c in ["user_id", "user", "employee_name", "employee"] if c in features_df.columns]
if drop_candidates:
    print("Dropping identifier columns:", drop_candidates)
features_clean = features_df.drop(columns=drop_candidates, errors="ignore")

# Select only numeric columns for modeling
numeric_columns = features_clean.select_dtypes(include=[np.number]).columns.tolist()
print(f"\nNumeric columns found: {numeric_columns}")

# Report any remaining non-numeric columns: they are left out of the feature
# matrix below, and the reader should know that rather than have it happen
# silently. (A dtype check on X itself could never fire, because X is built
# from the numeric-only selection.)
obj_cols = features_clean.select_dtypes(exclude=[np.number]).columns.tolist()
if obj_cols:
    print(f"Excluding non-numeric columns from the feature matrix: {obj_cols}")

# Create feature matrix (only numeric columns); copy so later edits to X do
# not touch features_clean.
X = features_clean[numeric_columns].copy()
print(f"Feature matrix shape: {X.shape}")

# Safety check: fail early with a clear message if there is nothing to scale
# (no rows, or no numeric columns survived the selection).
if X.empty:
    raise ValueError(
        f"Feature matrix is empty (shape {X.shape}); no numeric feature data to scale"
    )

# Missing values are only reported, not altered: StandardScaler disregards
# NaNs when fitting and leaves them in place on transform, so they would
# otherwise reach the downstream model unnoticed.
nan_counts = X.isna().sum()
nan_counts = nan_counts[nan_counts > 0]
if not nan_counts.empty:
    print("Warning: missing values present in feature columns:")
    print(nan_counts)

# Scale the numeric data (zero mean, unit variance per column)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Data successfully scaled!")
print(f"Scaled data shape: {X_scaled.shape}")

Original columns: ['user', 'total_logins', 'unique_days', 'after_hours', 'unique_machines', 'usb_count', 'usb_days', 'employee_name', 'user_id', 'O', 'C', 'E', 'A', 'N']
Data types:
user                object
total_logins         int64
unique_days          int64
after_hours          int64
unique_machines      int64
usb_count          float64
usb_days           float64
employee_name       object
user_id             object
O                    int64
C                    int64
E                    int64
A                    int64
N                    int64
dtype: object
Dropping identifier columns: ['user_id', 'user', 'employee_name']

Numeric columns found: ['total_logins', 'unique_days', 'after_hours', 'unique_machines', 'usb_count', 'usb_days', 'O', 'C', 'E', 'A', 'N']
Feature matrix shape: (1000, 11)
Data successfully scaled!
Scaled data shape: (1000, 11)
