In [5]:
import pandas as pd
import numpy as np
import os
import joblib

# --- 1. Load All Data (Train + Test) ---

DATA_PATH = "../data/UCI HAR Dataset/"

# --- 1a. Load and Fix Feature Names ---
# features.txt is parsed as two space-separated columns: an ID and a name.
print("Loading feature names...")
features_df = pd.read_csv(
    DATA_PATH + "features.txt",
    sep=' ',
    header=None,
    names=['FeatureID', 'FeatureName'],
)

# Raw names in file order (repeated names are still present at this point).
names = features_df['FeatureName'].tolist()

# Single pass over the names, in order:
#   counts[name]  -> how many *repeat* occurrences of `name` were seen so far
#                    (0 means the name has appeared exactly once)
#   unique_names  -> the de-duplicated labels, same length/order as `names`
unique_names = []
counts = {}

for name in names:
    repeat_index = counts.get(name, -1) + 1
    counts[name] = repeat_index
    # First occurrence keeps the bare name; the k-th repeat becomes "<name>_k".
    unique_names.append(f"{name}_{repeat_index}" if repeat_index else name)

# Summing the per-name repeat counters gives the total number of renamed entries.
print(f"Fixed {sum(counts.values())} duplicate names.")

# --- 1b. Load Data Using UNIQUE Names ---


def load_feature_matrix(relative_path, column_names):
    """Read one whitespace-delimited feature file into a DataFrame.

    Parameters
    ----------
    relative_path : str
        File location relative to DATA_PATH (e.g. "train/X_train.txt").
    column_names : list of str
        Column labels to assign to the parsed columns.

    Returns
    -------
    pandas.DataFrame
        The parsed file, labelled with ``column_names``.
    """
    return pd.read_csv(
        DATA_PATH + relative_path,
        # Raw string on purpose: in a normal string literal '\s' is an invalid
        # escape sequence (DeprecationWarning, SyntaxWarning on newer Pythons).
        sep=r'\s+',
        header=None,
        names=column_names,  # Use our new unique list
    )


# Load X_train
print("Loading X_train...")
X_train = load_feature_matrix("train/X_train.txt", unique_names)

# Load X_test
print("Loading X_test...")
X_test = load_feature_matrix("test/X_test.txt", unique_names)

# Combine them into one big "NORMAL" dataset.
# ignore_index=True renumbers the rows 0..N-1 instead of keeping each
# file's own row numbers (which would otherwise repeat).
X_normal = pd.concat([X_train, X_test], ignore_index=True)

print("\nSuccess! Loaded and combined all data.")
print(f"Total normal samples: {X_normal.shape}")
X_normal.head()

Loading feature names...
Fixed 84 duplicate names.
Loading X_train...
Loading X_test...

Success! Loaded and combined all data.
Total normal samples: (10299, 561)


Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.074323,-0.298676,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,0.158075,-0.595051,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,0.414503,-0.390748,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,0.404573,-0.11729,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,0.087753,-0.351471,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892


In [6]:
from sklearn.ensemble import IsolationForest

print("Initializing the Isolation Forest model...")

# --- Create the Model ---
# 'contamination=0.01' is our best guess at the outlier fraction.
# IsolationForest uses it at fit time to place the decision threshold on
# the anomaly scores of the *training* data, so roughly 1% of the rows we
# fit on will themselves be labelled -1. A larger value makes the model
# more sensitive (flags more samples); a smaller one makes it stricter.
model_anomaly = IsolationForest(
    n_estimators=100,  # 100 trees
    contamination=0.01,
    random_state=42  # fixed seed so re-running the cell gives the same forest
)

# Report the real row count rather than a hard-coded number, so the message
# stays correct if the dataset loaded above ever changes.
print(f"Training the anomaly model on all {len(X_normal):,} 'normal' samples...")

# We only give it 'X_normal'. We're not giving it any "answers"
# (no labels are passed to fit).
model_anomaly.fit(X_normal)

print("Anomaly model training complete!")

Initializing the Isolation Forest model...
Training the anomaly model on all 10,299 'normal' samples...
Anomaly model training complete!


In [7]:
# --- 3. Test the Anomaly Model ---

print("Testing the anomaly model...")

# --- Test 1: A Normal Sample ---
# Grab the first row from our normal data.
# Indexing with a list (iloc[[0]]) keeps it as a 1-row DataFrame, i.e. a
# "batch" of one sample that still carries the column names the model was
# fitted with. Passing a bare NumPy array instead makes scikit-learn warn
# that X does not have valid feature names.
# NOTE: this row was part of the training data, so this is only a smoke
# test, not an estimate of how the model behaves on unseen samples.
normal_sample = X_normal.iloc[[0]]

# Make a prediction on the normal sample
prediction_normal = model_anomaly.predict(normal_sample)


# --- Test 2: A Fake Anomaly Sample ---
# We'll create a fake row that is all zeros
# This is clearly not normal sensor data
# Wrapped in a DataFrame with the training columns for the same reason as above.
fake_anomaly = pd.DataFrame(
    np.zeros((1, X_normal.shape[1])),
    columns=X_normal.columns,
)

# Make a prediction on the fake anomaly
prediction_anomaly = model_anomaly.predict(fake_anomaly)


# --- Show the Results ---
print("\n--- RESULTS ---")
print("Prediction Key: [ 1] = Normal, [-1] = Anomaly")
print(f"\nPrediction for NORMAL sample: {prediction_normal}")
print(f"Prediction for FAKE ANOMALY: {prediction_anomaly}")

Testing the anomaly model...

--- RESULTS ---
Prediction Key: [ 1] = Normal, [-1] = Anomaly

Prediction for NORMAL sample: [1]
Prediction for FAKE ANOMALY: [-1]


