In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
from sklearn.svm import OneClassSVM

In [5]:
import matplotlib.pyplot as plt

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
from sklearn.model_selection import ParameterGrid

In [8]:
df = pd.read_csv('records_v2.csv')

In [10]:
print(df.columns.tolist())

['id', 'reading', 'reading_time', 'Fuel Volume (L)']


In [11]:
df['Fuel Volume (L)'] = pd.to_numeric(df['Fuel Volume (L)'], errors='coerce')

In [12]:
df['reading_time'] = pd.to_datetime(df['reading_time'], errors='coerce')

In [13]:
df = df.sort_values(['reading_time', 'id']).reset_index(drop=True)

In [14]:
df['volume_diff'] = df['Fuel Volume (L)'].diff()

In [15]:
df['prev_fuel_volume'] = df['Fuel Volume (L)'].shift(1)

In [16]:
df['rolling_mean'] = df['Fuel Volume (L)'].rolling(window=10, min_periods=5).mean()
df['rolling_std'] = df['Fuel Volume (L)'].rolling(window=10, min_periods=5).std()

In [17]:
# 4. Business-rule anomaly flag (same logic you used with Isolation Forest) --
def rule_based_flag(row, bigjump=2.0):
    """
    Returns 1 if anomaly according to business rules, else 0.
    bigjump is the threshold in liters for a 'sudden change'.
    """
    # Start as normal
    flag = 0

    # Rule 1: Negative volume is impossible
    if row["Fuel Volume (L)"] < 0:
        return 1

    # Rule 2: sudden large drop (possible fraud / leak)
    if row["volume_diff"] < -bigjump:
        return 1

    # Rule 3: unexpected refill spike (jump up when tank should not refill)
    if row["volume_diff"] > bigjump and row["prev_fuel_volume"] not in [0, np.nan]:
        return 1

    # Rule 4 (optional): unusual tiny oscillations when tank should be stable
    # Uncomment this if you treated this as anomaly in the Isolation Forest logic
    # if abs(row["volume_diff"]) < bigjump and row["prev_fuel_volume"] not in [0, np.nan]:
    #     return 1

    return flag

df["anomalyflag_corrected"] = df.apply(rule_based_flag, axis=1).astype(int)
df["true_label"] = df["anomalyflag_corrected"]  # 0 = normal, 1 = anomaly

print("✅ Business rules applied. Anomaly counts:")
print(df["true_label"].value_counts())


✅ Business rules applied. Anomaly counts:
true_label
0    1222
1       6
Name: count, dtype: int64


In [22]:
features = ["volume_diff", "prev_fuel_volume", "rolling_mean", "rolling_std"]

df_clean = df.dropna(subset=features + ["true_label"]).copy()
df_clean = df_clean.sort_values("reading_time").reset_index(drop=True)

X = df_clean[features].values
y = df_clean["true_label"].values     # 0 normal, 1 anomaly

split_idx = int(len(df_clean) * 0.7)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]


In [23]:
print("✅ Data preparation complete")
print(f"Dataset shape: {df.shape}")
print(f"Features: volume_diff, prev_fuel_volume, rolling_mean, rolling_std")

✅ Data preparation complete
Dataset shape: (1228, 10)
Features: volume_diff, prev_fuel_volume, rolling_mean, rolling_std


In [24]:
split_idx = int(len(df) * 0.7)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

In [25]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [26]:
print(df.columns)

Index(['id', 'reading', 'reading_time', 'Fuel Volume (L)', 'volume_diff',
       'prev_fuel_volume', 'rolling_mean', 'rolling_std',
       'anomalyflag_corrected', 'true_label'],
      dtype='object')


In [27]:
# 8. Train One-Class SVM on rule-clean NORMAL data only ---------------------
# use only training samples that are normal according to business rules
X_ref = X_train_scaled[y_train == 0]

ocsvm = OneClassSVM(kernel="rbf", nu=0.04, gamma="scale")
ocsvm.fit(X_ref)

print("✅ One-Class SVM fitted on rule-clean normal data")


✅ One-Class SVM fitted on rule-clean normal data


In [28]:
# 9. Get SVM anomaly predictions for train and test -------------------------
# +1 = normal, -1 = anomaly -> convert to 0/1
y_train_raw_svm = ocsvm.predict(X_train_scaled)
svm_train_flag  = (y_train_raw_svm == -1).astype(int)

y_test_raw_svm = ocsvm.predict(X_test_scaled)
svm_test_flag  = (y_test_raw_svm == -1).astype(int)


In [29]:
# 10. Combine business rules + SVM (final anomaly labels) -------------------
# Any point flagged by rules OR SVM is considered anomaly
final_train_flag = np.where(
    (y_train == 1) | (svm_train_flag == 1),
    1,
    0
)

final_test_flag = np.where(
    (y_test == 1) | (svm_test_flag == 1),
    1,
    0
)

# Store back into df_clean for inspection
df_clean.loc[:split_idx-1, "final_anomaly"] = final_train_flag
df_clean.loc[split_idx:,   "final_anomaly"] = final_test_flag


In [32]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)

# 11. Evaluate training & testing accuracy ----------------------------------
print("Training accuracy (rules vs final_anomaly):",
      accuracy_score(y_train, final_train_flag))
print("\nTraining classification report:")
print(classification_report(y_train, final_train_flag,
                            target_names=["Normal", "Anomaly"]))

print("Testing accuracy (rules vs final_anomaly):",
      accuracy_score(y_test, final_test_flag))
print("\nTesting classification report:")
print(classification_report(y_test, final_test_flag,
                            target_names=["Normal", "Anomaly"]))


Training accuracy (rules vs final_anomaly): 0.619324796274738

Training classification report:
              precision    recall  f1-score   support

      Normal       1.00      0.62      0.76       857
     Anomaly       0.01      1.00      0.01         2

    accuracy                           0.62       859
   macro avg       0.50      0.81      0.39       859
weighted avg       1.00      0.62      0.76       859

Testing accuracy (rules vs final_anomaly): 0.9327731092436975

Testing classification report:
              precision    recall  f1-score   support

      Normal       1.00      0.93      0.96       353
     Anomaly       0.14      1.00      0.25         4

    accuracy                           0.93       357
   macro avg       0.57      0.97      0.61       357
weighted avg       0.99      0.93      0.96       357

