In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from sklearn.svm import OneClassSVM

In [4]:
import matplotlib.pyplot as plt

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
df = pd.read_csv('records_v2.csv')

In [11]:
print(df.columns.tolist())

['id', 'reading', 'reading_time', 'Fuel Volume (L)']


In [12]:
df['Fuel Volume (L)'] = pd.to_numeric(df['Fuel Volume (L)'], errors='coerce')

In [13]:
df['reading_time'] = pd.to_datetime(df['reading_time'], errors='coerce')

In [14]:
df = df.sort_values(['reading_time', 'id']).reset_index(drop=True)

In [15]:
df['volume_diff'] = df['Fuel Volume (L)'].diff()

In [16]:
df['prev_fuel_volume'] = df['Fuel Volume (L)'].shift(1)

In [17]:
df['rolling_mean'] = df['Fuel Volume (L)'].rolling(window=10, min_periods=5).mean()
df['rolling_std'] = df['Fuel Volume (L)'].rolling(window=10, min_periods=5).std()

In [18]:
print("✅ Data preparation complete")
print(f"Dataset shape: {df.shape}")
print(f"Features: volume_diff, prev_fuel_volume, rolling_mean, rolling_std")

✅ Data preparation complete
Dataset shape: (1228, 8)
Features: volume_diff, prev_fuel_volume, rolling_mean, rolling_std


In [19]:
feature_cols = ['volume_diff', 'rolling_mean', 'rolling_std']
X = df[feature_cols].dropna()

In [20]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [23]:
from sklearn.svm import OneClassSVM

# 1) create the model  (NO random_state here)
ocsvm = OneClassSVM(kernel='rbf', nu=0.03, gamma='scale')

# 2) fit the model on your scaled features
ocsvm.fit(X_scaled)          # <-- this line is missing in your screenshot

# 3) then predict and save flags
df.loc[X.index, 'svmflag'] = ocsvm.predict(X_scaled)

In [25]:
legit_refill = (df['prev_fuel_volume'] == 0) & (df['volume_diff'] > 2)
df.loc[legit_refill, 'svmflag'] = 1  # Mark as normal

In [26]:
extreme_volume = (df['Fuel Volume (L)'] > df['rolling_mean'] + 3*df['rolling_std']) | \
                 (df['Fuel Volume (L)'] < df['rolling_mean'] - 3*df['rolling_std'])
df.loc[extreme_volume, 'svmflag'] = -1

In [27]:
negative_volume = df['Fuel Volume (L)'] < 0
df.loc[negative_volume, 'svmflag'] = -1

In [28]:
df['anomalyflag_corrected'] = df['svmflag']
print(f"Final corrected anomalies: {(df['anomalyflag_corrected'] == -1).sum()}")

Final corrected anomalies: 321


In [29]:
# Your exact anomaly classification function
def classify_anomaly(row, bigjump=2.0):
    if row['anomalyflag_corrected'] != -1:  # normal points
        return np.nan
    if row['volume_diff'] < -bigjump:
        return 'Sudden large drop'
    elif row['volume_diff'] > bigjump and row['prev_fuel_volume'] != 0:
        return 'Unexpected refill'
    elif abs(row['volume_diff']) < bigjump and row['prev_fuel_volume'] != 0:
        return 'Unusual small changes'
    elif row['Fuel Volume (L)'] < 0:
        return 'Negative volume (impossible)'
    else:
        return 'Extreme statistical outlier'

df['anomalytypes'] = df.apply(classify_anomaly, axis=1)
print("✅ Anomaly types classified")
print("\nAnomaly Type Distribution:")
print(df['anomalytypes'].value_counts(dropna=False))


✅ Anomaly types classified

Anomaly Type Distribution:
anomalytypes
NaN                            907
Unusual small changes          313
Sudden large drop                5
Extreme statistical outlier      2
Unexpected refill                1
Name: count, dtype: int64


In [33]:
print(df.columns.tolist())

['id', 'reading', 'reading_time', 'Fuel Volume (L)', 'volume_diff', 'prev_fuel_volume', 'rolling_mean', 'rolling_std', 'svmflag', 'anomalyflag_corrected', 'anomalytypes']


In [None]:
# 0 = normal, 1 = anomaly
df['true_label'] = 0

# Example 1: treat all corrected anomalies as true anomalies
df.loc[df['anomalyflag_corrected'] == -1, 'true_label'] = 1

# OR Example 2: if you only trust certain types
# df.loc[df['anomalytypes'].isin(['Sudden large drop', 'Unexpected refill']), 'true_label'] = 1