In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from sklearn.svm import OneClassSVM

In [4]:
import matplotlib.pyplot as plt

In [5]:
from sklearn.preprocessing import StandardScaler

In [40]:
from sklearn.model_selection import ParameterGrid

In [6]:
df = pd.read_csv('records_v2.csv')

In [7]:
print(df.columns.tolist())

['id', 'reading', 'reading_time', 'Fuel Volume (L)']


In [56]:
df['Fuel Volume (L)'] = pd.to_numeric(df['Fuel Volume (L)'], errors='coerce')

In [65]:
df['reading_time'] = pd.to_datetime(df['reading_time'], errors='coerce')

In [66]:
df = df.sort_values(['reading_time', 'id']).reset_index(drop=True)

In [67]:
df['volume_diff'] = df['Fuel Volume (L)'].diff()

In [68]:
df['volume_diff'] = df['Fuel Volume (L)'].diff()

In [69]:
df['prev_fuel_volume'] = df['Fuel Volume (L)'].shift(1)

In [70]:
df['rolling_mean'] = df['Fuel Volume (L)'].rolling(window=10, min_periods=5).mean()
df['rolling_std'] = df['Fuel Volume (L)'].rolling(window=10, min_periods=5).std()

In [80]:
refill_after_zero = (df['prev_fuel_volume'] == 0) & (df['volume_diff'] > 0)

In [71]:
print("✅ Data preparation complete")
print(f"Dataset shape: {df.shape}")
print(f"Features: volume_diff, prev_fuel_volume, rolling_mean, rolling_std")

✅ Data preparation complete
Dataset shape: (1228, 12)
Features: volume_diff, prev_fuel_volume, rolling_mean, rolling_std


In [72]:
features = ["volume_diff", "rolling_mean", "rolling_std"]
X = df[features].values
y = df["true_label"].values  

In [73]:
split_idx = int(len(df) * 0.7)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

In [74]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [75]:
# remove rows where any feature is NaN
features = ["volume_diff", "rolling_mean", "rolling_std"]

df_clean = df.dropna(subset=features).copy()
df_clean = df_clean.sort_values("reading_time").reset_index(drop=True)

X = df_clean[features].values
y = df_clean["true_label"].values


In [76]:
print(df.columns)

Index(['id', 'reading', 'reading_time', 'Fuel Volume (L)', 'volume_diff',
       'prev_fuel_volume', 'rolling_mean', 'rolling_std', 'svmflag',
       'anomalyflag_corrected', 'anomalytypes', 'true_label'],
      dtype='object')


In [82]:
split_idx = int(len(df_clean) * 0.7)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

ocsvm = OneClassSVM(kernel="rbf", nu=0.04, gamma="scale")
ocsvm.fit(X_train_scaled)


0,1,2
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,nu,0.04
,shrinking,True
,cache_size,200
,verbose,False
,max_iter,-1


In [83]:
y_pred_raw = ocsvm.predict(X_test_scaled)
y_pred = (y_pred_raw == -1).astype(int) 

In [85]:
# Your exact anomaly classification function
def classify_anomaly(row, bigjump=2.0):
    if row['anomalyflag_corrected'] != -1:  # normal points
        return np.nan
    if row['volume_diff'] < -bigjump:
        return 'Sudden large drop'
    elif row['volume_diff'] > bigjump and row['prev_fuel_volume'] != 0:
        return 'Unexpected refill'
    elif abs(row['volume_diff']) < bigjump and row['prev_fuel_volume'] != 0:
        return 'Unusual small changes'
    elif row['Fuel Volume (L)'] < 0:
        return 'Negative volume (impossible)'
    else:
        return 'Extreme statistical outlier'

df['anomalytypes'] = df.apply(classify_anomaly, axis=1)
print("✅ Anomaly types classified")
print("\nAnomaly Type Distribution:")
print(df['anomalytypes'].value_counts(dropna=False))


✅ Anomaly types classified

Anomaly Type Distribution:
anomalytypes
NaN                            907
Unusual small changes          313
Sudden large drop                5
Extreme statistical outlier      2
Unexpected refill                1
Name: count, dtype: int64


In [86]:
from sklearn.metrics import accuracy_score, classification_report

# One‑Class SVM outputs: +1 = normal, -1 = anomaly
# Convert to 0/1 to match your labels (0 normal, 1 anomaly)

# ----- Training accuracy -----
y_train_raw = ocsvm.predict(X_train_scaled)
y_train_pred = (y_train_raw == -1).astype(int)

train_acc = accuracy_score(y_train, y_train_pred)
print("Training accuracy:", train_acc)
print("\nTraining classification report:")
print(classification_report(y_train, y_train_pred,
                            target_names=["Normal", "Anomaly"]))

# ----- Testing accuracy -----
y_test_raw = ocsvm.predict(X_test_scaled)
y_test_pred = (y_test_raw == -1).astype(int)

test_acc = accuracy_score(y_test, y_test_pred)
print("Testing accuracy:", test_acc)
print("\nTesting classification report:")
print(classification_report(y_test, y_test_pred,
                            target_names=["Normal", "Anomaly"]))


Training accuracy: 0.63572267920094

Training classification report:
              precision    recall  f1-score   support

      Normal       0.64      0.97      0.77       545
     Anomaly       0.44      0.05      0.09       306

    accuracy                           0.64       851
   macro avg       0.54      0.51      0.43       851
weighted avg       0.57      0.64      0.53       851

Testing accuracy: 0.9726027397260274

Testing classification report:
              precision    recall  f1-score   support

      Normal       1.00      0.97      0.99       350
     Anomaly       0.61      0.93      0.74        15

    accuracy                           0.97       365
   macro avg       0.80      0.95      0.86       365
weighted avg       0.98      0.97      0.98       365



In [84]:
print("Baseline accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:")
print(classification_report(y_test, y_pred,
                            target_names=["Normal", "Anomaly"]))

Baseline accuracy: 0.9726027397260274

Classification report:
              precision    recall  f1-score   support

      Normal       1.00      0.97      0.99       350
     Anomaly       0.61      0.93      0.74        15

    accuracy                           0.97       365
   macro avg       0.80      0.95      0.86       365
weighted avg       0.98      0.97      0.98       365



In [15]:
feature_cols = ['volume_diff', 'rolling_mean', 'rolling_std']
X = df[feature_cols].dropna()

In [16]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [29]:
from sklearn.svm import OneClassSVM

# 1) create the model  (NO random_state here)
ocsvm = OneClassSVM(kernel='rbf', nu=0.03, gamma='scale')

# 2) fit the model on your scaled features
ocsvm.fit(X_scaled)          # <-- this line is missing in your screenshot

# 3) then predict and save flags
df.loc[X.index, 'svmflag'] = ocsvm.predict(X_scaled)

In [31]:
# Start from SVM anomalies
is_anomaly = df['svmflag'] == -1

# Do NOT take records that are refill-after-zero
valid_anomaly_mask = is_anomaly & (~refill_after_zero)

# Final corrected flag according to your new rule
df['anomalyflag_corrected'] = 0
df.loc[valid_anomaly_mask, 'anomalyflag_corrected'] = 1   # 1 = anomaly, 0 = normal


In [32]:
legit_refill = (df['prev_fuel_volume'] == 0) & (df['volume_diff'] > 2)
df.loc[legit_refill, 'svmflag'] = 1  # Mark as normal

In [33]:
extreme_volume = (df['Fuel Volume (L)'] > df['rolling_mean'] + 3*df['rolling_std']) | \
                 (df['Fuel Volume (L)'] < df['rolling_mean'] - 3*df['rolling_std'])
df.loc[extreme_volume, 'svmflag'] = -1

In [34]:
negative_volume = df['Fuel Volume (L)'] < 0
df.loc[negative_volume, 'svmflag'] = -1

In [35]:
df['anomalyflag_corrected'] = df['svmflag']
print(f"Final corrected anomalies: {(df['anomalyflag_corrected'] == -1).sum()}")

Final corrected anomalies: 321


✅ Anomaly types classified

Anomaly Type Distribution:
anomalytypes
NaN                            907
Unusual small changes          313
Sudden large drop                5
Extreme statistical outlier      2
Unexpected refill                1
Name: count, dtype: int64


In [37]:
print(df.columns.tolist())

['id', 'reading', 'reading_time', 'Fuel Volume (L)', 'volume_diff', 'prev_fuel_volume', 'rolling_mean', 'rolling_std', 'svmflag', 'anomalyflag_corrected', 'anomalytypes', 'true_label']


In [38]:
# 0 = normal, 1 = anomaly
df['true_label'] = 0

# Example 1: treat all corrected anomalies as true anomalies
df.loc[df['anomalyflag_corrected'] == -1, 'true_label'] = 1

# OR Example 2: if you only trust certain types
# df.loc[df['anomalytypes'].isin(['Sudden large drop', 'Unexpected refill']), 'true_label'] = 1