In [1]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.18.0-py3-none-any.whl.metadata (13 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.36.21-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.36.21-py3-none-any.whl.metadata (5.7 kB)
Collecting copulas>=0.12.0 (from sdv)
  Downloading copulas-0.12.1-py3-none-any.whl.metadata (9.4 kB)
Collecting ctgan>=0.10.2 (from sdv)
  Downloading ctgan-0.10.2-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.6.1 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.13.2 (from sdv)
  Downloading rdt-1.14.0-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.17.0 (from sdv)
  Downloading sdmetrics-0.18.0-py3-none-any.whl.metadata (8.8 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.12.0,>=0.11.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s

In [2]:
import numpy as np
import pandas as pd
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from datetime import datetime, timedelta


In [3]:
# Sensor Specifications
# The data-generation cell turns a pressure P into a voltage with (A * P + B) * VDD.
PL = 10.0      # Lower end of the pressure range (kPa)
PH = 115.0     # Upper end of the pressure range (kPa)
OL = 0.400     # Lower end of the output voltage range (V)
OH = 4.650     # Upper end of the output voltage range (V)
A = 0.008095   # Gain
B = -0.000952  # Offset
VDD = 5.0      # Supply voltage

In [4]:
# Define vehicle states with expected ranges.
# Each entry is (state name, (pressure low, pressure high), (voltage low, voltage high)).
# NOTE(review): for 95-105 kPa the transfer function (A * P + B) * VDD used in the
# data-generation cell gives roughly 3.84-4.25 V, which does not overlap the
# 4.5-4.9 V listed for "Engine OFF" - confirm these ranges are intended.
vehicle_states = [
    (
        "Engine OFF",
        (95, 105),
        (4.5, 4.9),
    ),
    (
        "Idle",
        (20, 35),
        (0.9, 1.5),
    ),
    (
        "Acceleration",
        (40, 70),
        (2.0, 3.5),
    ),
    (
        "Throttle Release",
        (10, 30),
        (0.5, 1.3),
    ),
    (
        "Wide Open Throttle",
        (95, 101),
        (4.0, 4.8),
    ),
]

In [5]:
# Generate initial real-like data
# Seed NumPy's global RNG so every np.random draw in the notebook is repeatable on
# Restart & Run All. This only covers NumPy; the synthesizer's own randomness is
# presumably not controlled by it.
SEED = 42
np.random.seed(SEED)

num_samples = 5000

# Uniform pressure over [PL, PH] plus Gaussian noise (std 2). The noise pushes some
# readings outside [PL, PH]; get_fault_type labels those rows "Sensor Fault".
pressure_values = np.random.uniform(PL, PH, num_samples) + np.random.normal(0, 2, num_samples)

# Voltage from the transfer function (A * P + B) * VDD plus Gaussian noise (std 0.1).
voltage_values = ((A * pressure_values + B) * VDD) + np.random.normal(0, 0.1, num_samples)

In [6]:
# Ensure some voltage values are exactly 0 or 5.
# get_fault_type maps exactly 0.0 to "Wiring Issue" and exactly 5.0 to "Power Supply Issue".
num_special_cases = int(0.15 * num_samples)
special_indices = np.random.choice(num_samples, num_special_cases, replace=False)

first_cut = num_special_cases // 3
second_cut = num_special_cases * 2 // 3

# First third of the drawn rows -> 0.0 V, second third -> 5.0 V.
# NOTE(review): the final third of special_indices is drawn but never modified - confirm intent.
voltage_values[special_indices[:first_cut]] = 0.0
voltage_values[special_indices[first_cut:second_cut]] = 5.0

In [7]:
# Assign vehicle states
def get_vehicle_state(pressure, voltage, states=None):
    """Return the name of the state whose range midpoints are nearest to a reading.

    The distance to a state is |pressure - pressure midpoint| + |voltage - voltage
    midpoint|; the state with the smallest distance wins, and on a tie the state
    listed first is kept.

    Args:
        pressure: Pressure reading, in the same unit as the states' pressure ranges.
        voltage: Voltage reading, in the same unit as the states' voltage ranges.
        states: Optional iterable of (name, (p_low, p_high), (v_low, v_high))
            entries. Defaults to the notebook-level ``vehicle_states`` list.

    Returns:
        The nearest state's name, or "Unknown" when ``states`` is empty or no
        distance compares smaller than infinity (e.g. a NaN reading).
    """
    if states is None:
        # Looked up at call time so the notebook-level table stays the default.
        states = vehicle_states

    closest_state = "Unknown"
    min_distance = float("inf")

    for state, p_range, v_range in states:
        p_mid = sum(p_range) / 2
        v_mid = sum(v_range) / 2
        # NOTE(review): the two terms are summed without rescaling, so with the
        # notebook's ranges pressure differences dominate voltage differences.
        distance = abs(pressure - p_mid) + abs(voltage - v_mid)

        # Strict "<" keeps the first of equally near states and never selects a
        # state whose distance is NaN.
        if distance < min_distance:
            min_distance, closest_state = distance, state

    return closest_state

In [8]:
# Label every (pressure, voltage) pair with its nearest vehicle state.
vehicle_state_values = list(map(get_vehicle_state, pressure_values, voltage_values))


In [9]:
# Assign fault types
def get_fault_type(pressure, voltage):
    """Classify one reading as a fault type or "Normal".

    The voltage checks come first, so a reading at exactly 0.0 V or 5.0 V is
    reported as a wiring / power-supply issue even when its pressure is also
    outside [PL, PH].

    Args:
        pressure: Pressure reading, compared against the notebook-level PL and PH.
        voltage: Voltage reading, compared for exact equality with 0.0 and 5.0.

    Returns:
        "Wiring Issue", "Power Supply Issue", "Sensor Fault" or "Normal".
    """
    if voltage == 0.0:
        return "Wiring Issue"
    if voltage == 5.0:
        return "Power Supply Issue"

    # Written as two comparisons so a NaN pressure (both False) still yields "Normal".
    outside_pressure_range = pressure < PL or pressure > PH
    return "Sensor Fault" if outside_pressure_range else "Normal"

In [10]:
# Label every (pressure, voltage) pair with its fault type.
fault_types = list(map(get_fault_type, pressure_values, voltage_values))


In [22]:
# Balance fault conditions
# Resample so every fault type contributes the requested number of rows: classes
# with enough rows are down-sampled without replacement, smaller classes keep all
# their rows and are topped up by sampling with replacement.
# NOTE: this cell overwrites its own inputs, so re-running it resamples the already
# resampled arrays; re-run from the data-generation cell instead.
fault_counts = {"Normal": 2000, "Sensor Fault": 2000, "Wiring Issue": 2000, "Power Supply Issue": 2000}
fault_labels = np.asarray(fault_types)
selected_indices = []

for fault, count in fault_counts.items():
    indices = np.flatnonzero(fault_labels == fault)
    if indices.size == 0:
        # np.random.choice raises ValueError when asked to sample from an empty
        # population, so a class with no rows is reported and skipped.
        print(f"Warning: no '{fault}' rows available; class skipped during balancing")
        continue
    if indices.size >= count:
        selected_indices.extend(np.random.choice(indices, count, replace=False))
    else:
        selected_indices.extend(indices)
        additional_indices = np.random.choice(indices, count - indices.size, replace=True)
        selected_indices.extend(additional_indices)

# Apply the same row selection to every parallel array so they stay aligned.
pressure_values = np.asarray(pressure_values)[selected_indices]
voltage_values = np.asarray(voltage_values)[selected_indices]
vehicle_state_values = np.asarray(vehicle_state_values)[selected_indices]
fault_types = fault_labels[selected_indices]


In [23]:
# Reduce normal condition
# Randomly drop a fixed fraction of the "Normal" rows and keep every fault row.
# NOTE: this cell overwrites its own inputs, so each re-run drops a further
# fraction of the remaining "Normal" rows.
NORMAL_DROP_FRACTION = 0.4

fault_labels = np.asarray(fault_types)
# np.flatnonzero returns integer index arrays, so the concatenated selection keeps
# an integer dtype (valid for indexing) even when one of the two groups is empty.
normal_indices = np.flatnonzero(fault_labels == "Normal")
fault_indices = np.flatnonzero(fault_labels != "Normal")

reduce_normal = int(NORMAL_DROP_FRACTION * len(normal_indices))
selected_normals = np.random.choice(normal_indices, len(normal_indices) - reduce_normal, replace=False)
# Kept "Normal" rows come first, followed by the fault rows in their existing order.
selected_indices = np.concatenate((selected_normals, fault_indices))

pressure_values = np.asarray(pressure_values)[selected_indices]
voltage_values = np.asarray(voltage_values)[selected_indices]
vehicle_state_values = np.asarray(vehicle_state_values)[selected_indices]
fault_types = fault_labels[selected_indices]

In [24]:
# Generate timestamps: one per selected row, spaced one second apart from "now".
# NOTE(review): rows are ordered by group at this point ("Normal" rows first, then
# fault rows), so the timestamp tracks the fault type - confirm this is intended.
start_time = datetime.now()
one_second = timedelta(seconds=1)
timestamps = [start_time + step * one_second for step in range(len(selected_indices))]

In [25]:
# Create DataFrame: one column per parallel array, in this order.
column_data = {
    "Timestamp": timestamps,
    "Pressure (kPa)": pressure_values,
    "Voltage (V)": voltage_values,
    "Vehicle State": vehicle_state_values,
    "Fault Type": fault_types,
}
real_data = pd.DataFrame(column_data)

In [26]:
# Define metadata: auto-detect the column types from the training frame, then
# explicitly mark the two label columns as categorical.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(real_data)
for categorical_column in ("Vehicle State", "Fault Type"):
    metadata.update_column(categorical_column, sdtype="categorical")


In [27]:
# Train CTGAN model on the balanced frame, using the metadata defined above.
ctgan = CTGANSynthesizer(metadata)
ctgan.fit(real_data)




In [28]:
# Generate synthetic data: draw 10,000 rows from the fitted synthesizer.
synthetic_data = ctgan.sample(num_rows=10_000)

In [29]:
# Ensure only valid vehicle states exist in synthetic data:
# keep rows whose "Vehicle State" is one of the names defined in vehicle_states.
valid_states = set(name for name, _p_range, _v_range in vehicle_states)
synthetic_data = synthetic_data.loc[synthetic_data["Vehicle State"].isin(valid_states)]

In [30]:
# Keep only rows whose "Fault Type" is one of the four expected categories.
valid_faults = {
    "Normal",
    "Sensor Fault",
    "Wiring Issue",
    "Power Supply Issue",
}
synthetic_data = synthetic_data.loc[synthetic_data["Fault Type"].isin(valid_faults)]


In [31]:
# Display sample dataset: plain-text preview of the first five synthetic rows.
print(synthetic_data.head(n=5))


                   Timestamp  Pressure (kPa)  Voltage (V)     Vehicle State  \
0 2025-02-15 09:05:36.630369       92.079675     0.374314  Throttle Release   
1 2025-02-15 08:56:44.746994       64.540842     4.585941      Acceleration   
2 2025-02-15 08:33:31.536049       18.759910     1.664302  Throttle Release   
3 2025-02-15 09:53:47.749229       25.466888     4.927554              Idle   
4 2025-02-15 09:29:16.698882       10.509797     0.494690  Throttle Release   

     Fault Type  
0  Sensor Fault  
1        Normal  
2        Normal  
3  Wiring Issue  
4  Sensor Fault  


In [32]:
# Save synthetic data to CSV
# The file is written before the Colab import so it is produced in any environment.
output_path = "synthetic_dataset.csv"
synthetic_data.to_csv(output_path, index=False)

# Download the file
# google.colab only exists inside Colab; elsewhere, report where the file was saved
# instead of failing the cell.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Google Colab; dataset saved to {output_path}")
else:
    files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>