In [47]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [48]:
# Load the dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The recorded run emitted a warning on this line
# (presumably DtypeWarning for mixed-type columns -- several sensor columns
# load as `object`); single-pass parsing avoids chunk-dependent mixed types.
df = pd.read_csv(
    "/kaggle/input/obdii-ds3/exp1_14drivers_14cars_dailyRoutes.csv",
    low_memory=False,
)

# Basic info
print(df.shape)  # rows, columns
print(df.columns)  # column names
print(df.head())  # first 5 rows

(60439, 33)
Index(['TIMESTAMP', 'MARK', 'MODEL', 'CAR_YEAR', 'ENGINE_POWER', 'AUTOMATIC',
       'VEHICLE_ID', 'BAROMETRIC_PRESSURE(KPA)', 'ENGINE_COOLANT_TEMP',
       'FUEL_LEVEL', 'ENGINE_LOAD', 'AMBIENT_AIR_TEMP', 'ENGINE_RPM',
       'INTAKE_MANIFOLD_PRESSURE', 'MAF', 'LONG TERM FUEL TRIM BANK 2',
       'FUEL_TYPE', 'AIR_INTAKE_TEMP', 'FUEL_PRESSURE', 'SPEED',
       'SHORT TERM FUEL TRIM BANK 2', 'SHORT TERM FUEL TRIM BANK 1',
       'ENGINE_RUNTIME', 'THROTTLE_POS', 'DTC_NUMBER', 'TROUBLE_CODES',
       'TIMING_ADVANCE', 'EQUIV_RATIO', 'MIN', 'HOURS', 'DAYS_OF_WEEK',
       'MONTHS', 'YEAR'],
      dtype='object')
      TIMESTAMP       MARK  MODEL  CAR_YEAR ENGINE_POWER AUTOMATIC VEHICLE_ID  \
0  1.502903e+12  chevrolet  agile    2011.0          1,4         n       car1   
1  1.502903e+12  chevrolet  agile    2011.0          1,4         n       car1   
2  1.502903e+12  chevrolet  agile    2011.0          1,4         n       car1   
3  1.502903e+12  chevrolet  agile    2011.0   

  df = pd.read_csv("/kaggle/input/obdii-ds3/exp1_14drivers_14cars_dailyRoutes.csv")


In [49]:
# Column dtypes and non-null counts, to see how sparsely each column is populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60439 entries, 0 to 60438
Data columns (total 33 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   TIMESTAMP                    47514 non-null  float64
 1   MARK                         47459 non-null  object 
 2   MODEL                        47459 non-null  object 
 3   CAR_YEAR                     47459 non-null  float64
 4   ENGINE_POWER                 47459 non-null  object 
 5   AUTOMATIC                    47459 non-null  object 
 6   VEHICLE_ID                   47514 non-null  object 
 7   BAROMETRIC_PRESSURE(KPA)     10212 non-null  float64
 8   ENGINE_COOLANT_TEMP          33964 non-null  float64
 9   FUEL_LEVEL                   2994 non-null   object 
 10  ENGINE_LOAD                  30972 non-null  object 
 11  AMBIENT_AIR_TEMP             3619 non-null   float64
 12  ENGINE_RPM                   33859 non-null  float64
 13  INTAKE_MANIFOLD_

In [50]:
# Normalisation ceilings. These are assumptions, not values taken from the
# data; they only bring speed and RPM onto a comparable 0-1 scale.
MAX_SPEED_KMH = 220
MAX_ENGINE_RPM = 8000

df["mod_speed"] = df["SPEED"] / MAX_SPEED_KMH
df["mod_rpm"] = df["ENGINE_RPM"] / MAX_ENGINE_RPM

# Guard the denominator: a zero RPM reading with a positive speed would give
# +inf, which is not caught by fillna()/notna() and is rejected by
# scikit-learn estimators. Zero-RPM rows get NaN, like rows with missing RPM.
df["speed_rpm_ratio"] = df["mod_speed"] / df["mod_rpm"].replace(0, np.nan)

# Row-to-row speed change converted from km/h to m/s. This is a delta in m/s
# per sample, not a true m/s^2 figure, because the time between rows is not used.
# NOTE(review): diff() runs over the whole frame, so it also spans the boundary
# between consecutive rows of different VEHICLE_IDs -- confirm this is acceptable.
df["acceleration"] = df["SPEED"].diff() / 3.6


In [51]:
def clean_numeric(col):
    """Parse a column of number-like text into numeric values.

    Every value is rendered as a string, "%" characters are removed and ","
    is swapped for "." (decimal comma) before numeric parsing. Values that
    still cannot be parsed come back as NaN.
    """
    as_text = col.astype(str)
    without_percent = as_text.str.replace("%", "")
    dotted = without_percent.str.replace(",", ".")
    return pd.to_numeric(dotted, errors="coerce")

# Run clean_numeric over each column that later feeds the feature matrix X,
# overwriting the column in place with its parsed values.
columns_to_clean = (
    "ENGINE_LOAD",
    "THROTTLE_POS",
    "ENGINE_RPM",
    "SPEED",
    "speed_rpm_ratio",
    "acceleration",
)
for name in columns_to_clean:
    df[name] = df[name].pipe(clean_numeric)


In [52]:
# Feature matrix for clustering: the four raw signals plus the two derived columns.
X = df[["ENGINE_LOAD", "ENGINE_RPM", "SPEED", "THROTTLE_POS", "speed_rpm_ratio", "acceleration"]]

# Missing readings are imputed with 0 before standardising each feature.
# NOTE(review): zero-imputation treats a missing sensor value as a real zero
# reading, which can influence where the cluster boundary falls -- confirm
# this is intended.
X_filled = X.fillna(0)
scaler = StandardScaler()
scaler.fit(X_filled)
X_scaled = scaler.transform(X_filled)

# Two-cluster K-Means (intended as safe vs aggressive). The seed and n_init are
# fixed explicitly so repeated runs give the same labels.
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans.fit(X_scaled)
df["cluster"] = kmeans.labels_

In [53]:
# Number of rows assigned to each K-Means cluster.
print(df["cluster"].value_counts())

cluster
0    37737
1    22702
Name: count, dtype: int64


In [54]:
# Per-cluster means of the unscaled signals (columns of df, not X_scaled).
print(df.groupby("cluster")[["ENGINE_RPM", "SPEED", "ENGINE_LOAD", "THROTTLE_POS"]].mean())

          ENGINE_RPM      SPEED  ENGINE_LOAD  THROTTLE_POS
cluster                                                   
0         941.261533   2.270743    33.772596     12.966420
1        1803.045515  48.601843    43.331894     20.560901


In [55]:
# Take an independent copy of the feature columns and force every column to a
# numeric dtype; anything that cannot be parsed becomes NaN.
X = df[["ENGINE_LOAD", "ENGINE_RPM", "SPEED", "THROTTLE_POS", "speed_rpm_ratio", "acceleration"]].copy()
X = X.apply(pd.to_numeric, errors='coerce')

# Missing-value count per feature column.
print(X.isna().sum())

ENGINE_LOAD        29467
ENGINE_RPM         26580
SPEED              13910
THROTTLE_POS       26580
speed_rpm_ratio    27144
acceleration       14546
dtype: int64


In [56]:
# Report values that are present but cannot be parsed as numbers.
# The check is on convertibility (pd.to_numeric) rather than on the Python
# type of each element, so a numeric string such as "12" is not reported while
# "abc" is. Missing values are not reported here.
for col in X.columns:
    unparseable = X[col].notna() & pd.to_numeric(X[col], errors="coerce").isna()
    invalid = X[unparseable]
    if not invalid.empty:
        print(f"Column {col} has invalid values:")
        print(invalid[col])


In [57]:
# Features, coerced to numeric in the same step (non-numeric entries -> NaN).
feature_cols = ["ENGINE_LOAD", "ENGINE_RPM", "SPEED", "THROTTLE_POS", "speed_rpm_ratio", "acceleration"]
X = df.loc[:, feature_cols].apply(pd.to_numeric, errors='coerce')

# Labels: the cluster id assigned to each row.
y = df["cluster"].copy()

# Keep only rows where every feature and the label are present.
mask = X.notna().all(axis=1) & y.notna()
X, y = X[mask], y[mask]

In [58]:
# Sanity check after filtering: NaN count per feature column, then NaN count in the labels.
print(X.isna().sum())
print(y.isna().sum())


ENGINE_LOAD        0
ENGINE_RPM         0
SPEED              0
THROTTLE_POS       0
speed_rpm_ratio    0
acceleration       0
dtype: int64
0


In [59]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Seed the tree as well. scikit-learn permutes the features at every split, so
# without random_state the fitted tree (and the scores below) can differ
# between runs even though the split itself is seeded.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Evaluate on the held-out rows. y holds the K-Means cluster id, so these
# figures measure how closely the tree reproduces the clustering.
y_pred = dt.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.982448866777225
Confusion Matrix:
 [[1896   69]
 [  58 5213]]
