Import Required Libraries

In [1]:
import pandas as pd
import numpy as np


Load the Raw Dataset

In [2]:
df = pd.read_csv("/content/component4_performance_dataset (3).csv")


Create a Working Copy

In [3]:
df_clean = df.copy()


Initial Structure Check

In [4]:
df_clean.shape
df_clean.head()
df_clean.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2683 entries, 0 to 2682
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   vehicle_id     2683 non-null   int64  
 1   duration       2683 non-null   float64
 2   waiting_time   2683 non-null   float64
 3   route_length   2683 non-null   float64
 4   time_loss      2683 non-null   float64
 5   avg_speed      2683 non-null   float64
 6   waiting_ratio  2683 non-null   float64
dtypes: float64(6), int64(1)
memory usage: 146.9 KB


Handle Duplicate Records

In [5]:
# Check duplicates
duplicates = df_clean.duplicated().sum()
print("Duplicate rows:", duplicates)

# Remove duplicates if present
df_clean.drop_duplicates(inplace=True)


Duplicate rows: 0


Missing Value Analysis

In [6]:
df_clean.isnull().sum()


Unnamed: 0,0
vehicle_id,0
duration,0
waiting_time,0
route_length,0
time_loss,0
avg_speed,0
waiting_ratio,0


In [8]:
# Numerical columns
num_cols = df_clean.select_dtypes(include=["int64", "float64"]).columns

for col in num_cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].median())


# Categorical columns
cat_cols = df_clean.select_dtypes(include=["object"]).columns

for col in cat_cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])


Data Type Validation & Correction

In [9]:
for col in num_cols:
    df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce")


In [10]:
df_clean.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2683 entries, 0 to 2682
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   vehicle_id     2683 non-null   int64  
 1   duration       2683 non-null   float64
 2   waiting_time   2683 non-null   float64
 3   route_length   2683 non-null   float64
 4   time_loss      2683 non-null   float64
 5   avg_speed      2683 non-null   float64
 6   waiting_ratio  2683 non-null   float64
dtypes: float64(6), int64(1)
memory usage: 146.9 KB


Logical Constraint Validation

In [11]:
for col in num_cols:
    df_clean = df_clean[df_clean[col] >= 0]


Outlier Treatment (IQR Capping â€“ FIXED TIME APPROACH)

In [12]:
def iqr_cap(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    df[column] = np.where(
        df[column] < lower, lower,
        np.where(df[column] > upper, upper, df[column])
    )
    return df


Apply to All Numerical Columns

In [13]:
for col in num_cols:
    df_clean = iqr_cap(df_clean, col)


Categorical Data Standardization

In [14]:
for col in cat_cols:
    df_clean[col] = df_clean[col].str.strip().str.lower()


Feature Scaling (Post-Cleaning Only)

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_clean[num_cols] = scaler.fit_transform(df_clean[num_cols])


Final Clean Dataset Validation

In [16]:
df_clean.shape
df_clean.info()
df_clean.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2683 entries, 0 to 2682
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   vehicle_id     2683 non-null   float64
 1   duration       2683 non-null   float64
 2   waiting_time   2683 non-null   float64
 3   route_length   2683 non-null   float64
 4   time_loss      2683 non-null   float64
 5   avg_speed      2683 non-null   float64
 6   waiting_ratio  2683 non-null   float64
dtypes: float64(7)
memory usage: 146.9 KB


Unnamed: 0,vehicle_id,duration,waiting_time,route_length,time_loss,avg_speed,waiting_ratio
count,2683.0,2683.0,2683.0,2683.0,2683.0,2683.0,2683.0
mean,-8.474606e-17,1.059326e-16,-1.0593260000000002e-17,3.965851e-16,-1.906786e-16,-2.012719e-16,-1.059326e-16
std,1.000186,1.000186,1.000186,1.000186,1.000186,1.000186,1.000186
min,-1.584887,-1.630813,-0.7541511,-2.150651,-1.011954,-2.343509,-0.8209951
25%,-0.8594538,-0.7259155,-0.7541511,-0.7604248,-0.7374089,-0.5513972,-0.8209951
50%,-0.09080752,-0.2474096,-0.4937297,-0.05426467,-0.4352191,0.1104456,-0.4163284
75%,0.7988344,0.4513983,0.330938,0.6708448,0.3463408,0.6433444,0.4554717
max,2.261045,2.217369,1.958572,2.817749,1.971965,2.435457,2.370172


Save Cleaned Dataset


In [17]:
df_clean.to_csv("component4_cleaned_dataset.csv", index=False)
