# Data Validation

This notebook will be used for data validation.
1. Pre-processing data (e.g., scale and split into train & test)
2. Validate data (screating schema)

1. Pre-processing data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate, RandomizedSearchCV

In [6]:
ttc = pd.read_csv('data/ttc-bus-delay-data-2024.csv', parse_dates=['Date'])
ttc.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/ttc-bus-delay-data-2024.csv'

In [4]:
ttc1 = ttc.copy()
ttc1['Time'] = pd.to_datetime(ttc['Time']).dt.time
ttc1['Date_'] = ttc1['Date'].dt.date
ttc1['Month'] = ttc1['Date'].dt.month
ttc1['Hour'] = ttc1['Time'].map(lambda x: x.hour)
ttc1 = ttc1.drop(columns=['Date', 'Time'])
ttc_clean = ttc1.drop(columns=['Direction', 'Vehicle'])
ttc_clean = ttc_clean.dropna()
ttc_clean.isna().sum()

  ttc1['Time'] = pd.to_datetime(ttc['Time']).dt.time


Route        0
Day          0
Location     0
Incident     0
Min Delay    0
Min Gap      0
Date_        0
Month        0
Hour         0
dtype: int64

In [5]:
ttc_clean.reset_index(drop=True, inplace=True)

In [6]:
ttc_lr = ttc_clean.loc[(ttc_clean["Min Delay"]<30) & (ttc_clean["Min Delay"]>0)].reset_index(drop=True)

In [7]:
#Split dataset into target and features
X = ttc_lr[["Route","Incident","Location","Day","Hour","Month"]]
y = ttc_lr['Min Delay']

In [8]:
numeric_features=["Hour","Month"]
categorical_features = ['Location', 'Route', 'Incident',"Day"]

In [9]:
#Create transformers and preprocessing pipeline
preprocessor = ColumnTransformer(
   transformers=[
       ('num', StandardScaler(), numeric_features),
       ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
   ]
)
preprocessor

In [10]:
#Create Model pipeline
model_pipeline = Pipeline(steps=[
   ('preprocessor', preprocessor),
   ('model', LogisticRegression(random_state=123, max_iter=2000))
])
model_pipeline

In [30]:
#Split dataset into train and test data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [31]:
cv_pipe=cross_validate(model_pipeline, X_train, y_train, cv=5, return_train_score=True)
results_df=pd.DataFrame(pd.DataFrame(cv_pipe))
results_df

Unnamed: 0,fit_time,score_time,test_score,train_score
0,35.125873,0.047698,0.361283,0.491468
1,10.296951,0.045638,0.368906,0.48872
2,10.234558,0.049576,0.364829,0.493995
3,10.272444,0.053283,0.363056,0.491424
4,9.18912,0.045144,0.367553,0.490028


2.Validate data (screating schema)

Correct data file format

In [1]:
import os
import pandas as pd

file_path = 'data/ttc-bus-delay-data-2024.csv'

if os.path.exists(file_path) and file_path.endswith('.csv'):
    try:
        # Load the CSV file and parse dates
        ttc = pd.read_csv(file_path, parse_dates=['Date'])
        print("File loaded successfully!")
    except Exception as e:
        print(f"Error loading file: {e}")
else:
    print("Error: File is either missing or not in CSV format.")

File loaded successfully!


  ttc = pd.read_csv(file_path, parse_dates=['Date'])


The warning suggests that the pd.read_csv() method couldn't infer a uniform date format for the Date column. This happens when the column contains inconsistent date formats or unparseable values. This was fixed when we used parse_dates=['Date'] in ttc = pd.read_csv('data/ttc-bus-delay-data-2024.csv', parse_dates=['Date'])

Schema

In [11]:
import pandera as pa
from pandera.typing import Series
from pandera.typing import DataFrame

In [None]:
# validate data
schema = pa.DataFrameSchema(
    {
        "Date": pa.Column(str, checks=[pa.Check.str_matches(r"\d{4}-\d{2}-\d{2}", error="Date must match YYYY-MM-DD format")]),
        "Route": pa.Column(int, checks=[pa.Check.ge(0, error="Route must be a non-negative integer")]),
        "Time": pa.Column(str, checks=[pa.Check.str_matches(r"\d{2}:\d{2}", error="Time must match HH:MM format")]),
        "Day": pa.Column(str, checks=[pa.Check.isin(
                    ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
                    error="Day must be a valid weekday")]),
        "Location": pa.Column(str),
        "Incident": pa.Column(str, checks=[pa.Check.isin(
                    ["Cleaning - Unsanitary", "Collision - TTC", "Mechanical", "Operations - Operator", "Diversion", "Emergency Services", "Utilized Off Route", "Investigation", "Road Blocked - NON-TTC Collision", "Vision", "General Delay", "Security"], # known incident types
                    error="Incident must be one of: Vision, General Delay, Security")]),
        "Min Delay": pa.Column(int, checks=[pa.Check.ge(0, error="Min Delay must be non-negative"),pa.Check.le(1440, error="Min Delay must not exceed 1440 minutes (24 hours)")]),
        "Min Gap": pa.Column(int, checks=[pa.Check.ge(0, error="Min Gap must be non-negative"),pa.Check.le(1440, error="Min Gap must not exceed 1440 minutes (24 hours)")]),
        "Direction": pa.Column(str, checks=[pa.Check.isin(["N", "S", "E", "W", None], error="Direction must be N, S, E, W, or missing")],nullable=True),
        "Vehicle": pa.Column(int, checks=[pa.Check.ge(0, error="Vehicle must be a non-negative integer")]),
    },
    checks=[
        pa.Check(lambda df: ~df.duplicated().any(), error="Duplicate rows found"),
        pa.Check(lambda df: ~(df.isna().all(axis=1)).any(), error="Empty rows found"),
        pa.Check(lambda df: df["Min Delay"].isna().mean() <= 0.05, error="Min Delay missingness exceeds 5%"),
        pa.Check(lambda df: df["Min Gap"].isna().mean() <= 0.05, error="Min Gap missingness exceeds 5%"),
    ],
)

# additional data validation for the dataframe
def additional_validations(df: DataFrame):
    # check for correct column names
    required_columns = {"Date", "Route", "Time", "Day", "Location", "Incident", "Min Delay", "Min Gap", "Direction", "Vehicle"}
    missing_columns = required_columns - set(df.columns)
    assert not missing_columns, f"Missing columns: {missing_columns}"

    # check for outliers (e.g., Min Delay or Min Gap unreasonably large)
    assert (df["Min Delay"] <= 1440).all(), "Outlier found in Min Delay"
    assert (df["Min Gap"] <= 1440).all(), "Outlier found in Min Gap"

    # check category levels
    assert df["Day"].isin(["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]).all(), \
        "Invalid Day values"
    assert df["Incident"].isin(["Cleaning - Unsanitary", "Collision - TTC", "Mechanical", "Operations - Operator", "Diversion", "Emergency Services", "Utilized Off Route", "Investigation", "Road Blocked - NON-TTC Collision", "Vision", "General Delay", "Security"]).all(), "Invalid Incident values"

    # target variable distribution (Example: ensure Min Delay isn't mostly 0)
    assert (df["Min Delay"] > 0).mean() > 0.05, "Min Delay mostly zero, check for data skewness"

    # correlation checks
    corr_matrix = df.corr(numeric_only=True)
    assert not corr_matrix.isnull().values.any(), "Anomalous correlations detected in numeric features"
    print("All checks passed")


schema.validate(ttc, lazy=True)
additional_validations(ttc)

NameError: name 'ttc' is not defined