Imports and engine

In [2]:
!pip install pandas sqlalchemy matplotlib

Collecting sqlalchemy
  Obtaining dependency information for sqlalchemy from https://files.pythonhosted.org/packages/69/ac/b42ad16800d0885105b59380ad69aad0cce5a65276e269ce2729a2343b6a/sqlalchemy-2.0.46-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading sqlalchemy-2.0.46-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.5 kB)
Downloading sqlalchemy-2.0.46-cp311-cp311-macosx_11_0_arm64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: sqlalchemy
Successfully installed sqlalchemy-2.0.46

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
from pathlib import Path
from sqlalchemy import create_engine
import pandas as pd

# Locate the SQLite database relative to the project root.
# Assumes the kernel's working directory is etl-ai-schema/notebooks, so the
# parent of the working directory is the project root (.../etl-ai-schema).
project_root = Path.cwd().parent
db_path = project_root / "data" / "database.db"

print("Project root:", project_root)
print("DB path:", db_path)

# Fail fast when the working-directory assumption does not hold: connecting to
# a missing file can make SQLite create a new, empty database, and the later
# SELECTs would then fail with a misleading "no such table" error.
if not db_path.is_file():
    raise FileNotFoundError(
        f"SQLite database not found at {db_path}; "
        "start the notebook from the project's notebooks/ directory."
    )

# Absolute POSIX paths yield four slashes in the URL (sqlite:////...), which is expected.
engine = create_engine(f"sqlite:///{db_path}")
engine

Project root: /Users/rajakarthikchirumamilla/Documents/ThesisWork/etl-ai-schema
DB path: /Users/rajakarthikchirumamilla/Documents/ThesisWork/etl-ai-schema/data/database.db


Engine(sqlite:////Users/rajakarthikchirumamilla/Documents/ThesisWork/etl-ai-schema/data/database.db)

List tables

In [13]:
# Query SQLite's sqlite_master catalog for the name of every table in the database.
table_names_sql = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql(table_names_sql, engine)
tables

Unnamed: 0,name
0,yellow_trips_v1
1,yellow_trips_v2


Load V1 and V2

In [14]:
# Read both versions of the trips table fully into memory, one DataFrame each.
v1_sql = "SELECT * FROM yellow_trips_v1"
v2_sql = "SELECT * FROM yellow_trips_v2"
df_v1 = pd.read_sql(v1_sql, engine)
df_v2 = pd.read_sql(v2_sql, engine)

# Row counts first, then each frame's column index, for a quick side-by-side check.
(len(df_v1), len(df_v2), df_v1.columns, df_v2.columns)

(99376,
 99376,
 Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
        'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
        'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
        'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
        'total_amount', 'congestion_surcharge', 'airport_fee',
        'trip_duration_minutes', 'trip_revenue'],
       dtype='str'),
 Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
        'passenger_count', 'trip_km', 'RatecodeID', 'store_and_fwd_flag',
        'PULocationID', 'DOLocationID', 'fare_amount', 'extra', 'mta_tax',
        'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount',
        'congestion_surcharge', 'airport_fee', 'tip_ratio',
        'trip_duration_minutes', 'trip_revenue'],
       dtype='str'))

Focus on key metrics

In [15]:
# Print each version's column names as a plain list so the schema differences are easy to spot.
for version_label, version_frame in (("V1", df_v1), ("V2", df_v2)):
    print(f"{version_label} columns:", version_frame.columns.tolist())

V1 columns: ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'airport_fee', 'trip_duration_minutes', 'trip_revenue']
V2 columns: ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_km', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'airport_fee', 'tip_ratio', 'trip_duration_minutes', 'trip_revenue']


In [16]:
# Build comparable views of both versions so the same metric names apply to each.
df_v1_common = df_v1.copy()

# V2 carries distance as trip_km; expose the same values under trip_distance as well
# (trip_km itself is kept).
# NOTE(review): the values are copied unchanged, with no unit conversion — confirm that
# V2's trip_km and V1's trip_distance are expressed in the same unit before comparing.
df_v2_common = df_v2.assign(trip_distance=df_v2["trip_km"])

metrics = ["fare_amount", "trip_distance"]

# Independent copies restricted to the metrics of interest.
df_v1_metrics = df_v1_common.loc[:, metrics].copy()
df_v2_metrics = df_v2_common.loc[:, metrics].copy()

# Summary statistics for each version, each preceded by its banner.
for banner, metric_frame in (
    ("=== V1 (baseline) ===", df_v1_metrics),
    ("=== V2 (after evolution, AI-assisted) ===", df_v2_metrics),
):
    print(banner)
    display(metric_frame.describe())

=== V1 (baseline) ===


Unnamed: 0,fare_amount,trip_distance
count,99376.0,99376.0
mean,11.858665,2.798451
std,9.851564,3.407493
min,0.01,0.01
25%,6.5,1.0
50%,9.0,1.68
75%,13.5,3.0
max,285.0,71.9


=== V2 (after evolution, AI-assisted) ===


Unnamed: 0,fare_amount,trip_distance
count,99376.0,99376.0
mean,11.858665,2.798451
std,9.851564,3.407493
min,0.01,0.01
25%,6.5,1.0
50%,9.0,1.68
75%,13.5,3.0
max,285.0,71.9
