In [0]:
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import countDistinct,coalesce,col,lit,abs,unix_timestamp

Reading the table into a dataframe

In [0]:
# Load the source table into a Spark DataFrame; the profiling cells below read from `df`.
source_table = 'nyc_taxi.bronze.green_taxi_trips'
df = spark.read.table(source_table)

Profiling

In [0]:
# Print the DataFrame's schema (column names and data types).
df.printSchema()

In [0]:
# Findings from the summary statistics:
# - passenger count and trip distance are 0 in some rows
# - charge and fare columns contain negative amounts
# - outlier rows exist
summary_df = df.summary()
display(summary_df)

Null Column Check

In [0]:
# Count NULLs for every column in one aggregation pass instead of running a
# separate filter/count Spark job per column.
# The iteration variable is deliberately NOT called `col`: a top-level
# `for col in ...` loop would rebind the `col` function imported from
# pyspark.sql.functions and break every later cell that calls col(...).
null_counts_row = df.select(
    [
        # count() ignores NULLs, and when() without otherwise() yields NULL for
        # non-matching rows, so this counts only the rows where the column is NULL.
        F.count(F.when(df[column_name].isNull(), 1)).alias(column_name)
        for column_name in df.columns
    ]
).first()

# Mapping of column name -> number of NULL values, in df.columns order.
null_dict = null_counts_row.asDict()

# One row per column, highest null count first.
null_df = (
    pd.DataFrame.from_dict(data=null_dict, orient='index', columns=['null_count'])
    .sort_values(by='null_count', ascending=False)
)

In [0]:
# Findings from the per-column null counts:
# - ehail_fee is completely null and will be dropped.
# - store_and_fwd_flag, RatecodeID, passenger_count, congestion_surcharge and
#   payment_type share the same number of nulls, so the gaps could be related.
# - cbd_congestion_fee has a high number of nulls because the fee only started
#   being collected later on.
print(null_df)

Null Column Investigation

In [0]:
%sql
-- Rows missing any of the columns that share the same null count, broken down
-- by vendor, trip type, year and month.
-- Finding: no clear association.
SELECT
  VendorID,
  trip_type,
  year,
  month,
  count(*)
FROM nyc_taxi.bronze.green_taxi_trips
WHERE RatecodeID IS NULL
   OR store_and_fwd_flag IS NULL
   OR passenger_count IS NULL
   OR congestion_surcharge IS NULL
   OR payment_type IS NULL
GROUP BY VendorID, trip_type, year, month
ORDER BY vendorId, year, month, count(*) DESC

Duplicate Record Check

In [0]:
%sql
-- Duplicate check that ignores the columns listed in EXCEPT.
-- Finding: no duplicates detected.
WITH raw_cte AS (
  SELECT * EXCEPT (run_id,_source_file,_ingest_ts,month,year,_rescued_data)
  FROM nyc_taxi.bronze.green_taxi_trips
),
-- SELECT DISTINCT treats NULLs as equal and keeps rows that contain NULLs.
-- count(DISTINCT c1, c2, ...) skips every row in which any listed column is
-- NULL, so it undercounts whenever a column contains NULLs.
distinct_cte AS (
  SELECT DISTINCT *
  FROM raw_cte
),
row_stats AS (
  SELECT
    count(*) AS total_rows,
    -- Second distinct count over the JSON-serialised row, kept as a cross-check.
    count(DISTINCT to_json(struct(*))) AS distinct_rows_json
  FROM raw_cte
),
distinct_stats AS (
  SELECT count(*) AS distinct_rows_allcols
  FROM distinct_cte
)
-- Both inputs are single-row aggregates, so the cross join returns one row.
SELECT
  row_stats.total_rows,
  distinct_stats.distinct_rows_allcols,
  row_stats.distinct_rows_json
FROM row_stats
CROSS JOIN distinct_stats;


Pickup/dropoff date check

In [0]:
# Flag rows where the drop-off time is earlier than the pickup time.
# Columns are referenced through df[...] instead of col(...), so this cell still
# works if the name `col` has been rebound elsewhere in the notebook (for
# example, used as a loop variable).
date_check_df = df.filter(df['lpep_dropoff_datetime'] < df['lpep_pickup_datetime'])
display(date_check_df)


In [0]:
# df.summary() was sufficient for spotting outliers and other columns that do not meet the business rules