In [0]:
import pandas as pd
from pyspark.sql.functions import abs, coalesce, col, count, countDistinct, lit, unix_timestamp, when

Reading the table into a dataframe

In [0]:
# Load the nyc_taxi.bronze.hvfhv_trips table into a Spark DataFrame; the Python
# profiling cells below all read from `df`.
df=spark.read.table('nyc_taxi.bronze.hvfhv_trips')

Profiling

In [0]:
# Print the column names, data types and nullability of the loaded table.
df.printSchema()

In [0]:
# Summary statistics for every column.
# Finding: invalid rows in miles, time, passenger fare and driver_pay.
display(df.summary())

Null Column Check

In [0]:
# Count the NULLs in every column with a single aggregation (one Spark job)
# instead of running a separate filter + count job per column.
#
# The iteration variable is deliberately not called `col`: that name is imported
# from pyspark.sql.functions and is called by later cells, so rebinding it to a
# column-name string here would make those cells fail with
# "'str' object is not callable".
null_count_exprs = [
    # when() without otherwise() yields NULL for non-matching rows, and count()
    # skips NULLs, so this counts exactly the rows where the column is NULL.
    count(when(df[column_name].isNull(), 1)).alias(column_name)
    for column_name in df.columns
]
null_dict = df.agg(*null_count_exprs).first().asDict()

# One row per source column, columns with the most NULLs first.
null_df = pd.DataFrame.from_dict(data=null_dict, orient='index', columns=['null_count'])
null_df = null_df.sort_values(by='null_count', ascending=False)

> A group of columns is always null together

In [0]:
# cbd_congestion_fee has a high number of nulls because it only started being collected later on
# on_scene_datetime and originating_base_num also contain nulls
print(null_df)

Null Column Investigation

In [0]:
%sql
-- Break down the rows that are missing originating_base_num or on_scene_datetime
-- by license number, year and month, largest groups first.
-- Finding: most of these rides are associated with Lyft (HV0005).
SELECT
  hvfhs_license_num,
  year,
  month,
  count(*)
FROM nyc_taxi.bronze.hvfhv_trips
WHERE originating_base_num IS NULL
   OR on_scene_datetime IS NULL
GROUP BY hvfhs_license_num, year, month
ORDER BY count(*) DESC

In [0]:
%sql
-- Row counts per (dispatching_base_num, originating_base_num) pair for HV0005 (Lyft).
-- Finding: most Lyft rides don't have an originating base num.
SELECT
  dispatching_base_num,
  originating_base_num,
  count(*)
FROM nyc_taxi.bronze.hvfhv_trips
WHERE hvfhs_license_num = 'HV0005'
GROUP BY dispatching_base_num, originating_base_num

Duplicate Record Check

In [0]:
%sql
-- Duplicate check: compare the total row count with two distinct-row counts,
-- after dropping run_id, _source_file, _ingest_ts, month, year and _rescued_data
-- (presumably ingestion/partition metadata rather than trip attributes).
-- Finding: no duplicates detected.
-- NOTE(review): COUNT(DISTINCT a, b, ...) in Spark SQL is documented to skip rows
-- where any listed expression is NULL; confirm whether COUNT(DISTINCT *) does the
-- same here before relying on distinct_rows_allcols, since several columns contain NULLs.
WITH trip_cols AS (
  SELECT * EXCEPT (run_id, _source_file, _ingest_ts, month, year, _rescued_data)
  FROM nyc_taxi.bronze.hvfhv_trips
)
SELECT
  COUNT(*)                           AS total_rows,
  COUNT(DISTINCT *)                  AS distinct_rows_allcols,
  COUNT(DISTINCT to_json(struct(*))) AS distinct_rows_json
FROM trip_cols;


Date Checks

In [0]:
# checking for rows where dropoff time < pickup time
date_check_df=df.filter(col('dropoff_datetime') < col('pickup_datetime'))
display(date_check_df)
print(date_check_df.count())

In [0]:
# List and count the rows where on_scene_datetime is more than 60 seconds
# later than pickup_datetime.
seconds_on_scene_after_pickup = (
    unix_timestamp(col('on_scene_datetime')) - unix_timestamp(col('pickup_datetime'))
)
date_check_df = df.where(seconds_on_scene_after_pickup > 60)
display(date_check_df)
print(date_check_df.count())

In [0]:
# df.summary() was sufficient for spotting outliers and other columns that don't meet business rules