[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Praanshu101/Data_Validation/blob/main/Assignment2.ipynb)

### Dataset Preparation

In [1]:
# One-time download of the 202412 Citi Bike trip-data archive.
# Kept commented out; uncomment to fetch the zip.
#!wget https://s3.amazonaws.com/tripdata/202412-citibike-tripdata.zip

In [2]:
# One-time extraction of the downloaded archive into /content.
# Kept commented out; uncomment to unpack the zip.
# NOTE(review): /content is presumably the Colab working directory — confirm when running elsewhere.
#import zipfile
#with zipfile.ZipFile('/content/202412-citibike-tripdata.zip', 'r') as zip_ref:
#    zip_ref.extractall('/content')

In [3]:
import pandas as pd
import pandera as pa
import json

import os
import zipfile
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings raised by pandas while
# reading the CSV files (e.g. DtypeWarning) — consider narrowing it.
warnings.filterwarnings('ignore')

In [4]:
# Load every trip CSV from the extracted 202412-citibike-tripdata folder
# into a single dataframe.

path = '202412-citibike-tripdata'

# os.listdir returns entries in arbitrary order and also lists non-CSV entries
# (hidden files, checkpoint folders, ...). Keep only CSV files and sort them so
# the concatenated row order is reproducible across runs and machines.
files = sorted(f for f in os.listdir(path) if f.lower().endswith('.csv'))
if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in '{path}'")

# Station IDs are identifiers, not numbers (the schema further down treats them
# as strings). Reading them as str stops pandas from inferring floats, which
# would yield mixed float/str values and drop trailing zeros (5820.10 -> 5820.1).
station_id_dtypes = {'start_station_id': str, 'end_station_id': str}

# Read each file and stack the parts into one dataframe with a fresh 0..n-1 index.
df = pd.concat(
    [pd.read_csv(os.path.join(path, f), dtype=station_id_dtypes) for f in files],
    ignore_index=True,
)

# Display the first 5 rows of the dataframe
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,B44E5B10AEE58AD0,classic_bike,2024-12-14 10:58:18.153,2024-12-14 11:11:11.308,Frederick Douglass Blvd & W 145 St,7954.12,E 138 St & 5 Ave,7809.13,40.823061,-73.941928,40.81449,-73.936153,member
1,BC252DC6A6011556,electric_bike,2024-12-12 14:46:12.473,2024-12-12 16:45:37.777,Madison Ave & E 99 St,7443.01,,,40.789485,-73.952429,40.78,-73.96,member
2,6FBE55EF6FE8736D,electric_bike,2024-12-11 07:55:18.770,2024-12-11 08:02:23.460,Columbia St & Kane St,4422.05,,,40.687632,-74.001626,40.69,-74.0,member
3,908890DE7FDCF9FE,electric_bike,2024-12-09 22:51:11.668,2024-12-09 22:57:43.495,E 13 St & 2 Ave,5820.08,E 10 St & 2 Ave,5746.02,40.731539,-73.985302,40.729708,-73.986598,member
4,D5D366379A4DC0A8,classic_bike,2024-12-10 18:48:40.063,2024-12-10 19:10:32.264,11 Ave & W 41 St,6726.01,E 25 St & 1 Ave,6004.07,40.760301,-73.998842,40.738177,-73.977387,member


#### Validation of key fields

Validating the following fields (using pandera):
- ride_id (string)
- rideable_type (categorical)
- started_at (string)
- ended_at (string)
- start_station_name (string)
- start_station_id (string)
- end_station_name (string)
- end_station_id (string)
- start_lat, start_lng (float)
- end_lat, end_lng (float)
- member_casual (categorical)

In [5]:
# Inspect the dtype pandas assigned to each column.
# This cell only displays the dtypes; nothing is asserted here.

df.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [6]:
# Declare the expected dtype of every column, then validate the dataframe against it.

# (column name, expected pandera dtype) pairs, listed in dataframe column order.
expected_dtypes = [
    ("ride_id", pa.String),
    ("rideable_type", pa.String),
    ("started_at", pa.String),
    ("ended_at", pa.String),
    ("start_station_name", pa.String),
    ("start_station_id", pa.String),
    ("end_station_name", pa.String),
    ("end_station_id", pa.String),
    ("start_lat", pa.Float),
    ("start_lng", pa.Float),
    ("end_lat", pa.Float),
    ("end_lng", pa.Float),
    ("member_casual", pa.String),
]

schema = pa.DataFrameSchema(
    {name: pa.Column(dtype) for name, dtype in expected_dtypes}
)

# Validate lazily and, if SchemaErrors is raised, print its message as indented JSON.
try:
    schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as err:
    print(json.dumps(err.message, indent=2))

{
  "SCHEMA": {
    "SERIES_CONTAINS_NULLS": [
      {
        "schema": null,
        "column": "start_station_name",
        "check": "not_nullable",
        "error": "non-nullable series 'start_station_name' contains null values:7183      NaN8010      NaN9244      NaN10186     NaN10791     NaN         ... 995036    NaN999616    NaN999641    NaN999668    NaN999688    NaNName: start_station_name, Length: 625, dtype: object"
      },
      {
        "schema": null,
        "column": "start_station_id",
        "check": "not_nullable",
        "error": "non-nullable series 'start_station_id' contains null values:7183      NaN8010      NaN9244      NaN10186     NaN10791     NaN         ... 995036    NaN999616    NaN999641    NaN999668    NaN999688    NaNName: start_station_id, Length: 625, dtype: object"
      },
      {
        "schema": null,
        "column": "end_station_name",
        "check": "not_nullable",
        "error": "non-nullable series 'end_station_name' contains null val

In [7]:
# Analyzing each feature and writing appropriate checks

# Rideable type



Question 2

In [8]:
# Start from a clean slate: delete any existing ./gx directory.
# NOTE(review): presumably this is the project folder written by the file-mode
# Data Context created further down — confirm.
%rm -rf gx
# Install the dependencies into the kernel's environment (-q keeps pip quiet).
# NOTE(review): versions are unpinned and pandas was already imported earlier in
# the notebook; consider moving installs to the top cell and pinning versions.
%pip install pandas
%pip install great_expectations -q

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [9]:
import great_expectations as gx
from great_expectations.data_context.types.base import DataContextConfig
from great_expectations.checkpoint import CheckpointResult

import pandas as pd

In [10]:
# Step 1: obtain a Great Expectations Data Context in file mode.
context = gx.get_context(mode="file")

# Show which concrete context class was returned.
print(context.__class__.__name__)

FileDataContext


In [14]:
# Step 2: register a pandas Data Source and a dataframe Data Asset, then pull
# the notebook's dataframe through them as a single batch.
data_source_name = "my_data_source"
data_asset_name = "my_dataframe_data_asset"

data_source = context.data_sources.add_pandas(name=data_source_name)

# A dataframe Data Asset is used to group Validation Results;
# think of it like a table in a database.
data_asset = data_source.add_dataframe_asset(name=data_asset_name)

# One batch definition covering the whole dataframe; the dataframe itself is
# handed over through the batch parameters when the batch is requested.
batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch_parameters = dict(dataframe=df)
batch = batch_definition.get_batch(batch_parameters=batch_parameters)

# Preview the first three rows of the batch.
print(batch.head(3))

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 573.31it/s] 

            ride_id  rideable_type               started_at  \
0  B44E5B10AEE58AD0   classic_bike  2024-12-14 10:58:18.153   
1  BC252DC6A6011556  electric_bike  2024-12-12 14:46:12.473   
2  6FBE55EF6FE8736D  electric_bike  2024-12-11 07:55:18.770   

                  ended_at                  start_station_name  \
0  2024-12-14 11:11:11.308  Frederick Douglass Blvd & W 145 St   
1  2024-12-12 16:45:37.777               Madison Ave & E 99 St   
2  2024-12-11 08:02:23.460               Columbia St & Kane St   

  start_station_id  end_station_name end_station_id  start_lat  start_lng  \
0          7954.12  E 138 St & 5 Ave        7809.13  40.823061 -73.941928   
1          7443.01               NaN            NaN  40.789485 -73.952429   
2          4422.05               NaN            NaN  40.687632 -74.001626   

    end_lat    end_lng member_casual  
0  40.81449 -73.936153        member  
1  40.78000 -73.960000        member  
2  40.69000 -74.000000        member  





In [None]:
# Build one type expectation per column, using the dtype name pandas currently
# reports for that column as the expected type.
expectations = [
    gx.expectations.ExpectColumnValuesToBeOfType(column=col_name, type_=col_dtype.name)
    for col_name, col_dtype in df.dtypes.items()
]

# Validate the batch against each expectation, keeping the results in column order.
validation_results = [batch.validate(check) for check in expectations]



Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 902.58it/s] 
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 971.80it/s] 
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 904.72it/s] 
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 979.52it/s] 
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 940.85it/s] 
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 863.56it/s] 
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 994.85it/s] 
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 893.74it/s] 
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 912.40it/s] 
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 905.70it/s] 
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 894.31it/s] 
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 848.02it/s] 
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 898.52it/s] 

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_be_of_type",
    "kwargs": {
      "batch_id": "my_data_source-my_dataframe_data_asset",
      "column": "ride_id",
      "type_": "object"
    },
    "meta": {}
  },
  "result": {
    "observed_value": "object_"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}
{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_be_of_type",
    "kwargs": {
      "batch_id": "my_data_source-my_dataframe_data_asset",
      "column": "rideable_type",
      "type_": "object"
    },
    "meta": {}
  },
  "result": {
    "observed_value": "object_"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}
{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_be_of_type",
    "kwargs": {
     




In [21]:
# Write out each stored validation result, one after another, in list order.
for outcome in validation_results:
    print(outcome)

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_be_of_type",
    "kwargs": {
      "batch_id": "my_data_source-my_dataframe_data_asset",
      "column": "ride_id",
      "type_": "object"
    },
    "meta": {}
  },
  "result": {
    "observed_value": "object_"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}
{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_be_of_type",
    "kwargs": {
      "batch_id": "my_data_source-my_dataframe_data_asset",
      "column": "rideable_type",
      "type_": "object"
    },
    "meta": {}
  },
  "result": {
    "observed_value": "object_"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}
{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_be_of_type",
    "kwargs": {
     