[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Praanshu101/Data_Validation/blob/main/Assignment2.ipynb)

### Dataset Preparation

In [43]:
#!wget https://s3.amazonaws.com/tripdata/202412-citibike-tripdata.zip

In [44]:
#import zipfile
#with zipfile.ZipFile('/content/202412-citibike-tripdata.zip', 'r') as zip_ref:
#    zip_ref.extractall('/content')

In [53]:
import pandas as pd
import pandera as pa
import json

import os
import zipfile
import warnings

warnings.filterwarnings('ignore')

In [46]:
# Load the csv files from folder 202412-citibike-tripdata

path = '202412-citibike-tripdata'
files = os.listdir(path)

# Load the csv files into a dataframe
df = pd.concat([pd.read_csv(path + '/' + f) for f in files], ignore_index = True)

# Display the first 5 rows of the dataframe
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,B44E5B10AEE58AD0,classic_bike,2024-12-14 10:58:18.153,2024-12-14 11:11:11.308,Frederick Douglass Blvd & W 145 St,7954.12,E 138 St & 5 Ave,7809.13,40.823061,-73.941928,40.81449,-73.936153,member
1,BC252DC6A6011556,electric_bike,2024-12-12 14:46:12.473,2024-12-12 16:45:37.777,Madison Ave & E 99 St,7443.01,,,40.789485,-73.952429,40.78,-73.96,member
2,6FBE55EF6FE8736D,electric_bike,2024-12-11 07:55:18.770,2024-12-11 08:02:23.460,Columbia St & Kane St,4422.05,,,40.687632,-74.001626,40.69,-74.0,member
3,908890DE7FDCF9FE,electric_bike,2024-12-09 22:51:11.668,2024-12-09 22:57:43.495,E 13 St & 2 Ave,5820.08,E 10 St & 2 Ave,5746.02,40.731539,-73.985302,40.729708,-73.986598,member
4,D5D366379A4DC0A8,classic_bike,2024-12-10 18:48:40.063,2024-12-10 19:10:32.264,11 Ave & W 41 St,6726.01,E 25 St & 1 Ave,6004.07,40.760301,-73.998842,40.738177,-73.977387,member


#### Validation of key fields

Validating the following fields (using pandera):
- ride_id (string)
- rideable_type (categorical)
- started_at (string)
- ended_at (string)
- start_station_name (string)
- start_station_id (string)
- end_station_name (string)
- end_station_id (string)
- start_lat, start_lng (float)
- end_lat, end_lng (float)
- member_casual (categorical)

In [47]:
# Validate the datatype of the columns

df.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [48]:
# Validate the schema of the dataframe

schema = pa.DataFrameSchema({
    "ride_id": pa.Column(pa.String),
    "rideable_type": pa.Column(pa.String),
    "started_at": pa.Column(pa.String),
    "ended_at": pa.Column(pa.String),
    "start_station_name": pa.Column(pa.String),
    "start_station_id": pa.Column(pa.String),
    "end_station_name": pa.Column(pa.String),
    "end_station_id": pa.Column(pa.String),
    "start_lat": pa.Column(pa.Float),
    "start_lng": pa.Column(pa.Float),
    "end_lat": pa.Column(pa.Float),
    "end_lng": pa.Column(pa.Float),
    "member_casual": pa.Column(pa.String)
})

try:
    schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as e:
    print(json.dumps(e.message, indent=2))

{
  "SCHEMA": {
    "SERIES_CONTAINS_NULLS": [
      {
        "schema": null,
        "column": "start_station_name",
        "check": "not_nullable",
        "error": "non-nullable series 'start_station_name' contains null values:7183      NaN8010      NaN9244      NaN10186     NaN10791     NaN         ... 995036    NaN999616    NaN999641    NaN999668    NaN999688    NaNName: start_station_name, Length: 625, dtype: object"
      },
      {
        "schema": null,
        "column": "start_station_id",
        "check": "not_nullable",
        "error": "non-nullable series 'start_station_id' contains null values:7183      NaN8010      NaN9244      NaN10186     NaN10791     NaN         ... 995036    NaN999616    NaN999641    NaN999668    NaN999688    NaNName: start_station_id, Length: 625, dtype: object"
      },
      {
        "schema": null,
        "column": "end_station_name",
        "check": "not_nullable",
        "error": "non-nullable series 'end_station_name' contains null val

In [49]:
# Convert the start_station_id and end_station_id to string

df['start_station_id'] = df['start_station_id'].astype(str)
df['end_station_id'] = df['end_station_id'].astype(str)

# Handling null values

df.isnull().sum()

# Drop the rows with null values

df = df.dropna()

# Validate the schema of the dataframe

try:
    schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as e:
    print(json.dumps(e.message, indent=2))
    

In [50]:
# Analyzing each feature and writing appropriate checks

# Rideaable type (only two types of rideable type)
print(df['rideable_type'].value_counts())

# Member_casual (only two types of member_casual)
print(df['member_casual'].value_counts())

# Check if the start_station_id and end_station_id are not same
print(df[df['start_station_id'] == df['end_station_id']].shape[0])

# Check if the start_lat, start_lng and end_lat, end_lng are not same
print(df[(df['start_lat'] == df['end_lat']) & (df['start_lng'] == df['end_lng'])].shape[0])

# Check if the start_time is less than end_time
print(df[df['started_at'] > df['ended_at']].shape[0])

rideable_type
electric_bike    689945
classic_bike     306098
Name: count, dtype: int64
member_casual
member    882038
casual    114005
Name: count, dtype: int64
14470
14565
0


In [51]:
# Analyzing each feature and writing appropriate checks

@pa.check("rideable_type", pa.Check.isin(["docked_bike", "electric_bike"])) # Check if the rideable_type is either docked_bike or electric_bike
@pa.check("member_casual", pa.Check.isin(["member", "casual"])) # Check if the member_casual is either member or casual
@pa.check("start_station_id", "end_station_id", pa.Check.ne) # Check if the start_station_id and end_station_id are not same
@pa.check("start_lat", "start_lng", "end_lat", "end_lng", pa.Check.ne) # Check if the start_lat, start_lng and end_lat, end_lng are not same
@pa.check("started_at", "ended_at", pa.Check.less_than) # Check if the start_time is less than end_time

def validate_schema(df: pd.DataFrame) -> pd.DataFrame:
    return df

# Validate the schema of the dataframe

try:
    schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as e:
    print(json.dumps(e.message, indent=2))


Question 2

In [9]:
%rm -rf gx
%pip install pandas
%pip install great_expectations -q

UsageError: Line magic function `%rm` not found.


In [9]:
import great_expectations as gx
from great_expectations.data_context.types.base import DataContextConfig
from great_expectations.checkpoint import CheckpointResult

import pandas as pd

In [10]:
#1. Create a Data Context
context = gx.get_context(mode="file")

print(type(context).__name__)

FileDataContext


In [11]:
# 2. Set up a Datasource (CSV file example)
data_source_name = "my_data_source"
data_source = context.data_sources.add_pandas(name=data_source_name)


# A dataframe Data Asset is used to group your Validation Results.
# You can think it like a table of a database.
data_asset_name = "my_dataframe_data_asset"
data_asset = data_source.add_dataframe_asset(name=data_asset_name)

batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch_parameters = {"dataframe": df}
batch = batch_definition.get_batch(batch_parameters)
print(batch.head(3))

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 547.34it/s] 

            ride_id  rideable_type               started_at  \
0  B44E5B10AEE58AD0   classic_bike  2024-12-14 10:58:18.153   
1  BC252DC6A6011556  electric_bike  2024-12-12 14:46:12.473   
2  6FBE55EF6FE8736D  electric_bike  2024-12-11 07:55:18.770   

                  ended_at                  start_station_name  \
0  2024-12-14 11:11:11.308  Frederick Douglass Blvd & W 145 St   
1  2024-12-12 16:45:37.777               Madison Ave & E 99 St   
2  2024-12-11 08:02:23.460               Columbia St & Kane St   

  start_station_id  end_station_name end_station_id  start_lat  start_lng  \
0          7954.12  E 138 St & 5 Ave        7809.13  40.823061 -73.941928   
1          7443.01               NaN            NaN  40.789485 -73.952429   
2          4422.05               NaN            NaN  40.687632 -74.001626   

    end_lat    end_lng member_casual  
0  40.81449 -73.936153        member  
1  40.78000 -73.960000        member  
2  40.69000 -74.000000        member  





In [12]:
# Create an expectation suite
suite_name = "datatype_validation_suite"
suite = gx.ExpectationSuite(name=suite_name)

In [13]:
# context.suites.add(suite)


In [14]:
# Add expectations to the suite
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="ride_id", type_="object"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="rideable_type", type_="object"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="started_at", type_="object"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="ended_at", type_="object"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="start_station_name", type_="object"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="start_station_id", type_="object"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="end_station_name", type_="object"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="end_station_id", type_="object"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="start_lat", type_="float64"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="start_lng", type_="float64"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="end_lat", type_="float64"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="end_lng", type_="float64"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="member_casual", type_="object"
    )
)





ExpectColumnValuesToBeOfType(id=None, meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='member_casual', mostly=1, row_condition=None, condition_parser=None, type_='object')

In [15]:
definition_name = "my_validation_definition"
validation_definition = gx.ValidationDefinition(
    data=batch_definition, suite=suite, name=definition_name
)

In [16]:
context.suites.add(suite)

{
  "name": "datatype_validation_suite",
  "id": "34fe9a4c-a7eb-48d0-9f27-11487912e269",
  "expectations": [
    {
      "type": "expect_column_values_to_be_of_type",
      "kwargs": {
        "column": "ride_id",
        "type_": "object"
      },
      "meta": {},
      "id": "16862d53-ae38-4e41-a683-0afe6e9fd37c"
    },
    {
      "type": "expect_column_values_to_be_of_type",
      "kwargs": {
        "column": "rideable_type",
        "type_": "object"
      },
      "meta": {},
      "id": "6d48e413-fa98-40bb-869b-a55f400a336a"
    },
    {
      "type": "expect_column_values_to_be_of_type",
      "kwargs": {
        "column": "started_at",
        "type_": "object"
      },
      "meta": {},
      "id": "85e46255-3502-4c43-99d8-29ce96cc2665"
    },
    {
      "type": "expect_column_values_to_be_of_type",
      "kwargs": {
        "column": "ended_at",
        "type_": "object"
      },
      "meta": {},
      "id": "3f1aab1c-4e7d-4568-8446-1278da05f631"
    },
    {
      "type

In [17]:
validation_definition = context.validation_definitions.add(validation_definition)

In [18]:
validation_definition_name = "my_validation_definition"
validation_definition = context.validation_definitions.get(validation_definition_name)

validation_results = validation_definition.run(batch_parameters=batch_parameters)
print(validation_results)

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 796.34it/s] 

{
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_be_of_type",
        "kwargs": {
          "batch_id": "my_data_source-my_dataframe_data_asset",
          "column": "ride_id",
          "type_": "object"
        },
        "meta": {},
        "id": "16862d53-ae38-4e41-a683-0afe6e9fd37c"
      },
      "result": {
        "observed_value": "object_"
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_be_of_type",
        "kwargs": {
          "batch_id": "my_data_source-my_dataframe_data_asset",
          "column": "rideable_type",
          "type_": "object"
        },
        "meta": {},
        "id": "6d48e413-fa98-40bb-869b-a55f400a336a"
      },
      "result": {
        "obser




In [19]:
from great_expectations.checkpoint import (
    SlackNotificationAction,
    UpdateDataDocsAction,
    EmailAction
)

In [20]:
context = gx.get_context()

In [21]:
validation_definitions = [
    context.validation_definitions.get("my_validation_definition")
]

In [22]:
email_action =[EmailAction(
    name="send_email_on_failure",
    smtp_address='smtp.gmail.com',
    smtp_port=587,
    receiver_emails="23110277@iitgn.ac.in",
    sender_login="23110249@iitgn.ac.in",
    #sender_password="xmfo fxlf slox pumb",
    sender_password="vgsf mbyj ifkd dbzh",
    use_tls=True,
    notify_on="all",
)]

In [23]:
checkpoint_name = "my_checkpoint"
checkpoint = gx.Checkpoint(
    name=checkpoint_name,
    validation_definitions=validation_definitions,
    actions=email_action,
    result_format={"result_format": "COMPLETE"},
)

# Save the Checkpoint to the Data Context
context.checkpoints.add(checkpoint)

# Retrieve the Checkpoint later
checkpoint_name = "my_checkpoint"
checkpoint = context.checkpoints.get(checkpoint_name)

In [25]:
validation_results = checkpoint.run(
    batch_parameters=batch_parameters, expectation_parameters=suite
)

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 590.58it/s] 
Failed to authenticate to the SMTP server at address: smtp.gmail.com
