[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Praanshu101/Data_Validation/blob/main/Assignment2.ipynb)

### Dataset Preparation

In [1]:
#!wget https://s3.amazonaws.com/tripdata/202412-citibike-tripdata.zip

In [2]:
#import zipfile
#with zipfile.ZipFile('/content/202412-citibike-tripdata.zip', 'r') as zip_ref:
#    zip_ref.extractall('/content')

In [9]:
import pandas as pd
import pandera as pa
import json

import os
import zipfile
import warnings

# Suppress every warning issued through the `warnings` module from here on.
# NOTE(review): this is a blanket filter, so it also hides anything pandas or
# pandera would warn about while loading/validating -- consider narrowing it.
warnings.filterwarnings('ignore')

In [12]:
# Load the csv files from folder 202412-citibike-tripdata

path = '202412-citibike-tripdata'

# os.listdir returns every directory entry in arbitrary order, so keep only
# the CSV files and sort them: this skips stray entries (e.g. .DS_Store,
# .ipynb_checkpoints) that read_csv would choke on, and makes the row order
# of the combined dataframe reproducible across runs and machines.
files = sorted(f for f in os.listdir(path) if f.lower().endswith('.csv'))
if not files:
    raise FileNotFoundError(f"No CSV files found in '{path}'")

# Load the csv files into a single dataframe with a fresh 0..n-1 index
df = pd.concat(
    [pd.read_csv(os.path.join(path, f)) for f in files],
    ignore_index=True,
)

# NOTE(review): the validation output further down reports float values mixed
# into the object-typed station id columns; passing dtype=str for those columns
# to read_csv would presumably avoid that -- TODO confirm whether that is
# wanted here or whether the mismatch is meant to be surfaced by the validation.

# Display the first 5 rows of the dataframe
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,B44E5B10AEE58AD0,classic_bike,2024-12-14 10:58:18.153,2024-12-14 11:11:11.308,Frederick Douglass Blvd & W 145 St,7954.12,E 138 St & 5 Ave,7809.13,40.823061,-73.941928,40.81449,-73.936153,member
1,BC252DC6A6011556,electric_bike,2024-12-12 14:46:12.473,2024-12-12 16:45:37.777,Madison Ave & E 99 St,7443.01,,,40.789485,-73.952429,40.78,-73.96,member
2,6FBE55EF6FE8736D,electric_bike,2024-12-11 07:55:18.770,2024-12-11 08:02:23.460,Columbia St & Kane St,4422.05,,,40.687632,-74.001626,40.69,-74.0,member
3,908890DE7FDCF9FE,electric_bike,2024-12-09 22:51:11.668,2024-12-09 22:57:43.495,E 13 St & 2 Ave,5820.08,E 10 St & 2 Ave,5746.02,40.731539,-73.985302,40.729708,-73.986598,member
4,D5D366379A4DC0A8,classic_bike,2024-12-10 18:48:40.063,2024-12-10 19:10:32.264,11 Ave & W 41 St,6726.01,E 25 St & 1 Ave,6004.07,40.760301,-73.998842,40.738177,-73.977387,member


#### Validation of key fields

Validating the following fields (using pandera):
- ride_id (string)
- rideable_type (categorical)
- started_at (string)
- ended_at (string)
- start_station_name (string)
- start_station_id (string)
- end_station_name (string)
- end_station_id (string)
- start_lat, start_lng (float)
- end_lat, end_lng (float)
- member_casual (categorical)

In [13]:
# Inspect the dtype pandas inferred for each column (this only displays the
# dtypes; nothing is validated in this cell)

df.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [14]:
# Convert data types of columns
#df['started_at'] = pd.to_datetime(df['started_at'])
#df['ended_at'] = pd.to_datetime(df['ended_at'])

# Handling null values:
#if 'start_station_name' in df.columns:
#    df['start_station_name'].fillna('Unknown', inplace=True)

In [17]:
# Check the dataframe against the expected column dtypes with pandera.

# Expected dtype per column, listed in the order the columns are checked.
expected_dtypes = {
    "ride_id": pa.String,
    "rideable_type": pa.Category,
    "started_at": pa.String,
    "ended_at": pa.String,
    "start_station_name": pa.String,
    "start_station_id": pa.String,
    "end_station_name": pa.String,
    "end_station_id": pa.String,
    "start_lat": pa.Float,
    "start_lng": pa.Float,
    "end_lat": pa.Float,
    "end_lng": pa.Float,
    "member_casual": pa.Category,
}

# One pa.Column per entry, all with pandera's default column settings.
schema = pa.DataFrameSchema(
    {column: pa.Column(dtype) for column, dtype in expected_dtypes.items()}
)

try:
    # lazy=True collects every failure instead of stopping at the first one.
    schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as schema_errors:
    # Pretty-print the collected failure report as JSON.
    report = json.dumps(schema_errors.message, indent=2)
    print(report)

{
  "SCHEMA": {
    "WRONG_DATATYPE": [
      {
        "schema": null,
        "column": "rideable_type",
        "check": "dtype('category')",
        "error": "expected series 'rideable_type' to have type category, got object"
      },
      {
        "schema": null,
        "column": "start_station_id",
        "check": "dtype('str')",
        "error": "expected series 'start_station_id' to have type str:failure cases:        index failure_case0      983040      6257.061      983041      6667.042      983042      6847.023      983043      3919.074      983044      3919.07...       ...          ...16949  999995      6248.0816950  999996      5422.0916951  999997      7655.2216952  999998      5578.0216953  999999      8277.03[16954 rows x 2 columns]"
      },
      {
        "schema": null,
        "column": "end_station_id",
        "check": "dtype('str')",
        "error": "expected series 'end_station_id' to have type str:failure cases:         index failure_case0       262144   