# Extra Validation 1: Casual Only Validation
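This notebook runs sanity checks on the 'Casual Only' dataset (`cyclistic_casual_only.csv`): it reports column types, missing values per column, the distinct values of `day_of_week` and `month`, the range of `hour`, and the number of unique start stations, so that data-quality problems can be spotted before further analysis.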

In [1]:
import pandas as pd
from pathlib import Path
import sys

# -------------------------------
# 1. PATH SETUP
# -------------------------------
# Expected layout: DIVVY PROJECT/Scripts/Extra Validation/<this notebook>
# The kernel's working directory is taken to be that 'Extra Validation' folder.
SCRIPT_DIR = Path('.').resolve()

# Two levels up from the notebook folder: 'Extra Validation' -> 'Scripts' -> project root.
scripts_dir = SCRIPT_DIR.parent
PROJECT_ROOT = scripts_dir.parent

# Location of the processed casual-rider CSV inside the project's Data folder.
file_path = PROJECT_ROOT.joinpath(
    'Data',
    'Processed Datasets',
    'cyclistic_casual_only.csv',
)

In [2]:
# -------------------------------
# 2. LOAD DATA
# -------------------------------
# Reads the casual-only CSV into `df`, keeping only the columns the
# validation cell inspects. If the file is missing the cell fails loudly so
# that later cells never run against an undefined (or stale) `df`.
print(f"Searching for file at: {file_path}")

if not file_path.exists():
    print("ERROR: File not found!")
    print(f"Current Root: {PROJECT_ROOT}")
    print("Check if your Data folder is in the root and not inside Scripts.")
    # Raising (rather than sys.exit, which is unfriendly inside a kernel)
    # stops this cell and a "Run All" at the point of failure.
    raise FileNotFoundError(f"Casual-only dataset not found: {file_path}")

print("Loading casual-only dataset...")
# usecols restricts parsing to the needed columns to save memory.
df = pd.read_csv(
    file_path,
    usecols=['ride_id', 'start_station_name', 'hour', 'day_of_week', 'month'],
)
print(f"Dataset loaded. Total rows: {len(df):,}\n")

Searching for file at: C:\Users\Siddharth\Desktop\DIVVY PROJECT\Data\Processed Datasets\cyclistic_casual_only.csv
Loading casual-only dataset...
Dataset loaded. Total rows: 2,037,864



In [3]:
# -------------------------------
# 3. VALIDATION LOGIC
# -------------------------------
# Read-only inspection of the loaded frame: schema, null counts, category
# values, hour range, summary statistics and a small sample.
# The guard skips the checks when the dataset file is absent (in which case
# the load cell did not produce `df`).
if file_path.exists():
    print("--- Dataset Info & Missing Values ---")
    # DataFrame.info() prints its own report and returns None, so it is called
    # directly; wrapping it in print() emitted a stray "None" line.
    df.info()
    print("\nMissing values per column:")
    print(df.isna().sum())

    print("\n--- Unique Values & Ranges ---")
    print("Unique values in 'day_of_week':", df['day_of_week'].unique())
    print("Hour range:", df['hour'].min(), "to", df['hour'].max())
    print("Number of unique start stations:", df['start_station_name'].nunique())
    print("Months present in data:", df['month'].unique())

    print("\n--- Quick Statistics ---")
    print(df['hour'].describe())

    print("\n--- Sample Data ---")
    print(df.head())

--- Dataset Info & Missing Values ---
<class 'pandas.DataFrame'>
RangeIndex: 2037864 entries, 0 to 2037863
Data columns (total 5 columns):
 #   Column              Dtype
---  ------              -----
 0   ride_id             str  
 1   start_station_name  str  
 2   hour                int64
 3   day_of_week         str  
 4   month               str  
dtypes: int64(1), str(4)
memory usage: 77.7 MB
None

Missing values per column:
ride_id                    0
start_station_name    438506
hour                       0
day_of_week                0
month                      0
dtype: int64

--- Unique Values & Ranges ---
Unique values in 'day_of_week': <StringArray>
['Thursday', 'Wednesday', 'Tuesday', 'Sunday', 'Saturday', 'Monday', 'Friday']
Length: 7, dtype: str
Hour range: 0 to 23
Number of unique start stations: 1706
Months present in data: <StringArray>
[ 'December',  'November',   'January',  'February',     'March',     'April',
       'May',      'June',      'July',    'August',