In [None]:
import pandas as pd
import numpy as np

In [None]:
# Example: inferred vs. explicit column dtypes when loading a CSV.
# NOTE(review): `dtype` from numpy is not referenced anywhere in this cell, and
# `pandas._typing` is a private pandas module — confirm both imports are wanted.
from numpy import dtype
from pandas._typing import DtypeArg


# Anti-pattern: no dtype is passed, so pandas guesses one per column.
df = pd.read_csv("dates_test.csv")

# Preferred: state the dtype of every column up front.
# 'Int64', 'Float64', 'string' and 'boolean' are pandas' nullable extension
# dtypes, so missing values do not force a fallback to float/object.
dtype_spec:DtypeArg = {
    'user_id': 'Int64',
    'user_name': 'string',
    'age': 'Int64',
    'salary': 'Float64',
    'is_active': 'boolean'
}

# Re-read the same file with the explicit schema; this overwrites `df` from above.
df = pd.read_csv('dates_test.csv',dtype=dtype_spec)

In [None]:
# Example: why letting pandas infer dtypes can silently change the data.
# scores.csv is expected to contain:
#     user_id,score
#     001,95
#     002,88

# 1) No dtype given: pandas infers a dtype for every column.
df_inferred = pd.read_csv('scores.csv')
print(df_inferred.dtypes)
# Expected output:
#   user_id: int64  (leading zeros lost!)
#   score: int64
# 2) Explicit dtypes: user_id is read as text, so "001" stays "001".
df_explicit = pd.read_csv('scores.csv', dtype={'user_id': 'string', 'score': 'int64'})
print(df_explicit['user_id'])
df_explicit
# Expected user_id values:
#   0    001
#   1    002
# i.e. the leading zeros are preserved.


In [None]:

# Pitfall: a boolean list containing None is stored with dtype object.
# object means: "I cannot represent this data efficiently, so I'll store
# generic Python objects."
pd.Series([True, False, None])

# Preferred: request the nullable boolean dtype explicitly, e.g.
# pd.Series([True, False, None], dtype="boolean")


In [None]:
# How a missing value changes the inferred dtype.
# NumPy's bool and int dtypes have no way to store a missing value; only its
# floating-point dtypes can represent NaN. Writing the gap as np.nan instead of
# None therefore does not help for booleans.

# bool + NaN: np.bool_ cannot hold NaN, so the Series falls back to object.
bool_with_nan = pd.Series([True, False, np.nan])
# Printed explicitly: only the LAST expression of a cell is displayed, so a
# bare expression here would be evaluated and silently discarded.
print(bool_with_nan)

# int + None: pandas upcasts to float64 so the gap can be stored as NaN.
# (A float list such as [1.0, 2.0, 0.3, None] needs no upcast at all.)
int_with_none = pd.Series([1, 2, 3, None])
int_with_none



In [None]:
# Working with datetimes.

# "datetime64[ns]" is the NumPy-backed dtype pandas uses to store timestamps.
# Requesting it here parses the date strings; the None entry becomes NaT.
pd.Series(
    ["2024-01-01", "2024-01-02", None],
    dtype="datetime64[ns]"
)

In [None]:
# Text data: compare the explicit "string" dtype with pandas' default inference.
letters = ["a", "b", None]

# With dtype="string" the Series uses the dedicated string dtype.
print(pd.Series(letters, dtype="string"))

print("---------------------------")

# Without a dtype the Series is left to pandas' inference (object in the original notes).
print(pd.Series(letters))

In [None]:
from datetime import datetime

def parse_date(x, formats=("%Y-%m-%d", "%d/%m/%Y")):
    """Parse a date string by trying each ``strptime`` format in order.

    Note: a later cell in this notebook defines another ``parse_date``; once
    that cell has run, the name refers to that definition instead.

    Args:
        x: Value to parse, normally a string such as "2024-01-31" or
            "31/01/2024".
        formats: ``strptime`` patterns to try, first match wins. Defaults to
            ISO (year-month-day) followed by day/month/year.

    Returns:
        A ``datetime`` for the first format that matches, or ``pd.NaT`` when no
        format matches or ``x`` is not a string (e.g. None or NaN).
    """
    for fmt in formats:
        try:
            return datetime.strptime(x, fmt)
        except (ValueError, TypeError):
            # ValueError: text does not match this format.
            # TypeError: x is not a string. Anything else (KeyboardInterrupt,
            # genuine bugs) is deliberately allowed to propagate.
            continue
    return pd.NaT



In [None]:
from dateutil import parser

def parse_date(x):
    """Parse a single value into a datetime using dateutil, day-first.

    Args:
        x: Value to parse. Non-strings are converted with ``str(x)`` first,
            except pandas Timestamps, which are returned unchanged.

    Returns:
        The parsed ``datetime`` (or the original Timestamp), or ``pd.NaT`` when
        dateutil cannot parse the text.

    NOTE(review): with ``dayfirst=True`` dateutil may also read year-first text
    such as "2024-01-05" as 1 May rather than 5 January — confirm against the
    formats actually present in the data.
    """
    if isinstance(x, pd.Timestamp):
        # Already parsed: stringifying and re-parsing with dayfirst=True could
        # reinterpret the ISO text and swap day and month.
        return x
    try:
        return parser.parse(str(x), dayfirst=True)
    except (ValueError, OverflowError):
        # dateutil signals unparseable text with ValueError (ParserError is a
        # subclass) and out-of-range values with OverflowError.
        return pd.NaT


In [None]:
# Load the 'date' column as raw text so that parse_date is the only parser that
# touches it. Passing parse_dates=['date'] would let read_csv convert the column
# first, so parse_date would receive already-decoded values instead of the
# original strings.
df = pd.read_csv("dates_test.csv", dtype={'date': str})
# Vectorised alternative to the row-wise apply below:
# df['date_parsed'] = pd.to_datetime(df['date'], dayfirst=True, errors='coerce')
df['date_parsed'] = df['date'].apply(parse_date)  # type: ignore
df


In [None]:
# A Categorical only keeps values that appear in its declared categories.
pd.Categorical(
    # The input data to encode.
    ["train", "test", "dog"],
    # The allowed values; each input item is matched against this list.
    categories=["train", "test"]
)
# "dog" is not an allowed category, so it is stored as NaN.

In [None]:
# Ordered categorical: ordered=True makes the order of `categories`
# meaningful, i.e. low < medium < high.
pd.Categorical(
    ["low", "medium", "high"],
    categories=["low", "medium", "high"],
    ordered=True
)

In [None]:
# Ordered vs. unordered categories.
# NOTE(review): this cell repeats the ordered example from the previous cell;
# only the ordered=True variant is shown here.
pd.Categorical(
    ["low", "medium", "high"],
    categories=["low", "medium", "high"],
    ordered=True
)


In [None]:
# An ordered categorical dtype ranks its labels: low < medium < high.
level_dtype = pd.CategoricalDtype(
    categories=["low", "medium", "high"],
    ordered=True,
)
s = pd.Series(["low", "high", 'medium'], dtype=level_dtype)

# Rank-aware comparisons like this are useful for filtering, conditions and
# business rules.
below_high = s < "high"
s[below_high]

In [None]:
# Same labels, but unordered: the categories carry no ranking.
unordered_level_dtype = pd.CategoricalDtype(
    categories=["low", "medium", "high"],
    ordered=False,
)
s = pd.Series(["low", "high"], dtype=unordered_level_dtype)

s

In [None]:
# Using a categorical dtype for a quick status check.

# Define allowed statuses
status_dtype = pd.CategoricalDtype(
    categories=["train"],  # allowed values
    ordered=False
)

# Read the CSV with the 'status' column typed as the categorical dtype above.
df = pd.read_csv("./category_demodata_check.csv", dtype={"status": status_dtype})
# Keep only the rows whose status is non-missing after the categorical conversion.
df_filtered = df[df["status"].notna()]
df_filtered
# Note: read_csv does NOT raise for a status outside the declared categories
# (here only "train"); such values are stored as NaN, which is why the notna()
# filter above is what removes them.


In [None]:
# Define allowed categories
status_dtype = pd.CategoricalDtype(
    categories=['active', 'inactive', 'suspended'],
    ordered=False
)

# read_csv does not raise when the CSV contains a status outside the allowed
# categories: such values are silently stored as NaN. A try/except ValueError
# therefore never detects them, so the check is done explicitly on the result.
df = pd.read_csv('users.csv', dtype={'status': status_dtype})

# NaN here means the status was either missing in the file or not one of the
# allowed categories; the two cases cannot be told apart after loading.
unmatched_count = int(df['status'].isna().sum())
if unmatched_count:
    print(f"Invalid category found: {unmatched_count} row(s) with a missing or unlisted status")
df


In [None]:
# Categorical dtype describing the statuses we accept.
allowed_statuses = ['active', 'inactive', 'suspended']
status_dtype = pd.CategoricalDtype(categories=allowed_statuses, ordered=False)

# Load 'status' as plain text first so unexpected values stay visible.
df = pd.read_csv('users.csv', dtype={'status': str})

# Rows whose status is not one of the declared categories.
status_is_allowed = df['status'].isin(status_dtype.categories)
invalid_statuses = df[~status_is_allowed]

if invalid_statuses.empty:
    # Everything is valid: switch the column over to the categorical dtype.
    df['status'] = df['status'].astype(status_dtype)
    print("✓ All statuses valid and converted to categorical")
    print(f"\nMemory usage: {df['status'].memory_usage(deep=True)} bytes")
    print(df.dtypes)
else:
    # Report the offending rows and leave the column unconverted.
    print("Found invalid statuses:")
    print(invalid_statuses[['user_id', 'username', 'status']])
    for option_line in (
        "\nOptions:",
        "1. Fix the CSV and reload",
        "2. Replace invalid values with a default",
        "3. Remove invalid rows",
    ):
        print(option_line)

In [None]:
# Ordered categories, for rankings, ratings and similar scales.
priority_levels = ['low', 'medium', 'high', 'critical']
priority_dtype = pd.CategoricalDtype(categories=priority_levels, ordered=True)

# Load the tasks and cast 'priority' to the ordered dtype in one step.
df = pd.read_csv("./task.csv").astype({'priority': priority_dtype})

# With an ordered dtype, comparisons follow the category order:
# everything above 'medium' is 'high' and 'critical'.
above_medium = df['priority'] > 'medium'
print(df[above_medium])


In [None]:
def _size_mb(frame):
    """Return the deep memory footprint of ``frame`` in MiB (bytes / 1024**2)."""
    return frame.memory_usage(deep=True).sum() / 1024**2


# One million copies of the same label, stored two ways.
repeated_status = ['active'] * 1_000_000
df_object = pd.DataFrame({'status': repeated_status})
df_category = pd.DataFrame({'status': pd.Categorical(repeated_status)})

print(f"Object dtype: {_size_mb(df_object):.2f} MB")
print(f"Category dtype: {_size_mb(df_category):.2f} MB")


In [None]:
# Values outside the declared categories ("dog" here) are encoded as missing.
allowed_splits = ["train", "test"]
s = pd.Categorical(["train", "test", "dog"], categories=allowed_splits)
s

###  SCHEMA VALIDATION

In [None]:
# Enforcing a schema with astype.
# The example builds its own small frame so the cell runs on a fresh kernel
# (Restart & Run All) instead of depending on whichever `df` an earlier cell
# left behind, which would not have these columns.
raw_orders = pd.DataFrame({
    "id": ["1", "2", "3"],
    "price": ["9.99", "20", "5.5"],
    "created_at": ["2024-01-01", "2024-01-02", "2024-01-03"],
})

# astype with a {column: dtype} mapping converts each listed column and
# raises if a value cannot be converted.
df = raw_orders.astype({
    "id": "int64",
    "price": "float64",
    "created_at": "datetime64[ns]"
})
df.dtypes


In [None]:
# Small frame with one missing value per column, to show the inferred dtypes.
columns_with_gaps = {
    "age": [25, None, 30],
    "active": [True, False, None],
    "name": ["Alice", None, "Bob"],
}
df = pd.DataFrame(columns_with_gaps)

print(df.dtypes)

In [None]:
# Let pandas pick nullable (pd.NA-aware) dtypes for each column of the
# existing `df`, then show the result.
# NOTE: relies on `df` having been created by an earlier cell.
df = df.convert_dtypes()
print(df.dtypes)

In [None]:
# Example frame with one column per common kind of data.
hire_dates = pd.to_datetime(['2023-01-15', '2022-06-01', '2024-03-10'])
employee_columns = {
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'salary': [50000.50, 60000.00, 75000.75],
    'is_manager': [True, False, True],
    'hire_date': hire_dates,
    'notes': ['Senior', 'Mid', np.nan],  # last entry is deliberately missing
}
df = pd.DataFrame(employee_columns)

print(df.dtypes)

In [None]:
# Print a per-column summary (dtype and non-null count) of the current `df`.
df.info()

In [None]:
from pandas.api.types import (
    is_integer_dtype,
    is_float_dtype,
    is_numeric_dtype,
    is_string_dtype,
    is_object_dtype,
    is_bool_dtype,
    is_datetime64_any_dtype
)

# Programmatic dtype checks with the pandas.api.types helpers.
# NOTE: the expected results in the trailing comments assume `df` is the
# example frame with 'age', 'name' and 'hire_date' columns from an earlier cell.

# Check a single column
print(is_numeric_dtype(df['age']))          # True
print(is_string_dtype(df['name']))          # True (includes object with strings)
print(is_datetime64_any_dtype(df['hire_date']))  # True

# Select columns by dtype family: 'number' matches every numeric dtype.
numeric_cols = df.select_dtypes(include='number').columns
print("Numeric columns:", numeric_cols)

# Select text-like columns: both the generic object dtype and the string dtype.
string_cols = df.select_dtypes(include=['object', 'string']).columns
print("String/object columns:", string_cols)