Step-by-Step Explanation
    
    # Part A: Working with CSV Files
    CSVs are everywhere in data pipelines. Key challenges:

    Large files (GBs) that don't fit in memory
    Messy data (wrong delimiters, encoding issues, headers)
    Type inference (dates as strings, numbers as text)

    # Part B: Fetching Data from APIs

    APIs typically return JSON. You need to:

    Make HTTP requests
    Parse nested JSON structures
    Convert to DataFrame for manipulation

In [None]:
import pandas as pd
from pathlib import Path

In [None]:

# Directory holding the practice CSV files, relative to the notebook's working directory.
path = Path("..", "..", "csv")

In [None]:
# All read_csv options combined in a single call.

# ENGINEERING APPROACH: spell out the parsing parameters for reliability.
read_options = {
    "sep": ",",                       # explicit delimiter
    "encoding": "utf-8",              # handle special characters
    "parse_dates": ["signup_date"],   # ask pandas to convert the date column
    "dtype": {"user_id": str},        # force specific types (IDs stay text)
    "na_values": ["NULL", "N/A"],     # custom null indicators
}
df = pd.read_csv(path / "user_data.csv", **read_options)

print(df.head(3))
print(f"\nShape: {df.shape}")  # (rows, columns)
total_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Memory usage: {total_mb:.2f} MB")

# Universal parsing pattern (memorize this)

    * Load as string

    * Parse with errors="coerce"

    * Compare parsed vs raw

    Classify:

        * Missing

        * Invalid

        * Valid

    This pattern is type-agnostic.

# working on delimiter

<!-- practise data for delimiter -->

user_id,name,signup_date,age
001,John Doe,2024-01-15,29
002,Jane Smith,2024-02-20,34
003,Alice Brown,2024-03-05,27

user_id|name|signup_date|age
001|John Doe|2024-01-15|29
002|Jane Smith|2024-02-20|34
003|Alice Brown|2024-03-05|27


user_id;name;signup_date;age
001;John Doe;2024-01-15;29
002;Jane Smith;2024-02-20;34
003;Alice Brown;2024-03-05;27

In [None]:
# Read user_data.csv treating "," as the field separator.
user_data_csv = path / "user_data.csv"
df = pd.read_csv(user_data_csv, sep=",")
df

In [None]:
# Read user_data.csv treating "|" as the field separator.
user_data_csv = path / "user_data.csv"
df = pd.read_csv(user_data_csv, sep="|")
df

In [None]:
# Read user_data.csv treating ";" as the field separator.
user_data_csv = path / "user_data.csv"
df = pd.read_csv(user_data_csv, sep=";")
df

# working on encoding

In [None]:
# Read the UTF-8 sample file, decoding it explicitly as UTF-8.
utf8_csv = path / "user_utf_8.csv"
df = pd.read_csv(utf8_csv, encoding="utf-8")
df

In [None]:
# Decode the Latin-1 sample file with the encoding its file name declares.
# NOTE(review): this cell previously passed encoding="utf-8"; on a file that
# really holds Latin-1 bytes that typically raises UnicodeDecodeError at the
# first accented character. Confirm the file's actual encoding matches its name.
df = pd.read_csv(path / "user_latin_1.csv", encoding="latin-1")
df

# working on parse_dates

With parse_dates, missing and invalid dates become indistinguishable (both end up as NaT).

In [None]:
# Here the dates are not converted to datetime; the column comes back as object.
dates_csv = path / "user_dates.csv"
df = pd.read_csv(dates_csv, parse_dates=["signup_date"], na_values=["", "N/A"])

df["signup_date"]

In [None]:
# Working on the problem above by converting the dates explicitly.
# Do not combine parse_dates with pd.to_datetime: load the column as raw text,
# then force it to datetime ourselves so the raw and parsed values can be
# compared afterwards.
df = pd.read_csv(
    path / "user_dates.csv",
    dtype={"signup_date": str},  # keep the raw text; empty cells still load as NaN
)


# errors='coerce' turns anything unparseable into NaT instead of raising.
df['updated_date'] = pd.to_datetime(
    df['signup_date'],
    errors='coerce'
)
df

In [None]:
# Boolean mask: True wherever the explicitly parsed date is NaT.
df["updated_date"].isnull()

In [None]:
# Classify every row as missing, invalid, or valid by comparing the raw
# column with its explicitly parsed counterpart.
raw_dates = df["signup_date"]
parsed_dates = df["updated_date"]

is_missing = raw_dates.isna()                         # no raw value at all
is_invalid = raw_dates.notna() & parsed_dates.isna()  # raw value present but failed to parse
is_valid = parsed_dates.notna()                       # parsed successfully


In [None]:
# Rows whose raw signup_date is missing.
df.loc[is_missing]

In [None]:
# Rows whose raw signup_date is present but could not be parsed.
df.loc[is_invalid]

In [None]:
# Rows whose signup_date parsed successfully.
df.loc[is_valid]

# working with specific dtypes

✅ IDs / identifiers

dtype={'user_id': str}


    Also applies to:

        order_id

        customer_id

        account_id

        employee_code

    Any column that:

        Looks numeric

        Must preserve leading zeros

        Is never used in arithmetic

    Why:

        Prevents 001 → 1

        Prevents float coercion

        Prevents silent corruption

In [None]:
# Column types enforced while reading: user_id and zip_code stay text,
# gender and plan_type are loaded as pandas categoricals.
d_dtype = dict(
    user_id=str,
    zip_code=str,
    gender="category",
    plan_type="category",
)

dtypes_csv = path / "user_specific_dtypes.csv"
df = pd.read_csv(dtypes_csv, dtype=d_dtype)
df

Rule of thumb: Only convert columns to numeric if you intend to calculate on them. Otherwise, keep as string.

In [None]:
# Normalize numeric and datetime columns.
# Keep a copy of the raw signup_date values first: once the column is
# overwritten with the coerced result, missing and unparseable values are both
# NaT and can no longer be told apart.
# NOTE: re-running this cell without reloading df captures the already-coerced
# column as "raw", so reload df before re-running.
raw_signup_date = df['signup_date'].copy()
df['signup_date'] = pd.to_datetime(raw_signup_date, errors='coerce')
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['salary'] = pd.to_numeric(df['salary'], errors='coerce')

# Separate missing vs invalid dates
missing_dates = raw_signup_date.isna()                              # nothing in the raw data
invalid_dates = raw_signup_date.notna() & df['signup_date'].isna()  # raw value present but unparseable

print(df)


# working on na_values

na_values=['NULL', 'N/A']
What it does

When pandas reads a CSV, it normally treats some things as NaN automatically, e.g.:

""  → NaN
"NaN" → NaN


na_values extends this list with custom strings that should also be considered missing.

In your example:

na_values=['NULL', 'N/A']

Key points to remember

Always include all variants of missing data in production pipelines.

Common: "", "N/A", "NULL", "None", "na", "NA", "–"

This is applied before parsing, so it affects:

parse_dates

pd.to_numeric

Missing ≠ invalid

NaN / NaT → missing

Garbage / wrong format → invalid (needs explicit detection)

In [None]:
import pandas as pd
from io import StringIO

# Small in-memory CSV mixing missing values ("", "N/A", " ", "not_available")
# with invalid ones ("2024-02-30", "2024-13-01", "garbage").
data = """
user_id,signup_date,age,salary
001,2024-01-15,29,50000
002,,34,62000
003,2024-02-30,27,45000
004,N/A, ,70000
005,2024-13-01,31,not_available
006,garbage,28,58000
"""

csv = StringIO(data)
df = pd.read_csv(
    csv,
    dtype={'user_id': str},  # keep identifiers as text so 001 does not become 1
    na_values=['NULL', 'N/A', ' ', 'not_available']  # extra strings to treat as missing
)

print(df)


# LOADING LARGE CHUNK OF DATA

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Constants
num_rows = 50000
cities = ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix", "Mumbai", "Delhi", "London"]
statuses = ["active", "inactive"]

# Helper to generate random date
def random_date(start, end):
    """Return `start` plus a random whole number of days, no later than `end`."""
    delta = end - start
    random_days = random.randint(0, delta.days)
    return start + timedelta(days=random_days)

# Generate data
# signup_date is generated as ISO text (YYYY-MM-DD) rather than datetime objects:
# the invalid value injected below is not a real date, and writing an
# unparseable string into a datetime64 column forces a dtype upcast.
data = {
    "user_id": [f"U{str(i+1).zfill(5)}" for i in range(num_rows)],
    "signup_date": [
        random_date(datetime(2024, 1, 1), datetime(2024, 12, 31)).strftime("%Y-%m-%d")
        for _ in range(num_rows)
    ],
    "status": [random.choice(statuses) for _ in range(num_rows)],
    "age": [random.randint(18, 65) if random.random() > 0.05 else "" for _ in range(num_rows)], # 5% missing
    "city": [random.choice(cities) for _ in range(num_rows)],
    "salary": [random.randint(20000, 120000) if random.random() > 0.1 else "N/A" for _ in range(num_rows)] # 10% missing
}

df = pd.DataFrame(data)

# Introduce some invalid dates (one every 10,000 rows)
for i in range(0, num_rows, 10000):
    df.loc[i, "signup_date"] = "2024-02-30"  # invalid date

# Save to CSV
df.to_csv(path / "large_file.csv", index=False)
print("Large CSV generated: large_file.csv")


In [None]:
# Load the whole generated file into memory in one go.
large_csv = path / "large_file.csv"
data = pd.read_csv(large_csv)

In [None]:
# For files that don't fit in memory: read in fixed-size chunks and keep only
# the rows needed from each one.
chunk_size = 10000
chunks = []

# Build the file path with the Path "/" operator. Formatting the Path into an
# f-string drops the separator ("../../csv" + "large_file.csv").
for chunk in pd.read_csv(path / "large_file.csv", chunksize=chunk_size):
    # Process each chunk (filter, transform)
    processed = chunk[chunk['status'] == 'active']
    chunks.append(processed)

# Combine all chunks
df_final = pd.concat(chunks, ignore_index=True)
print(f"Total rows after filtering: {len(df_final)}")