# Notebook 02: Glue & Athena Data Catalog


In [1]:
import os
import sys
import time

import boto3
import pandas as pd
from botocore.exceptions import ClientError

In [2]:
# Make the project root (one level above the working directory) importable.
# The path is resolved to an absolute one so the entry stays valid if the
# working directory changes later, and it is only added once so re-running
# this cell does not accumulate duplicate sys.path entries.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

# Load project configuration
from config.config import (
    BUCKET_NAME,
    AWS_REGION,
    S3_PREFIX,
    RAW_DATA_FILENAME,
    GLUE_DATABASE_NAME,
    GLUE_CRAWLER_NAME,
    get_s3_uri,
)

# Echo the settings this notebook relies on
print(f"Bucket: {BUCKET_NAME}")
print(f"Region: {AWS_REGION}")
print(f"Glue Database: {GLUE_DATABASE_NAME}")
print(f"Glue Crawler: {GLUE_CRAWLER_NAME}")

Bucket: nfci-forecasting-306617143793
Region: us-east-1
Glue Database: nfci_database
Glue Crawler: nfci-raw-data-crawler


### Create Glue and Athena AWS clients

In [3]:
# One boto3 client per AWS service, both bound to the configured region
glue_client, athena_client = (
    boto3.client(service, region_name=AWS_REGION)
    for service in ("glue", "athena")
)

print("Glue client created", "Athena client created", sep="\n")

Glue client created
Athena client created


### Create Glue database

In [4]:
# Create the Glue database named in the project config.
# Safe to re-run: an already-existing database is reported, not treated as a failure.
try:
    glue_client.create_database(
        DatabaseInput={
            "Name": GLUE_DATABASE_NAME,
            "Description": "Database for NFCI Forecasting Project",
        }
    )
    print(f"Database '{GLUE_DATABASE_NAME}' created successfully")
except ClientError as e:
    if e.response["Error"]["Code"] == "AlreadyExistsException":
        print(f"Database '{GLUE_DATABASE_NAME}' already exists")
    else:
        # Any other error means the database may not exist; report it and
        # re-raise so later cells do not run against a missing database.
        print(f"Error creating database: {e}")
        raise

Database 'nfci_database' created successfully


### Create Glue Crawler
- Create the Glue IAM role and policy
- Create the crawler for the raw-data S3 path

In [5]:
import json

# Name of the IAM role that the Glue crawler will use
GLUE_ROLE_NAME = "nfci-glue-crawler-role"

# IAM client used to create the role and attach its policies
iam_client = boto3.client("iam")

In [6]:
def create_glue_role(role_name: str, bucket_name: str) -> str:
    """
    Create (or reuse) an IAM role for the Glue Crawler and grant its permissions.

    The role trusts the Glue service, gets the AWS managed
    ``AWSGlueServiceRole`` policy attached, and gets an inline policy allowing
    ``s3:GetObject``, ``s3:PutObject`` and ``s3:ListBucket`` on ``bucket_name``.
    If the role already exists it is reused and the policies are applied again.

    Parameters:
        role_name: Name of the IAM role to create or reuse
        bucket_name: S3 bucket the role is granted access to

    Returns:
        The role ARN.

    Raises:
        ClientError: If the role cannot be created or looked up, or if one of
            the policies cannot be attached. A role without its permissions
            is not reported as a success.
    """

    # Trust policy - allows Glue service to assume this role
    trust_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Principal": {
                    "Service": "glue.amazonaws.com"
                },
                "Action": "sts:AssumeRole"
            }
        ]
    }

    # S3 policy - allows access to our specific bucket
    s3_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Action": [
                    "s3:GetObject",
                    "s3:PutObject",
                    "s3:ListBucket"
                ],
                "Resource": [
                    f"arn:aws:s3:::{bucket_name}",
                    f"arn:aws:s3:::{bucket_name}/*"
                ]
            }
        ]
    }

    # Step 1: Create the role, or look it up if it already exists
    try:
        response = iam_client.create_role(
            RoleName=role_name,
            AssumeRolePolicyDocument=json.dumps(trust_policy),
            Description="IAM role for Glue Crawler - NFCI project"
        )
        role_arn = response["Role"]["Arn"]
        print(f"✓ Created role: {role_name}")

    except ClientError as e:
        if e.response["Error"]["Code"] == "EntityAlreadyExists":
            # Role exists, get its ARN
            response = iam_client.get_role(RoleName=role_name)
            role_arn = response["Role"]["Arn"]
            print(f"✓ Role already exists: {role_name}")
        else:
            # Bare raise keeps the original traceback intact
            raise

    # Step 2: Attach the AWS managed Glue service policy
    try:
        iam_client.attach_role_policy(
            RoleName=role_name,
            PolicyArn="arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
        )
        print("Attached AWSGlueServiceRole policy")
    except ClientError as e:
        # An "already attached"-style error is still tolerated silently;
        # anything else leaves the role without Glue permissions, so report
        # it and propagate instead of returning a role that cannot be used.
        if "already" not in str(e).lower():
            print(f"  Note: {e}")
            raise

    # Step 3: Create and attach custom S3 policy for our bucket
    s3_policy_name = f"{role_name}-s3-policy"

    try:
        iam_client.put_role_policy(
            RoleName=role_name,
            PolicyName=s3_policy_name,
            PolicyDocument=json.dumps(s3_policy)
        )
        print(f"Attached S3 access policy for bucket: {bucket_name}")
    except ClientError as e:
        # Without this policy the role has no access to the bucket;
        # report the error and propagate it.
        print(f"  Note: {e}")
        raise

    return role_arn

In [7]:
# Provision (or look up) the crawler's IAM role and keep its ARN for the crawler setup
GLUE_ROLE_ARN = create_glue_role(role_name=GLUE_ROLE_NAME, bucket_name=BUCKET_NAME)
print(f"\nGlue Role ARN: {GLUE_ROLE_ARN}")

✓ Created role: nfci-glue-crawler-role
Attached AWSGlueServiceRole policy
Attached S3 access policy for bucket: nfci-forecasting-306617143793

Glue Role ARN: arn:aws:iam::306617143793:role/nfci-glue-crawler-role


In [8]:
def create_glue_crawler(
    crawler_name: str,
    role_arn: str,
    database_name: str,
    s3_target_path: str,
    table_prefix: str = ""
) -> bool:
    """
    Register a Glue Crawler that catalogs the data under an S3 path.

    Parameters:
        crawler_name: Name for the crawler
        role_arn: ARN of the IAM role passed to the crawler
        database_name: Glue database that receives the discovered tables
        s3_target_path: S3 path to crawl (e.g., s3://bucket/prefix/)
        table_prefix: Optional prefix for table names

    Returns:
        True if the crawler was created or already exists; False if creation
        failed with any other ClientError (the error is printed).
    """
    # Single S3 target, nothing excluded
    s3_targets = [{"Path": s3_target_path, "Exclusions": []}]

    # Update existing tables on schema change; only log deleted objects
    # instead of removing them from the catalog
    schema_change_policy = {
        "UpdateBehavior": "UPDATE_IN_DATABASE",
        "DeleteBehavior": "LOG",
    }

    # Re-crawl all data on every run
    recrawl_policy = {"RecrawlBehavior": "CRAWL_EVERYTHING"}

    crawler_request = dict(
        Name=crawler_name,
        Role=role_arn,
        DatabaseName=database_name,
        Description=f"Crawler for {s3_target_path}",
        Targets={"S3Targets": s3_targets},
        TablePrefix=table_prefix,
        SchemaChangePolicy=schema_change_policy,
        RecrawlPolicy=recrawl_policy,
        Configuration='{"Version":1.0,"CrawlerOutput":{"Partitions":{"AddOrUpdateBehavior":"InheritFromTable"}}}',
    )

    try:
        glue_client.create_crawler(**crawler_request)
    except ClientError as err:
        error_code = err.response["Error"]["Code"]
        if error_code != "AlreadyExistsException":
            print(f"✗ Error creating crawler: {err}")
            return False
        # An existing crawler counts as success
        print(f"Crawler '{crawler_name}' already exists")
        return True

    print(f"✓ Crawler '{crawler_name}' created successfully")
    return True

In [9]:
# S3 location of the raw data the crawler should catalog
raw_data_s3_path = f"s3://{BUCKET_NAME}/{S3_PREFIX['raw']}/"
print(f"Crawler will scan: {raw_data_s3_path}")

# Register the crawler; table_prefix keeps its default of "" (no prefix)
create_glue_crawler(
    GLUE_CRAWLER_NAME,
    GLUE_ROLE_ARN,
    GLUE_DATABASE_NAME,
    raw_data_s3_path,
)

Crawler will scan: s3://nfci-forecasting-306617143793/data/raw/
✓ Crawler 'nfci-raw-data-crawler' created successfully


True

#### Run crawler