# BLS Data Analysis

This notebook contains the analysis of BLS data and population data, including:
1. Population statistics for years 2013-2018
2. Best years analysis for each series_id
3. Combined report for PRS30006032 and Q01

In [None]:
import os
import logging
import tempfile
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, year, when, max, mean, stddev, struct, trim
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
import boto3
from dotenv import load_dotenv

# Configure logging: INFO-level root configuration plus a module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables (python-dotenv) before anything reads os.environ.
load_dotenv()

# Set Java environment.
# NOTE(review): this unconditionally overwrites any existing JAVA_HOME with a
# machine-specific path (looks like a Homebrew install) — confirm this is
# intended when running on other hosts.
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk@17"

# Initialize Spark Session with specific configurations.
# The builder chain is wrapped in parentheses so it can span several lines
# without backslash continuations; the previous single-line form contained
# literal "\n" sequences after each backslash, which is a syntax error.
spark = (
    SparkSession.builder
    .appName("BLS Data Analysis")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    # Point the SQL warehouse at a freshly created temporary directory.
    .config("spark.sql.warehouse.dir", tempfile.mkdtemp())
    .getOrCreate()
)

# Initialize S3 client and read the target bucket name from the environment.
s3_client = boto3.client("s3")
bucket_name = os.getenv("S3_BUCKET_NAME")
if not bucket_name:
    # os.getenv returns None when the variable is missing; surface that here
    # instead of letting a later S3 call fail with a less obvious error.
    logger.warning("S3_BUCKET_NAME is not set; S3 operations will fail")

# Scratch directory for local temporary files.
temp_dir = tempfile.mkdtemp()

# Print Spark version and configuration.
print(f"Spark version: {spark.version}")
# Single quotes inside the replacement field: reusing the enclosing double
# quote in an f-string expression is a SyntaxError before Python 3.12.
# NOTE(review): the label says "Java version" but the value printed is the
# JAVA_HOME path, not a JVM version string.
print(f"Java version: {os.environ.get('JAVA_HOME')}")