In [12]:
import json
import boto3
import pandas as pd
import os

In [13]:
# Notebook-wide configuration.
RAW_BUCKET = "raw-data"
AWS_REGION = "us-east-1"
# S3 endpoint. Defaults to a LocalStack instance on localhost; set the
# AWS_ENDPOINT_URL environment variable to point somewhere else (for example
# when the notebook runs inside a container and LocalStack is on the host).
ENDPOINT_URL = os.environ.get("AWS_ENDPOINT_URL") or "http://localhost:4566"

In [14]:
# S3 client bound to the endpoint and region chosen in the configuration cell.
s3 = boto3.client(
    service_name="s3",
    region_name=AWS_REGION,
    endpoint_url=ENDPOINT_URL,
)

In [15]:
# Bucket names used by the cells below. The raw bucket reuses the RAW_BUCKET
# constant so the name is defined in exactly one place.
raw_data = RAW_BUCKET
processed_bucket = "processed-data"

In [16]:
# Page through the listing: a single list_objects_v2 call returns at most
# 1,000 keys, so a larger bucket would otherwise be silently truncated.
csv_objects = [
    entry
    for page in s3.get_paginator("list_objects_v2").paginate(Bucket=raw_data)
    for entry in page.get("Contents", [])
]
# Every key except the population JSON is treated as a CSV input.
csv_files = [entry["Key"] for entry in csv_objects if entry["Key"] != "population_data.json"]

EndpointConnectionError: Could not connect to the endpoint URL: "http://host.docker.internal:4566/raw-data?list-type=2&encoding-type=url"

In [None]:
# Download every listed object from the raw bucket and parse it as CSV,
# producing one DataFrame per key (in listing order).
csv_dataframes = [
    pd.read_csv(s3.get_object(Bucket=raw_data, Key=csv_key)["Body"])
    for csv_key in csv_files
]

In [None]:
# Stack the per-file frames into one table; fall back to an empty frame
# (with a notice) when nothing was downloaded.
if csv_dataframes:
    csv_df = pd.concat(csv_dataframes, ignore_index=True)
else:
    print("No CSV files found in raw bucket.")
    csv_df = pd.DataFrame()

In [None]:
# Page through the listing: a single list_objects_v2 call returns at most
# 1,000 keys, so a larger bucket would otherwise be silently truncated.
json_objects = [
    entry
    for page in s3.get_paginator("list_objects_v2").paginate(Bucket=processed_bucket)
    for entry in page.get("Contents", [])
]
# Only keys with a .json suffix are loaded from the processed bucket.
json_files = [entry["Key"] for entry in json_objects if entry["Key"].endswith(".json")]

In [None]:
# Download and decode each JSON document from the processed bucket, turning
# every document into its own DataFrame.
# NOTE(review): assumes each document has a shape pd.DataFrame accepts
# directly (e.g. a list of records) - confirm against the producer.
json_dataframes = [
    pd.DataFrame(json.load(s3.get_object(Bucket=processed_bucket, Key=json_key)["Body"]))
    for json_key in json_files
]

In [None]:
# Stack the per-document frames into one table; fall back to an empty frame
# (with a notice) when no JSON document was loaded.
if json_dataframes:
    json_df = pd.concat(json_dataframes, ignore_index=True)
else:
    print("No JSON files found in processed bucket.")
    json_df = pd.DataFrame()

In [None]:
# Mean and standard deviation of the population over the years 2013-2018
# (both bounds inclusive).
if not json_df.empty:
    # Coerce on a copy so json_df itself is left untouched. JSON values may
    # arrive as strings; anything unparsable becomes NaN and is ignored by
    # the range filter and by mean()/std().
    pop_df = json_df.assign(
        year=pd.to_numeric(json_df["year"], errors="coerce"),
        population=pd.to_numeric(json_df["population"], errors="coerce"),
    )
    pop_df = pop_df[pop_df["year"].between(2013, 2018)]
    mean_population = pop_df["population"].mean()
    std_population = pop_df["population"].std()
    print(f"Mean population (2013-2018): {mean_population}")
    print(f"Std deviation (2013-2018): {std_population}")

In [None]:
# For every series_id, find the year whose summed `value` is largest and
# report that year together with the sum.
if not csv_df.empty:
    # Coercion is done in place on csv_df, as before, so cells that run
    # afterwards keep seeing numeric `value` / `year` columns.
    csv_df['value'] = pd.to_numeric(csv_df['value'], errors='coerce')
    csv_df['year'] = pd.to_numeric(csv_df['year'], errors='coerce')

    # One pass: total per (series_id, year). Rows whose year could not be
    # parsed (NaN) are dropped by groupby instead of raising later.
    yearly_totals = csv_df.groupby(['series_id', 'year'], as_index=False)['value'].sum()

    # Row label of the largest yearly total within each series; ties resolve
    # to the earliest year because yearly_totals is sorted by (series_id, year).
    best_rows = yearly_totals.groupby('series_id')['value'].idxmax()

    best_years = (
        yearly_totals.loc[best_rows]
        .rename(columns={'year': 'best_year', 'value': 'sum_value'})
        .reset_index(drop=True)
    )
    print("Best year per series_id with summed value:")
    print(best_years)

In [None]:
# Report for series PRS30006032, period Q01, with the population of the
# matching year attached (left join: rows without a population are kept).
if not csv_df.empty and not json_df.empty:
    series_rows = csv_df[(csv_df['series_id'] == 'PRS30006032') & (csv_df['period'] == 'Q01')]

    # Coerce the join key on both sides here, on local copies, so this cell
    # does not depend on another cell having already converted csv_df['year']
    # and does not fail on mismatched key dtypes (e.g. int vs. str).
    series_rows = series_rows.assign(year=pd.to_numeric(series_rows['year'], errors='coerce'))
    population_by_year = json_df[['year', 'population']].assign(
        year=pd.to_numeric(json_df['year'], errors='coerce')
    )

    combined_df = series_rows.merge(population_by_year, on='year', how='left')
    print("Combined report for PRS30006032, period Q01:")
    print(combined_df)