# Part 3: Data Analytics

In [None]:
import os, json
import pandas as pd
import fsspec

# S3 locations read by this notebook.
bucket = "rearc-quest-tim"
csv_key = "pr.data.0.Current"
json_key = "datausa/2025-06-30.json"

# Credentials come from the process environment and are never written back:
# assigning "" to AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY would wipe out
# real credentials exported before the notebook was started.
_aws_key = os.getenv("AWS_ACCESS_KEY_ID")
_aws_secret = os.getenv("AWS_SECRET_ACCESS_KEY")

# Options handed to pandas (`storage_options=`) and fsspec (`**storage`).
# Explicit key/secret are supplied only when both are non-empty; otherwise the
# dict is left empty so the S3 backend resolves credentials by itself instead
# of being given blank ones.
storage = (
    {"key": _aws_key, "secret": _aws_secret}
    if _aws_key and _aws_secret
    else {}
)

In [None]:
# Read the tab-separated series file from S3 and tidy it:
#   - strip surrounding whitespace from the header names and from series_id,
#   - keep only rows whose period starts with "Q",
#   - cast year to int and value to float.
bls = (
    pd.read_csv(
        f"s3://{bucket}/{csv_key}",
        sep="\t",
        storage_options=storage,
    )
    .rename(columns=str.strip)
    .assign(series_id=lambda frame: frame["series_id"].str.strip())
    .loc[lambda frame: frame["period"].str.startswith("Q")]
    .astype({"year": int, "value": float})
)

# Download the JSON document from S3 and parse it into Python objects.
with fsspec.open(f"s3://{bucket}/{json_key}", **storage) as f:
    data = json.loads(f.read())

# Flatten the records under "data", keep the Year/Population columns,
# lower-case their names and make year an integer.
pop = pd.json_normalize(data["data"]).loc[:, ["Year", "Population"]]
pop = pop.rename(columns={"Year": "year", "Population": "population"})
pop = pop.astype({"year": int})

In [None]:
# Rows whose year lies in 2013..2018, both ends included.
mask = (pop["year"] >= 2013) & (pop["year"] <= 2018)

# Mean and (pandas default, sample) standard deviation of the selected
# population values, computed in one aggregation.
mean_p, std_p = pop.loc[mask, "population"].agg(["mean", "std"])

print(f"Mean population 2013-2018: {mean_p:,.0f}")
print(f"Std-dev population 2013-2018: {std_p:,.0f}")

In [None]:
# Debugging aid: uncomment the lines below to display the first rows of the BLS input table without truncation.

# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)
# pd.set_option("display.width", None) 

# bls.head(20)

In [None]:
# For every series_id, pick the year with the largest total of `value`.
# Totals are computed per (series_id, year) and rounded to one decimal; the
# rows are then ordered by series_id with the biggest total first, so the
# first row kept for each series_id is its top year.
best_year = (
    bls.groupby(by=["series_id", "year"], as_index=False)
    .agg(
        year_sum=pd.NamedAgg(
            column="value",
            aggfunc=lambda vals: round(vals.sum(), 1),
        )
    )
    .sort_values(by=["series_id", "year_sum"], ascending=[True, False])
    .drop_duplicates(subset="series_id")
    .reset_index(drop=True)
)

best_year.head()

In [None]:
# Rows for series PRS30006032 in period Q01, restricted to the four columns
# needed for the report.
target = bls.loc[
    bls["series_id"].eq("PRS30006032") & bls["period"].eq("Q01"),
    ["series_id", "year", "period", "value"],
]

# Attach the population for each year to the selected series rows.
# The left join keeps every target row; rows without a population figure are
# then discarded so the column can be stored as int64.
joined = (
    pd.merge(target, pop, how="left", on="year")
    .loc[lambda frame: frame["population"].notna()]
    .rename(columns={"population": "Population"})
    .astype({"Population": "int64"})
    .sort_values(by="year")
    .reset_index(drop=True)
)

joined.head()