In [1]:
import boto3
import pandas as pd
import io
import re
import os
from collections import defaultdict
from tqdm import tqdm

# --- S3 locations -----------------------------------------------------------
# Source objects are read from `source_prefix`; merged outputs are written
# under `target_prefix` in the same bucket.
bucket_name = 'pepper-dataset'
source_prefix = 'county-env-data/county_env_NetCDF4/'
target_prefix = 'Temp/'

s3 = boto3.client('s3')

# Step 1: collect the key of every CSV object below the source prefix.
# list_objects_v2 is paginated, so walk every page the paginator yields.
print("Listing all CSV files in S3...")
paginator = s3.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=source_prefix)
files = []
for page in page_iterator:
    # A page without a 'Contents' entry contributes nothing.
    for content in page.get('Contents', []):
        key = content['Key']
        if key.endswith('.csv'):
            files.append(key)

Listing all CSV files in S3...


In [14]:
# Step 2: Organize files by state and year
# Builds organized[state][year] -> list of S3 keys.
# File names are expected to look like
#   <State>_<County>_<YYYY-MM>_<part>.csv
# e.g. county-env-data/county_env_NetCDF4/Arkansas_Jackson/Arkansas_Jackson_2014-08_part1.csv
organized = defaultdict(lambda: defaultdict(list))

for file_key in files:
    filename = os.path.basename(file_key)  # e.g., Arkansas_Jackson_2014-08_part1.csv
    parts = filename.split('_')

    # Step 3 reads parts[3] as the part label, so all four fields must be present,
    # and the third field must really be a YYYY-MM stamp before its year is used as
    # a grouping key. Anything else is reported instead of being dropped silently
    # or grouped under a bogus "year".
    if len(parts) < 4 or not re.fullmatch(r"\d{4}-\d{2}", parts[2]):
        print(f"[WARN] Skipping unrecognized key: {file_key}, error: unexpected filename layout")
        continue

    state = parts[0]                # Arkansas
    year = parts[2].split('-')[0]   # 2014

    organized[state][year].append(file_key)

In [None]:
# Step 3: Process and upload
# For every (state, year) group built in Step 2:
#   1. download each source CSV and bucket it by (state_county, year-month) and part label,
#   2. join part1/part2 of the same county-month on `datetime`,
#   3. stack all county-months and upload one CSV per state-year under `target_prefix`.
# Failures on individual files / county-months / uploads are printed and skipped so
# that one bad object does not abort the whole run.
for state, year_map in tqdm(organized.items(), desc="Processing states"):
    # `year` is not bound yet when this tqdm(...) call is evaluated, so putting it in
    # `desc=` would show a stale value; the description is refreshed per iteration instead.
    year_bar = tqdm(year_map.items(), desc=f"Processing {state}", leave=False)
    for year, file_list in year_bar:
        year_bar.set_description(f"Processing {state} {year}")

        # (state_county, ym) -> {part label -> DataFrame}
        monthly_data = defaultdict(dict)

        for file_key in file_list:
            try:
                # e.g. Arkansas_Jackson_2014-08_part1.csv
                name_fields = os.path.basename(file_key).split('_')
                state_county = f"{name_fields[0]}_{name_fields[1]}"
                ym = name_fields[2]
                part = name_fields[3].split('.')[0]  # e.g., part1

                obj = s3.get_object(Bucket=bucket_name, Key=file_key)
                df = pd.read_csv(io.BytesIO(obj['Body'].read()))
                df['datetime'] = pd.to_datetime(df['datetime'])
                monthly_data[(state_county, ym)][part] = df
            except Exception as e:
                print(f"[ERROR] Failed to read {file_key}: {e}")
                continue

        merged_all = []
        for (state_county, ym), part_frames in monthly_data.items():
            try:
                # Only part1/part2 are combined; make any other label visible
                # rather than dropping it without a trace.
                unexpected = sorted(set(part_frames) - {'part1', 'part2'})
                if unexpected:
                    print(f"[WARN] Ignoring unexpected parts for {state_county} {ym}: {unexpected}")

                if 'part1' in part_frames and 'part2' in part_frames:
                    # pd.merge defaults to an inner join: timestamps present in only
                    # one of the two parts are not carried into the result.
                    merged = pd.merge(
                        part_frames['part1'],
                        part_frames['part2'],
                        on='datetime',
                        suffixes=('_part1', '_part2'),
                    )
                elif 'part1' in part_frames:
                    merged = part_frames['part1']
                elif 'part2' in part_frames:
                    merged = part_frames['part2']
                else:
                    continue
                merged['county'] = state_county
                merged_all.append(merged)
            except Exception as e:
                print(f"[ERROR] Failed to merge {state_county} {ym}: {e}")
                continue

        if merged_all:
            final_df = pd.concat(merged_all, ignore_index=True)
            output_name = f"{state}_counties_{year}.csv"
            local_path = f"/tmp/{output_name}"
            target_key = f"{target_prefix}{output_name}"

            try:
                final_df.to_csv(local_path, index=False)

                # Ensure file is not empty
                if not os.path.exists(local_path) or os.path.getsize(local_path) == 0:
                    raise Exception(f"[EMPTY] Local file is missing or empty: {local_path}")

                # Upload to S3
                s3.upload_file(local_path, bucket_name, target_key)

                # Double check upload before reporting success
                resp = s3.list_objects_v2(Bucket=bucket_name, Prefix=target_key)
                found = any(obj['Key'] == target_key for obj in resp.get('Contents', []))
                if not found:
                    raise Exception(f"[S3-MISS] File not found in S3 after upload: {output_name}")

                print(f"[UPLOAD ✅] s3://{bucket_name}/{target_key} ({len(final_df)} rows)")

            except Exception as e:
                print(f"[ERROR ❌] Upload failed for {output_name}: {e}")

            finally:
                # Remove the local temp file whether or not the upload succeeded,
                # so failed runs do not leave large CSVs behind in /tmp.
                if os.path.exists(local_path):
                    os.remove(local_path)

Processing states:   0%|          | 0/6 [00:00<?, ?it/s]
Processing Arkansas 2023:   0%|          | 0/24 [00:00<?, ?it/s][A
Processing Arkansas 2023:   4%|▍         | 1/24 [00:15<05:57, 15.53s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2000.csv (35136 rows)



Processing Arkansas 2023:   8%|▊         | 2/24 [00:31<05:44, 15.68s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2001.csv (35040 rows)



Processing Arkansas 2023:  12%|█▎        | 3/24 [00:47<05:34, 15.94s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2002.csv (35040 rows)



Processing Arkansas 2023:  17%|█▋        | 4/24 [01:03<05:20, 16.01s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2003.csv (35040 rows)



Processing Arkansas 2023:  21%|██        | 5/24 [01:19<05:03, 15.99s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2004.csv (35136 rows)



Processing Arkansas 2023:  25%|██▌       | 6/24 [01:35<04:45, 15.88s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2005.csv (35040 rows)



Processing Arkansas 2023:  29%|██▉       | 7/24 [01:50<04:27, 15.74s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2006.csv (35040 rows)



Processing Arkansas 2023:  33%|███▎      | 8/24 [02:06<04:13, 15.83s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2007.csv (35040 rows)



Processing Arkansas 2023:  38%|███▊      | 9/24 [02:22<03:55, 15.73s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2008.csv (35136 rows)



Processing Arkansas 2023:  42%|████▏     | 10/24 [02:38<03:41, 15.82s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2009.csv (35040 rows)



Processing Arkansas 2023:  46%|████▌     | 11/24 [02:54<03:25, 15.78s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2010.csv (35040 rows)



Processing Arkansas 2023:  50%|█████     | 12/24 [03:09<03:10, 15.84s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2011.csv (35040 rows)



Processing Arkansas 2023:  54%|█████▍    | 13/24 [03:25<02:53, 15.76s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2012.csv (35136 rows)



Processing Arkansas 2023:  58%|█████▊    | 14/24 [03:40<02:36, 15.65s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2013.csv (35040 rows)



Processing Arkansas 2023:  62%|██████▎   | 15/24 [03:56<02:21, 15.72s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2014.csv (35040 rows)



Processing Arkansas 2023:  67%|██████▋   | 16/24 [04:12<02:05, 15.72s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2015.csv (35040 rows)



Processing Arkansas 2023:  71%|███████   | 17/24 [04:28<01:50, 15.80s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2016.csv (35136 rows)



Processing Arkansas 2023:  75%|███████▌  | 18/24 [04:44<01:35, 15.87s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2017.csv (35040 rows)



Processing Arkansas 2023:  79%|███████▉  | 19/24 [05:00<01:19, 15.83s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2018.csv (35040 rows)



Processing Arkansas 2023:  83%|████████▎ | 20/24 [05:15<01:02, 15.61s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2019.csv (35040 rows)



Processing Arkansas 2023:  88%|████████▊ | 21/24 [05:30<00:46, 15.57s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2020.csv (35136 rows)



Processing Arkansas 2023:  92%|█████████▏| 22/24 [05:45<00:30, 15.35s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2021.csv (35040 rows)



Processing Arkansas 2023:  96%|█████████▌| 23/24 [06:01<00:15, 15.58s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2022.csv (35040 rows)



Processing Arkansas 2023: 100%|██████████| 24/24 [06:17<00:00, 15.57s/it][A
Processing states:  17%|█▋        | 1/6 [06:17<31:28, 377.67s/it]        [A

[UPLOAD ✅] s3://pepper-dataset/Temp/Arkansas_counties_2023.csv (35040 rows)



Processing Illinois 2023:   0%|          | 0/24 [00:00<?, ?it/s][A
Processing Illinois 2023:   4%|▍         | 1/24 [02:31<57:56, 151.16s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Illinois_counties_2000.csv (386496 rows)



Processing Illinois 2023:   8%|▊         | 2/24 [04:59<54:46, 149.41s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Illinois_counties_2001.csv (385440 rows)



Processing Illinois 2023:  12%|█▎        | 3/24 [07:21<51:11, 146.28s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Illinois_counties_2002.csv (385440 rows)



Processing Illinois 2023:  17%|█▋        | 4/24 [09:42<48:02, 144.11s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Illinois_counties_2003.csv (385440 rows)



Processing Illinois 2023:  21%|██        | 5/24 [12:04<45:20, 143.17s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Illinois_counties_2004.csv (386496 rows)



Processing Illinois 2023:  25%|██▌       | 6/24 [14:24<42:37, 142.11s/it][A

[UPLOAD ✅] s3://pepper-dataset/Temp/Illinois_counties_2005.csv (385440 rows)


In [12]:
# Debug peek: `merged_all` is rebound to a fresh list for every (state, year) in the
# Step 3 loop, so this only shows what the most recently processed group left behind.
merged_all

[]