In [13]:
import boto3
import json
import pandas as pd
from io import BytesIO

# Collect the raw weather JSON objects stored under `prefix` in S3, clean them
# with pandas, and write the result back to the same bucket as a single CSV.
s3 = boto3.client('s3')
bucket = 'kinesis-lambda-s3-bucket1'  # your bucket
prefix = 'weather_data/2024-03-25/'  # or broader like 'weather_data/'
cleaned_key = 'cleaned/weather_data_cleaned.csv'

records = []

# A single list_objects_v2 call returns at most 1,000 keys, so walk every
# result page; otherwise larger prefixes would be silently truncated.
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
    # 'Contents' is absent from the response when nothing matches the prefix.
    for obj in page.get('Contents', []):
        key = obj['Key']
        if not key.endswith('.json'):
            continue
        response = s3.get_object(Bucket=bucket, Key=key)
        content = response['Body'].read().decode('utf-8')
        records.append(json.loads(content))

# Fail early with a clear message instead of an opaque KeyError on 'date'
# further down, and before anything is uploaded.
if not records:
    raise ValueError(f"No .json objects found under s3://{bucket}/{prefix}")

# Convert to DataFrame
df = pd.DataFrame(records)

# Cleaning: drop rows containing any missing value, then parse the date column.
# The explicit copy makes df_cleaned independent of df so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
df_cleaned = df.dropna().copy()
df_cleaned['date'] = pd.to_datetime(df_cleaned['date'])

# Save cleaned dataset as CSV back to S3
csv_buffer = BytesIO()
df_cleaned.to_csv(csv_buffer, index=False)
s3.put_object(Bucket=bucket, Key=cleaned_key, Body=csv_buffer.getvalue())

print(f"✅ Cleaned dataset saved to s3://{bucket}/{cleaned_key}")


✅ Cleaned dataset saved to s3://kinesis-lambda-s3-bucket1/cleaned/weather_data_cleaned.csv
