In [5]:
import boto3
import json
import pandas as pd
from io import BytesIO

s3 = boto3.client('s3')
bucket = 'kinesis-lambda-s3-bucket1'  # your bucket
prefix = 'weather_data/2025'  # or broader like 'weather_data/'

records = []

# List all JSON files under the prefix
objects = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)

for obj in objects.get('Contents', []):
    key = obj['Key']
    if key.endswith('.json'):
        file = s3.get_object(Bucket=bucket, Key=key)
        content = file['Body'].read().decode('utf-8')
        data = json.loads(content)
        records.append(data)

# Convert to DataFrame
df = pd.DataFrame(records)

# Optional Cleaning: drop NA, select columns, convert datatypes
df_cleaned = df.dropna()
df_cleaned['date'] = pd.to_datetime(df_cleaned['date'])

# Save cleaned dataset as CSV back to S3
csv_buffer = BytesIO()
df_cleaned.to_csv(csv_buffer, index=False)

cleaned_key = 'cleaned/weather_data_cleaned.csv'
s3.put_object(Bucket=bucket, Key=cleaned_key, Body=csv_buffer.getvalue())

print(f"✅ Cleaned dataset saved to s3://{bucket}/{cleaned_key}")


✅ Cleaned dataset saved to s3://kinesis-lambda-s3-bucket1/cleaned/weather_data_cleaned.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['date'] = pd.to_datetime(df_cleaned['date'])


In [1]:
import boto3
import json
import pandas as pd
from io import BytesIO

# AWS S3 Setup
s3 = boto3.client('s3')
bucket_name = 'kinesis-lambda-s3-bucket1'  # replace with your bucket
prefix = 'weather_data/'

all_records = []

# Step 1: List folders (date prefixes)
folders = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix, Delimiter='/')
for folder in folders.get('CommonPrefixes', []):
    folder_prefix = folder['Prefix']
    
    # Step 2: List all JSON files in the folder
    objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_prefix)
    for obj in objects.get('Contents', []):
        key = obj['Key']
        if key.endswith('.json'):
            file = s3.get_object(Bucket=bucket_name, Key=key)
            content = file['Body'].read().decode('utf-8')
            try:
                data = json.loads(content)
                all_records.append(data)
            except Exception as e:
                print(f"❌ Skipped invalid JSON in {key}: {e}")

# Step 3: Convert to DataFrame and clean
df = pd.DataFrame(all_records)

# Optional: drop rows with missing required values
df_cleaned = df.dropna(subset=['date', 'station'])

# Optional: convert date column to datetime type
df_cleaned['date'] = pd.to_datetime(df_cleaned['date'])

# Step 4: Upload merged CSV to S3
csv_buffer = BytesIO()
df_cleaned.to_csv(csv_buffer, index=False)

output_key = 'cleaned/cleaned_weather_dataset.csv'
s3.put_object(Bucket=bucket_name, Key=output_key, Body=csv_buffer.getvalue())

print(f"✅ Cleaned dataset uploaded to s3://{bucket_name}/{output_key}")


✅ Cleaned dataset uploaded to s3://kinesis-lambda-s3-bucket1/cleaned/cleaned_weather_dataset.csv


In [9]:
import boto3
import os
from pathlib import Path

# --- Configuration ---
bucket_name = 'kinesis-lambda-s3-bucket1'
downloads_dir = Path.home() / 'Downloads' / 's3_downloads'  # Creates a subfolder in Downloads

# --- Setup ---
s3 = boto3.client('s3')
downloads_dir.mkdir(parents=True, exist_ok=True)

# List and download files
response = s3.list_objects_v2(Bucket=bucket_name)

if 'Contents' in response:
    for obj in response['Contents']:
        key = obj['Key']
        local_file_path = downloads_dir / key
        local_file_path.parent.mkdir(parents=True, exist_ok=True)

        print(f"Downloading: {key} -> {local_file_path}")
        s3.download_file(bucket_name, key, str(local_file_path))
else:
    print("No files found in the bucket.")


Downloading: cleaned/cleaned_weather_dataset.csv -> C:\Users\DELL\Downloads\s3_downloads\cleaned\cleaned_weather_dataset.csv
Downloading: weather_data/2025-03-01/weather_1743644344.431772.json -> C:\Users\DELL\Downloads\s3_downloads\weather_data\2025-03-01\weather_1743644344.431772.json
Downloading: weather_data/2025-03-02/weather_1743644423.458344.json -> C:\Users\DELL\Downloads\s3_downloads\weather_data\2025-03-02\weather_1743644423.458344.json
Downloading: weather_data/2025-03-03/weather_1743644428.450659.json -> C:\Users\DELL\Downloads\s3_downloads\weather_data\2025-03-03\weather_1743644428.450659.json
Downloading: weather_data/2025-03-04/weather_1743644435.437231.json -> C:\Users\DELL\Downloads\s3_downloads\weather_data\2025-03-04\weather_1743644435.437231.json
Downloading: weather_data/2025-03-05/weather_1743644442.466469.json -> C:\Users\DELL\Downloads\s3_downloads\weather_data\2025-03-05\weather_1743644442.466469.json
Downloading: weather_data/2025-03-06/weather_1743644449.5814

In [10]:
import boto3
s3 = boto3.client("s3")
response = s3.list_objects_v2(Bucket="kinesis-lambda-s3-bucket1", Prefix="models/")
for obj in response.get("Contents", []):
    print(obj['Key'])
