In [14]:
pip install pandas pyarrow fsspec s3fs

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [41]:
import requests
import pandas as pd
from datetime import datetime
import pytz

# API endpoint and parameters
WEATHER_ENDPOINT = "https://api.openweathermap.org/data/2.5/weather"
API_KEY = "70e208d9d8ba1534136297fb1f3fe396"  # Replace with your actual API key


locations = {
    "Satitram Alumni": {"lat": 13.754174, "lon": 100.615676},
}

def get_weather_data(location_name='Satitram Alumni'):
    lat = locations[location_name]['lat']
    lon = locations[location_name]['lon']

    params = {
        "lat": lat,
        "lon": lon,
        "appid": API_KEY,
        "units": "metric",
        "lang": "th"
    }
    try:
        # Make API request
        response = requests.get(WEATHER_ENDPOINT, params=params)
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
        
        # Convert timestamp to datetime
        # created_at = datetime.fromtimestamp(data['dt'])

        dt = datetime.now()
        thai_tz = pytz.timezone('Asia/Bangkok')
        created_at = dt.replace(tzinfo=thai_tz)


        timestamp = datetime.now()
        
        # Create dictionary with required fields
        weather_dict = {
            'timestamp': timestamp,
            'year': timestamp.year,
            'month': timestamp.month,
            'day': timestamp.day,
            'hour': timestamp.hour,
            'minute': timestamp.minute,
            'created_at': created_at,
            'location': location_name,
            'temperature': data['main']['temp'],
            'feels_like': data['main']['feels_like'],
            'humidity': data['main']['humidity'],
            'pressure': data['main']['pressure'],
            'wind_speed': data['wind']['speed'],
            'visibility': data.get('visibility'),
            'weather_main': data['weather'][0]['main'],
            'weather_description': data['weather'][0]['description']
        }
        
        # Create DataFrame
        # df = pd.DataFrame([weather_dict])
        
        # return df
        return weather_dict
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None
    except KeyError as e:
        print(f"Error processing data: Missing key {e}")
        return None

In [42]:
df=pd.DataFrame([get_weather_data(p) for p in list(locations.keys())])
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype                       
---  ------               --------------  -----                       
 0   timestamp            1 non-null      datetime64[ns]              
 1   year                 1 non-null      int64                       
 2   month                1 non-null      int64                       
 3   day                  1 non-null      int64                       
 4   hour                 1 non-null      int64                       
 5   minute               1 non-null      int64                       
 6   created_at           1 non-null      datetime64[ns, Asia/Bangkok]
 7   location             1 non-null      object                      
 8   temperature          1 non-null      float64                     
 9   feels_like           1 non-null      float64                     
 10  humidity             1 non-null      int64

Unnamed: 0,timestamp,year,month,day,hour,minute,created_at,location,temperature,feels_like,humidity,pressure,wind_speed,visibility,weather_main,weather_description
0,2025-05-09 09:01:29.866781,2025,5,9,9,1,2025-05-09 09:19:29.866494+07:00,Satitram Alumni,35.3,42.3,58,1003,4.12,10000,Thunderstorm,พายุฟ้าคะนอง


In [19]:

dt = datetime.now()
thai_tz = pytz.timezone('Asia/Bangkok')
dt = dt.replace(tzinfo=thai_tz)
print(dt) 

2025-05-09 08:43:12.246090+06:42


In [20]:
import pandas as pd

# lakeFS credentials from your docker-compose.yml
ACCESS_KEY = "access_key"
SECRET_KEY = "secret_key"

# lakeFS endpoint (running locally)
lakefs_endpoint = "http://lakefs-dev:8000/"

# lakeFS repository, branch, and file path
repo = "weather"
branch = "main"
path = "weather.parquet"

# Construct the full lakeFS S3-compatible path
lakefs_s3_path = f"s3a://{repo}/{branch}/{path}"

# Configure storage_options for lakeFS (S3-compatible)
storage_options = {
    "key": ACCESS_KEY,
    "secret": SECRET_KEY,
    "client_kwargs": {
        "endpoint_url": lakefs_endpoint
    }
}

In [23]:
df.to_parquet(
    lakefs_s3_path,
    storage_options=storage_options,
    partition_cols=['year','month','day','hour'],
    
)

# test read parquet files

In [25]:
pip install fsspec

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [43]:
fs.ls("s3a://weather/main/weather.parquet/")

['weather/main/weather.parquet/year=2025']

In [44]:
fs.ls("s3a://weather/main/weather.parquet/year=2025/")

['weather/main/weather.parquet/year=2025/month=5']

In [45]:
fs.ls("s3a://weather/main/weather.parquet/year=2025/month=5/day=8/")
fs.ls("s3a://weather/main/weather.parquet/year=2025/month=5/day=8/hour=6/")

['weather/main/weather.parquet/year=2025/month=5/day=8/hour=6/439dca34b97444a38671facd7d64e596-0.parquet',
 'weather/main/weather.parquet/year=2025/month=5/day=8/hour=6/5843a0197e6f4d3385f0c27edc89b412-0.parquet',
 'weather/main/weather.parquet/year=2025/month=5/day=8/hour=6/96d8715177f747f78b6bc8209d5b4109-0.parquet',
 'weather/main/weather.parquet/year=2025/month=5/day=8/hour=6/d99fcd42f2fd410dbacae4f65539620a-0.parquet',
 'weather/main/weather.parquet/year=2025/month=5/day=8/hour=6/fa7200c1242c4859bd91000325b41611-0.parquet']

In [48]:
import pandas as pd

# ค้นหาพาธที่มีไฟล์จริงทั้งหมด
parquet_paths = fs.glob("weather/main/weather.parquet/year=*/month=*/day=*/hour=*/*.parquet")

# โหลดแต่ไฟล์ที่มีอยู่จริง
dfs = [pd.read_parquet(f"s3a://{path}", storage_options=storage_options) for path in parquet_paths]
df2 = pd.concat(dfs, ignore_index=True)

In [49]:
paths = fs.glob("weather/main/weather.parquet/year=*/month=*/day=*/hour=*/*.parquet")
missing = [p for p in paths if not fs.isfile(p)]

print(f"Missing files ({len(missing)}):")
for m in missing:
    print("-", m)

Missing files (0):


In [51]:
import pandas as pd

# 1. ดึงพาธของทุกไฟล์ .parquet ที่อยู่ในโครงสร้าง partition
all_paths = fs.glob("weather/main/weather.parquet/year=*/month=*/day=*/hour=*/*.parquet")

# 2. กรองเฉพาะพาธที่มีไฟล์อยู่จริง
existing_paths = [p for p in all_paths if fs.isfile(p)]

# 3. โหลดทุกไฟล์ที่เจอแล้วรวมกัน
dfs = [
    pd.read_parquet(f"s3a://{path}", storage_options=storage_options)
    for path in existing_paths
]
df2 = pd.concat(dfs, ignore_index=True)

# 4. ดูข้อมูล
df2.info()
df2.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2491 entries, 0 to 2490
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype                       
---  ------               --------------  -----                       
 0   timestamp            2491 non-null   datetime64[ns]              
 1   minute               2489 non-null   float64                     
 2   created_at           2489 non-null   datetime64[ns, Asia/Bangkok]
 3   requested_province   2487 non-null   object                      
 4   location             2491 non-null   object                      
 5   weather_main         2489 non-null   object                      
 6   weather_description  2489 non-null   object                      
 7   main.temp            2487 non-null   float64                     
 8   year                 2491 non-null   category                    
 9   month                2491 non-null   category                    
 10  day                  2491 non-null  

Unnamed: 0,timestamp,minute,created_at,requested_province,location,weather_main,weather_description,main.temp,year,month,day,hour,temperature,feels_like,humidity,pressure,wind_speed,visibility,lat,lon
0,2025-05-01 11:31:10.350466,31.0,2025-05-01 11:49:10.333970+07:00,Pathum Thani,Pathum Thani,Clouds,few clouds,33.15,2025,5,1,11,,,,,,,,
1,2025-05-01 11:31:10.717529,31.0,2025-05-01 11:49:10.717493+07:00,Bangkok,Bangkok,Clouds,overcast clouds,33.94,2025,5,1,11,,,,,,,,
2,2025-05-01 11:31:11.057869,31.0,2025-05-01 11:49:11.057842+07:00,Chiang Mai,Chiang Mai,Clouds,few clouds,31.28,2025,5,1,11,,,,,,,,
3,2025-05-01 11:31:11.341738,31.0,2025-05-01 11:49:11.341696+07:00,Phuket,Kathu,Rain,moderate rain,25.92,2025,5,1,11,,,,,,,,
4,2025-05-01 11:45:52.508233,45.0,2025-05-01 12:03:52.508159+07:00,Pathum Thani,Pathum Thani,Clouds,few clouds,32.89,2025,5,1,11,,,,,,,,


In [36]:
path_single_partition = 's3a://weather/main/weather.parquet/year=2025/month=5/day=1/hour=11/8aeb52f0592d41f08e38c309edb03084-0.parquet'

df2=pd.read_parquet(    
    path=path_single_partition,
    storage_options=storage_options,
)
df2.info()
df2.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype                       
---  ------               --------------  -----                       
 0   timestamp            4 non-null      datetime64[ns]              
 1   minute               4 non-null      int64                       
 2   created_at           4 non-null      datetime64[ns, Asia/Bangkok]
 3   requested_province   4 non-null      object                      
 4   location             4 non-null      object                      
 5   weather_main         4 non-null      object                      
 6   weather_description  4 non-null      object                      
 7   main.temp            4 non-null      float64                     
 8   year                 4 non-null      category                    
 9   month                4 non-null      category                    
 10  day                  4 non-null      categ

Unnamed: 0,timestamp,minute,created_at,requested_province,location,weather_main,weather_description,main.temp,year,month,day,hour
0,2025-05-01 11:31:10.350466,31,2025-05-01 11:49:10.333970+07:00,Pathum Thani,Pathum Thani,Clouds,few clouds,33.15,2025,5,1,11
1,2025-05-01 11:31:10.717529,31,2025-05-01 11:49:10.717493+07:00,Bangkok,Bangkok,Clouds,overcast clouds,33.94,2025,5,1,11
2,2025-05-01 11:31:11.057869,31,2025-05-01 11:49:11.057842+07:00,Chiang Mai,Chiang Mai,Clouds,few clouds,31.28,2025,5,1,11
3,2025-05-01 11:31:11.341738,31,2025-05-01 11:49:11.341696+07:00,Phuket,Kathu,Rain,moderate rain,25.92,2025,5,1,11


In [52]:
print(dt)

2025-05-09 08:43:12.246090+06:42


# Test Duck and Dask

In [58]:
lakefs_endpoint = "host.docker.internal:8001"  # หรือ IP เช่น "192.168.1.10:8001"

In [60]:
!aws --endpoint-url=http://localhost:8001 s3 ls s3://weather/main/weather.parquet/

/usr/bin/sh: 1: aws: not found


In [67]:
import duckdb

con = duckdb.connect(database=':memory:')

ACCESS_KEY = "access_key"
SECRET_KEY = "secret_key"
lakefs_endpoint = "host.docker.internal:8001"

query = f"""
INSTALL httpfs;
LOAD httpfs;

SET s3_endpoint='{lakefs_endpoint}';
SET s3_access_key_id='{ACCESS_KEY}'; 
SET s3_secret_access_key='{SECRET_KEY}'; 
SET s3_url_style='path';
SET s3_use_ssl=false;

SELECT * 
FROM read_parquet(
    's3://weather/main/weather.parquet/**/*.parquet',
    filename=true,
    hive_partitioning=true,
    union_by_name=true
);
"""

df_duck = con.execute(query).df()
print("DuckDB Parquet Query Result:")
df_duck.info()
df_duck.head(20)

DuckDB Parquet Query Result:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype                  
---  ------               --------------  -----                  
 0   timestamp            2500 non-null   datetime64[ns]         
 1   minute               2498 non-null   Int64                  
 2   created_at           2498 non-null   datetime64[us, Etc/UTC]
 3   requested_province   2496 non-null   object                 
 4   location             2500 non-null   object                 
 5   weather_main         2498 non-null   object                 
 6   weather_description  2498 non-null   object                 
 7   main.temp            2496 non-null   float64                
 8   temperature          4 non-null      float64                
 9   feels_like           2 non-null      float64                
 10  humidity             4 non-null      Int64                  
 11  p

Unnamed: 0,timestamp,minute,created_at,requested_province,location,weather_main,weather_description,main.temp,temperature,feels_like,...,pressure,wind_speed,visibility,lat,lon,filename,day,hour,month,year
0,2025-05-01 11:31:10.350466,31,2025-05-01 04:49:10.333970+00:00,Pathum Thani,Pathum Thani,Clouds,few clouds,33.15,,,...,,,,,,s3://weather/main/weather.parquet/year=2025/mo...,1,11,5,2025
1,2025-05-01 11:31:10.717529,31,2025-05-01 04:49:10.717493+00:00,Bangkok,Bangkok,Clouds,overcast clouds,33.94,,,...,,,,,,s3://weather/main/weather.parquet/year=2025/mo...,1,11,5,2025
2,2025-05-01 11:31:11.057869,31,2025-05-01 04:49:11.057842+00:00,Chiang Mai,Chiang Mai,Clouds,few clouds,31.28,,,...,,,,,,s3://weather/main/weather.parquet/year=2025/mo...,1,11,5,2025
3,2025-05-01 11:31:11.341738,31,2025-05-01 04:49:11.341696+00:00,Phuket,Kathu,Rain,moderate rain,25.92,,,...,,,,,,s3://weather/main/weather.parquet/year=2025/mo...,1,11,5,2025
4,2025-05-01 11:45:52.508233,45,2025-05-01 05:03:52.508159+00:00,Pathum Thani,Pathum Thani,Clouds,few clouds,32.89,,,...,,,,,,s3://weather/main/weather.parquet/year=2025/mo...,1,11,5,2025
5,2025-05-01 11:45:53.034224,45,2025-05-01 05:03:53.034176+00:00,Bangkok,Bangkok,Clouds,overcast clouds,33.83,,,...,,,,,,s3://weather/main/weather.parquet/year=2025/mo...,1,11,5,2025
6,2025-05-01 11:45:53.316882,45,2025-05-01 05:03:53.316867+00:00,Chiang Mai,Chiang Mai,Clouds,few clouds,31.46,,,...,,,,,,s3://weather/main/weather.parquet/year=2025/mo...,1,11,5,2025
7,2025-05-01 11:45:53.578773,45,2025-05-01 05:03:53.578760+00:00,Phuket,Kathu,Clouds,broken clouds,24.92,,,...,,,,,,s3://weather/main/weather.parquet/year=2025/mo...,1,11,5,2025
8,2025-05-01 11:31:10.350466,31,2025-05-01 04:49:10.333970+00:00,Pathum Thani,Pathum Thani,Clouds,few clouds,33.15,,,...,,,,,,s3://weather/main/weather.parquet/year=2025/mo...,1,11,5,2025
9,2025-05-01 11:31:10.717529,31,2025-05-01 04:49:10.717493+00:00,Bangkok,Bangkok,Clouds,overcast clouds,33.94,,,...,,,,,,s3://weather/main/weather.parquet/year=2025/mo...,1,11,5,2025


In [89]:
!pip install dask

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [90]:
import dask.dataframe as dd
df2 = dd.read_parquet(
    path=path_all_partition,
    storage_options=storage_options,
    dtype_backend='pyarrow'
)  

FileNotFoundError: An error occurred while calling the read_parquet method registered to the pandas backend.
Original Message: weather/main/weather.parquet/year=2025/month=5/day=8/hour=6/dd1741ec35434825a62561e9625ed3e8-0.parquet

In [91]:
import os

# ตรวจสอบพาธที่ไฟล์ถูกบันทึก
output_dir = '/home/jovyan/work/weather/main/weather.parquet'
os.listdir(output_dir)  # ตรวจสอบว่าไฟล์มีอยู่ในที่ตั้งนั้นหรือไม่

['test.parquet']

In [92]:
print(path_all_partition)  # ดูว่าพาธที่ใช้ถูกต้องหรือไม่

s3a://weather/main/weather.parquet


In [94]:
path_all_partition = '/home/jovyan/work/weather/main/weather.parquet/'