In [1]:
import dask.dataframe as dd

In [37]:
# Load our data
df = dd.read_csv('weather_cleaned.csv')

In [38]:
df["date"] = dd.to_datetime(df["date"], errors="coerce")
df = df.dropna(subset=['date'])
df = df[df["date"].dt.year != 1958]

In [40]:
df["date"] = dd.to_datetime(df["date"], errors="coerce")
df = df.dropna(subset=['date'])
df = df[df["date"].dt.year != 1959]

In [41]:
min_date = df['date'].min().compute()
max_date = df['date'].max().compute()

print(f"Ngày bắt đầu có dữ liệu: {min_date}")
print(f"Ngày kết thúc dữ liệu: {max_date}")


Ngày bắt đầu có dữ liệu: 1960-01-01 00:00:00
Ngày kết thúc dữ liệu: 2025-06-03 21:30:00


In [42]:
df.head()

Unnamed: 0,date,year,month,day,tmp_c,dew_c,slp_hpa,vis_m,cloud_ceiling,wind_dir,wind_speed,station,call_sign,report_type,quality_control
0,1967-07-31 23:00:00,1967,7,31,26.8,21.8,1002.9,16000.0,2400.0,230.0,1.0,48855699999,IAHO,SAO,V020
1,1967-08-01 00:00:00,1967,8,1,27.4,22.4,1003.2,16000.0,2400.0,240.0,1.0,48855699999,IAHO,SAO,V020
2,1967-08-01 01:00:00,1967,8,1,29.1,21.8,1003.6,16000.0,22000.0,240.0,1.0,48855699999,IAHO,SAO,V020
3,1967-08-01 02:00:00,1967,8,1,30.2,22.4,1003.9,16000.0,22000.0,210.0,2.6,48855699999,IAHO,SAO,V020
4,1967-08-01 03:00:00,1967,8,1,31.3,21.3,1003.9,16000.0,22000.0,220.0,3.6,48855699999,IAHO,SAO,V020


In [43]:
df.dtypes

date                datetime64[ns]
year                         int64
month                        int64
day                          int64
tmp_c                      float64
dew_c                      float64
slp_hpa                    float64
vis_m                      float64
cloud_ceiling              float64
wind_dir                   float64
wind_speed                 float64
station                      int64
call_sign          string[pyarrow]
report_type        string[pyarrow]
quality_control    string[pyarrow]
dtype: object

In [17]:
import pandas as pd
df = pd.read_csv("weather_merged_2025.csv")


In [18]:
df.head()

Unnamed: 0,EQD,GF1,QUALITY_CONTROL,SLP,KA1,MW1,SOURCE,AA1,REPORT_TYPE,CIG,CALL_SIGN,DEW,WND,MD1,AY1,VIS,STATION,TMP,DATE,STATION_FOLDER
0,Q01 003SCCGA1,06991001999999999999999,V020,101831,,101.0,4,,FM-12,"99999,9,9,N",99999,1221,"360,1,N,0060,1","3,1,006,1,+999,9",21021.0,4000199,48839099999,1821,2025-01-01T00:00:00,BACH_LONG_VI_48839099999
1,,06991999999999999999999,V020,101921,,101.0,4,,FM-12,"99999,9,9,N",99999,1311,"050,1,N,0060,1","2,1,010,1,-031,1",21021.0,4000199,48839099999,2071,2025-01-01T03:00:00,BACH_LONG_VI_48839099999
2,,07991011999004501999999,V020,101661,,,4,,FM-12,"99999,9,9,N",99999,1351,"360,1,N,0050,1","8,1,026,1,+999,9",,10000199,48839099999,2131,2025-01-01T06:00:00,BACH_LONG_VI_48839099999
3,,06991999999999999999999,V020,101501,,101.0,4,,FM-12,"99999,9,9,N",99999,1401,"050,1,N,0040,1","6,1,016,1,-025,1",21021.0,4000199,48839099999,2011,2025-01-01T09:00:00,BACH_LONG_VI_48839099999
4,Q01 003SCCGA1,06991001999999999999999,V020,101591,,101.0,4,,FM-12,"99999,9,9,N",99999,1461,"070,1,N,0030,1","3,1,008,1,+999,9",21021.0,4000199,48839099999,1841,2025-01-01T12:00:00,BACH_LONG_VI_48839099999


In [12]:
import pandas as pd
import numpy as np

df = pd.read_csv("weather_merged_2025.csv")
df["DATE"] = pd.to_datetime(df["DATE"], errors="coerce")


def parse_wnd(wnd):
    try:
        parts = str(wnd).split(',')
        return float(parts[0]), float(parts[3])
    except:
        return np.nan, np.nan

df[['wind_dir', 'wind_speed']] = df['WND'].apply(lambda x: pd.Series(parse_wnd(x)))

def parse_temp(temp_str):
    try:
        if pd.isna(temp_str): return np.nan
        temp = int(temp_str.split(',')[0]) / 10.0
        if temp == 999.9: return np.nan
        return temp
    except:
        return np.nan

df['tmp_c'] = df['TMP'].apply(parse_temp)

df['dew_c'] = df['DEW'].apply(parse_temp)
df['slp_hpa'] = df['SLP'].apply(parse_temp)

def parse_vis(vis_str):
    try:
        if pd.isna(vis_str): return np.nan
        return int(vis_str.split(',')[0])
    except:
        return np.nan

df['vis_m'] = df['VIS'].apply(parse_vis)

def parse_cig(cig_str):
    try:
        if pd.isna(cig_str): return np.nan
        return int(cig_str.split(',')[0]) * 30.48  
    except:
        return np.nan

df['cloud_ceiling'] = df['CIG'].apply(parse_cig)

df['year'] = df['DATE'].dt.year
df['month'] = df['DATE'].dt.month
df['day'] = df['DATE'].dt.day
df['hour'] = df['DATE'].dt.hour


clean_df = df[[
    'DATE', 'year', 'month', 'day', 'hour',
    'tmp_c', 'dew_c', 'slp_hpa', 'vis_m', 'cloud_ceiling',
    'wind_dir', 'wind_speed',
    'STATION', 'CALL_SIGN', 'STATION_FOLDER', 'REPORT_TYPE', 'QUALITY_CONTROL'
]]

clean_df.to_csv("weather_2025_cleaned.csv", index=False, encoding="utf-8")


In [13]:

columns_to_drop = [
    'TMP', 'DEW', 'SLP', 'WND', 'VIS', 'CIG',
    'AA1', 'AY1', 'GF1', 'KA1', 'MD1', 'MW1', 'EQD',
    'QUALITY_CONTROL', 'REPORT_TYPE', 'CALL_SIGN'
]

df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True)

final_df = df[[
    'DATE', 'year', 'month', 'day', 'hour',
    'tmp_c', 'dew_c', 'slp_hpa', 'vis_m', 'cloud_ceiling',
    'wind_dir', 'wind_speed',
    'STATION', 'STATION_FOLDER'
]]

final_df.to_csv("weather_2025_cleaned.csv", index=False, encoding="utf-8")
print("weather_2025_cleaned.csv")


weather_2025_cleaned.csv


In [14]:
min_date = df["DATE"].min()
max_date = df["DATE"].max()

print(f"Ngày bắt đầu có dữ liệu: {min_date}")
print(f"Ngày kết thúc dữ liệu: {max_date}")

Ngày bắt đầu có dữ liệu: 2025-01-01 00:00:00
Ngày kết thúc dữ liệu: 2025-08-23 21:30:00


In [15]:
import pandas as pd
df = pd.read_csv("weather_2025_cleaned.csv")

In [16]:
df.head()

Unnamed: 0,DATE,year,month,day,hour,tmp_c,dew_c,slp_hpa,vis_m,cloud_ceiling,wind_dir,wind_speed,STATION,STATION_FOLDER
0,2025-01-01 00:00:00,2025,1,1,0,18.2,12.2,1018.3,4000,3047969.52,360.0,60.0,48839099999,BACH_LONG_VI_48839099999
1,2025-01-01 03:00:00,2025,1,1,3,20.7,13.1,1019.2,4000,3047969.52,50.0,60.0,48839099999,BACH_LONG_VI_48839099999
2,2025-01-01 06:00:00,2025,1,1,6,21.3,13.5,1016.6,10000,3047969.52,360.0,50.0,48839099999,BACH_LONG_VI_48839099999
3,2025-01-01 09:00:00,2025,1,1,9,20.1,14.0,1015.0,4000,3047969.52,50.0,40.0,48839099999,BACH_LONG_VI_48839099999
4,2025-01-01 12:00:00,2025,1,1,12,18.4,14.6,1015.9,4000,3047969.52,70.0,30.0,48839099999,BACH_LONG_VI_48839099999


In [7]:
df.dtypes

DATE               object
year                int64
month               int64
day                 int64
hour                int64
tmp_c             float64
dew_c             float64
slp_hpa           float64
vis_m               int64
cloud_ceiling     float64
wind_dir          float64
wind_speed        float64
STATION             int64
STATION_FOLDER     object
dtype: object

In [9]:
df.head(100)

Unnamed: 0,DATE,year,month,day,hour,tmp_c,dew_c,slp_hpa,vis_m,cloud_ceiling,wind_dir,wind_speed,STATION,STATION_FOLDER
0,2025-01-01 00:00:00,2025,1,1,0,18.2,12.2,1018.3,4000,3047969.52,360.0,60.0,48839099999,BACH_LONG_VI_48839099999
1,2025-01-01 03:00:00,2025,1,1,3,20.7,13.1,1019.2,4000,3047969.52,50.0,60.0,48839099999,BACH_LONG_VI_48839099999
2,2025-01-01 06:00:00,2025,1,1,6,21.3,13.5,1016.6,10000,3047969.52,360.0,50.0,48839099999,BACH_LONG_VI_48839099999
3,2025-01-01 09:00:00,2025,1,1,9,20.1,14.0,1015.0,4000,3047969.52,50.0,40.0,48839099999,BACH_LONG_VI_48839099999
4,2025-01-01 12:00:00,2025,1,1,12,18.4,14.6,1015.9,4000,3047969.52,70.0,30.0,48839099999,BACH_LONG_VI_48839099999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2025-01-13 06:00:00,2025,1,13,6,20.4,11.7,1021.0,10000,670560.00,70.0,50.0,48839099999,BACH_LONG_VI_48839099999
96,2025-01-13 09:00:00,2025,1,13,9,18.7,9.6,1018.9,10000,3047969.52,50.0,50.0,48839099999,BACH_LONG_VI_48839099999
97,2025-01-13 12:00:00,2025,1,13,12,16.8,10.8,1020.0,10000,3047969.52,70.0,50.0,48839099999,BACH_LONG_VI_48839099999
98,2025-01-13 15:00:00,2025,1,13,15,17.6,10.7,1020.9,10000,3047969.52,110.0,30.0,48839099999,BACH_LONG_VI_48839099999
