In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
def merge_city_data(data_dict):
    """
    Gộp tất cả các DataFrame từ dictionary vào một DataFrame duy nhất.
    Thêm cột 'city' để xác định nguồn gốc dữ liệu.

    Parameters:
        data_dict (dict): Dictionary chứa DataFrame cho mỗi thành phố.

    Returns:
        pd.DataFrame: DataFrame đã được gộp.
    """
    dfs = []
    for city_name, df in data_dict.items():
        df_copy = df.copy()  # Tạo bản sao để không sửa đổi DataFrame gốc trong data_dict
        df_copy['city'] = city_name
        dfs.append(df_copy)
    merged_df = pd.concat(dfs, ignore_index=True)
    return merged_df

In [23]:
def load_data(data_folder):
    """
    Load tất cả các file CSV trong folder được chỉ định.
    Mỗi file được coi là dữ liệu của một thành phố.

    Parameters:
        data_folder (str): Đường dẫn đến thư mục chứa các file CSV.

    Returns:
        dict: Dictionary chứa DataFrame cho mỗi thành phố, với tên file làm key.
              Ví dụ: {'city1': DataFrame_city1, 'city2': DataFrame_city2, ...}
    """
    data_dict = {}
    for file in Path(data_folder).glob('*.csv'):
        city_name = file.stem
        try:
            df = pd.read_csv(file)
            df['datetime'] = pd.to_datetime(df['datetime'])
            data_dict[city_name] = df

        except Exception as e:
            print(f"Error loading data for {city_name}: {e}")
    return data_dict

In [50]:
data_dict = load_data('../datasets')

# Gộp dữ liệu của tất cả các thành phố vào một DataFrame duy nhất
df = merge_city_data(data_dict)

In [37]:
df.head(5)

Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,uvindex,severerisk,sunrise,sunset,moonphase,conditions,description,icon,stations,city
0,"Đà Nẵng, Việt Nam",2000-01-01,25.0,19.1,21.5,25.0,19.1,21.5,19.6,89.3,...,,,06:14:40,17:25:53,0.82,"Rain, Partially cloudy",Partly cloudy throughout the day with afternoo...,rain,"['48855099999', '48852099999']",DaNang
1,"Đà Nẵng, Việt Nam",2000-01-02,22.1,19.2,21.0,22.1,19.2,21.0,19.7,92.4,...,,,06:15:01,17:26:28,0.85,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.,rain,"['48855099999', '48852099999']",DaNang
2,"Đà Nẵng, Việt Nam",2000-01-03,25.1,18.7,21.5,25.1,18.7,21.5,19.4,88.7,...,,,06:15:23,17:27:04,0.88,"Rain, Partially cloudy",Partly cloudy throughout the day with early mo...,rain,"['48855099999', '48852099999']",DaNang
3,"Đà Nẵng, Việt Nam",2000-01-04,26.1,17.9,22.9,26.1,17.9,22.9,19.9,84.7,...,,,06:15:43,17:27:39,0.92,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"['48855099999', '48852099999']",DaNang
4,"Đà Nẵng, Việt Nam",2000-01-05,28.0,19.9,23.8,29.7,19.9,24.0,20.9,85.5,...,,,06:16:02,17:28:15,0.95,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"['48855099999', '48852099999']",DaNang


In [52]:
keep_columns = ['datetime', 'temp', 'tempmax', 'tempmin', 'humidity', 'precip', 'cloudcover', 'visibility', 'city']

In [53]:
df = df[keep_columns]
df.columns

Index(['datetime', 'temp', 'tempmax', 'tempmin', 'humidity', 'precip',
       'cloudcover', 'visibility', 'city'],
      dtype='object')

In [54]:
df = df[df['datetime'] >= '2010-01-01']     
# Đảm bảo cột datetime ở dạng datetime object
df['datetime'] = pd.to_datetime(df['datetime'])

# Tạo 2 cột mới
df['day'] = df['datetime'].dt.day
df['month'] = df['datetime'].dt.month
df = pd.get_dummies(df, columns=['city'])


In [55]:
df.head(5)

Unnamed: 0,datetime,temp,tempmax,tempmin,humidity,precip,cloudcover,visibility,day,month,city_DaNang,city_Hanoi,city_HoChiMinh
3653,2010-01-01,23.8,28.0,20.1,81.9,0.0,67.5,8.3,1,1,True,False,False
3654,2010-01-02,24.2,28.6,20.0,84.1,0.0,58.2,7.4,2,1,True,False,False
3655,2010-01-03,24.1,26.2,21.0,84.9,0.004,82.3,6.3,3,1,True,False,False
3656,2010-01-04,24.4,28.1,20.1,81.0,0.0,69.1,7.8,4,1,True,False,False
3657,2010-01-05,24.4,28.6,19.7,81.9,0.0,58.5,7.6,5,1,True,False,False


In [56]:
df.isna().sum()

datetime          0
temp              0
tempmax           0
tempmin           0
humidity          0
precip            0
cloudcover        0
visibility        0
day               0
month             0
city_DaNang       0
city_Hanoi        0
city_HoChiMinh    0
dtype: int64

In [57]:
df = df.sort_values(by="datetime")
train_size = int(len(df) * 0.7)
val_size = int(len(df) * 0.15)

train_df = df.iloc[:train_size]
val_df = df.iloc[train_size:train_size + val_size]
test_df = df.iloc[train_size + val_size:]

print(f"Train: {len(train_df)} samples, from {train_df['datetime'].min()} to {train_df['datetime'].max()}")
print(f"Validation: {len(val_df)} samples, from {val_df['datetime'].min()} to {val_df['datetime'].max()}")
print(f"Test: {len(test_df)} samples, from {test_df['datetime'].min()} to {test_df['datetime'].max()}")


Train: 11603 samples, from 2010-01-01 00:00:00 to 2020-08-03 00:00:00
Validation: 2486 samples, from 2020-08-03 00:00:00 to 2022-11-10 00:00:00
Test: 2487 samples, from 2022-11-10 00:00:00 to 2025-02-13 00:00:00


In [58]:
import os

# Tạo thư mục nếu chưa tồn tại
output_dir = "../datasets/train_val_test/"
os.makedirs(output_dir, exist_ok=True)

# Xuất các file CSV
train_df.to_csv(os.path.join(output_dir, "train.csv"), index=False)
val_df.to_csv(os.path.join(output_dir, "val.csv"), index=False)
test_df.to_csv(os.path.join(output_dir, "test.csv"), index=False)

print(f"Files saved to {output_dir}")

Files saved to ../datasets/train_val_test/
