In [None]:
import os
import pandas as pd 

META_DATA_PATH = '../data/solar_energy/meta_data.csv' 
final_data_path = '../data/concat_data'
meta_data = pd.read_csv(META_DATA_PATH)
files = os.listdir(final_data_path)
sample = files[2]
print(f"Sample file: {sample}")
file = pd.read_parquet(os.path.join(final_data_path, sample))
columns = file.columns
print(f"Columns in {sample}: {columns}")

Sample file: 삼척소내_2.parquet
Columns in 삼척소내_2.parquet: Index(['date', '호기', '총량(kw)', '평균(kw)', '최대(kw)', '최소(kw)', '최대(시간별_kw)',
       '최소(시간별_kw)', 'value', 'time', 'temperature', 'humidity', 'rn', 'ws',
       'wd', 'pv', 'pa', 'ps', 'ss', 'icsr', 'dc10Tca', 'dc10LmcsCa', 'lcsCh',
       'vs', 'ts', 'sunrise', 'sunset', 'SO2', 'CO', 'O3', 'NO2', 'PM10',
       'PM25', '미세먼지', '초미세먼지'],
      dtype='object')


In [81]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import datetime
from tqdm import tqdm

def load_and_preprocess_data(meta_data_path, data_dir):
    """
    Load and preprocess all solar plant data files
    
    Args:
        meta_data_path: Path to metadata CSV
        data_dir: Directory containing parquet files
    
    Returns:
        processed_data: Dictionary of preprocessed dataframes by plant
        feature_cols: List of feature columns
        plants_info: Dictionary with information about each plant
    """
    # Load metadata
    meta_data = pd.read_csv(meta_data_path)
    
    # Get all files
    files = os.listdir(data_dir)
    
    # Initialize dictionaries to store processed data
    processed_data = {}
    plants_info = {}
    
    # Define column types
    time_cols = ['date', 'time', 'sunrise', 'sunset']
    
    # Weather/environment columns (numerical)
    weather_numeric_cols = [
        'temperature', 'humidity', 'ws', 'pv', 'pa', 'ps', 
        'ss', 'dc10Tca', 'dc10LmcsCa', 'vs', 'ts'
    ]
    
    # Circular features (need sin/cos transformation)
    circular_cols = ['wd']
    
    # Weather columns with special imputation (zero)
    weather_zero_impute_cols = ['rn', 'icsr']
    
    # Categorical weather columns
    categorical_cols = ['lcsCh', '미세먼지', '초미세먼지']
    
    # Air quality columns (numerical)
    air_numeric_cols = ['SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25']
    
    # Solar plant output columns 
    plant_output_cols = ['총량(kw)', '평균(kw)', '최대(kw)', '최소(kw)', '최대(시간별_kw)', '최소(시간별_kw)', 'value']
    
    # Target column
    target_col = 'value'
    
    # All numeric columns for imputation
    all_numeric_cols = weather_numeric_cols + air_numeric_cols + plant_output_cols
    
    # Analyze all files to collect plant information
    print("Collecting plant information...")
    for file_name in tqdm(files):
        plant_id = file_name.split('.')[0]  # Assuming filename contains plant ID
        file_path = os.path.join(data_dir, file_name)
        
        # Load data - exclude '호기' column from the beginning
        df = pd.read_parquet(file_path)
        if '호기' in df.columns:
            df = df.drop(columns=['호기'])
        
        # Extract date range information
        if 'date' in df.columns:
            min_date = df['date'].min()
            max_date = df['date'].max()
            date_range = f"{min_date} ~ {max_date}"
        else:
            date_range = "Unknown"
        
        # Store plant information
        plants_info[plant_id] = {
            'file_name': file_name,
            'date_range': date_range,
            'data_length': len(df),
            'columns': list(df.columns)
        }
    
    print(f"Found {len(plants_info)} plants")
    
    # Process each file
    print("Processing each plant's data...")
    for file_name in tqdm(files):
        plant_id = file_name.split('.')[0]
        file_path = os.path.join(data_dir, file_name)
        
        # Load data - exclude '호기' column
        df = pd.read_parquet(file_path)
        if '호기' in df.columns:
            df = df.drop(columns=['호기'])
        
        # Process datetime columns - 시간 관련 특성 생성
        df = process_time_features(df, time_cols)
        
        # Process special weather features (wd, etc.)
        df = process_weather_features(df, circular_cols)
        
        # Handle missing values according to the provided strategy
        df = handle_missing_values(
            df, 
            numeric_cols=all_numeric_cols,
            zero_impute_cols=weather_zero_impute_cols,
            categorical_cols=categorical_cols,
            circular_cols=['wd_sin', 'wd_cos']
        )
        
        # Save to dictionary
        processed_data[plant_id] = df
    
    # Define feature columns
    if len(processed_data) > 0:
        sample_df = processed_data[list(processed_data.keys())[0]]
        
        # Target column is the hourly generation ('value')
        target_col = 'value'
        
        # Excluded columns (raw time columns and derived intermediate columns)
        excluded_cols = time_cols + ['datetime', 'hour', 'day_of_year', 'month', target_col]
        
        # All features except target and excluded time columns
        feature_cols = [col for col in sample_df.columns if col not in excluded_cols]
    else:
        feature_cols = []
    
    return processed_data, feature_cols, target_col, plants_info

def process_time_features(df, time_cols):
    """Process time-related columns and extract useful features"""
    
    # Convert date and time to datetime if they exist
    if 'date' in df.columns and 'time' in df.columns:
        # Ensure date and time are string type
        df['date'] = df['date'].astype(str)
        df['time'] = df['time'].astype(str)
        
        # Create datetime column
        df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], errors='coerce')
        df['hour'] = df['datetime'].dt.hour
        
        # Create cyclical time features
        df['time_hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
        df['time_hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
        
        # Day of year (for seasonal patterns)
        df['day_of_year'] = df['datetime'].dt.dayofyear
        df['time_day_sin'] = np.sin(2 * np.pi * df['day_of_year']/365)
        df['time_day_cos'] = np.cos(2 * np.pi * df['day_of_year']/365)
        
        # Month as a cyclical feature (for seasonal patterns)
        df['month'] = df['datetime'].dt.month
        df['time_month_sin'] = np.sin(2 * np.pi * df['month']/12)
        df['time_month_cos'] = np.cos(2 * np.pi * df['month']/12)
        
        # Create features for day/night based on sunrise/sunset
        if 'sunrise' in df.columns and 'sunset' in df.columns:
            try:
                # Convert sunrise/sunset to datetime
                df['sunrise'] = pd.to_datetime(df['date'] + ' ' + df['sunrise'], errors='coerce')
                df['sunset'] = pd.to_datetime(df['date'] + ' ' + df['sunset'], errors='coerce')
                
                # Is daylight (1 if current time is between sunrise and sunset)
                df['time_is_daylight'] = ((df['datetime'] >= df['sunrise']) & 
                                     (df['datetime'] <= df['sunset'])).astype(int)
                
                # Hours since sunrise and hours until sunset
                df['time_hours_since_sunrise'] = (df['datetime'] - df['sunrise']).dt.total_seconds() / 3600
                df['time_hours_until_sunset'] = (df['sunset'] - df['datetime']).dt.total_seconds() / 3600
                
                # Replace negative values with 0 (before sunrise or after sunset)
                df['time_hours_since_sunrise'] = df['time_hours_since_sunrise'].clip(lower=0)
                df['time_hours_until_sunset'] = df['time_hours_until_sunset'].clip(lower=0)
            except Exception as e:
                print(f"Error processing sunrise/sunset: {e}")
    
    return df

def process_weather_features(df, circular_cols):
    """특수한 날씨 피처들에 대한 전처리"""
    
    # 풍향(wd)의 순환적 특성 처리
    if 'wd' in df.columns and 'wd' in circular_cols:
        # 풍향이 0-360도 범위인지 확인
        max_wd = df['wd'].max()
        if pd.notna(max_wd):  # NaN 체크
            if max_wd <= 360:
                # 사인/코사인 변환
                df['wd_sin'] = np.sin(2 * np.pi * df['wd'] / 360)
                df['wd_cos'] = np.cos(2 * np.pi * df['wd'] / 360)
            # 만약 다른 범위라면 적절히 조정
            else:
                df['wd_sin'] = np.sin(2 * np.pi * df['wd'] / max_wd)
                df['wd_cos'] = np.cos(2 * np.pi * df['wd'] / max_wd)
    
    return df

def handle_missing_values(df, numeric_cols, zero_impute_cols, categorical_cols, circular_cols):
    """
    Handle missing values based on the provided strategy
    
    Strategy:
    1. For numeric weather features: Consider time pattern and use forward filling, 
       then anomaly imputation (median)
    2. For rn, icsr: Fill NaN with 0
    3. For categorical values: Forward fill, then mode imputation
    4. For circular features (wd_sin, wd_cos): Handle as numeric
    """
    
    # First identify all columns that exist in the dataframe
    existing_numeric_cols = [col for col in numeric_cols if col in df.columns]
    existing_zero_impute_cols = [col for col in zero_impute_cols if col in df.columns]
    existing_cat_cols = [col for col in categorical_cols if col in df.columns]
    existing_circular_cols = [col for col in circular_cols if col in df.columns]
    
    # Group data by hour to capture daily patterns (if datetime exists)
    if 'datetime' in df.columns and 'hour' in df.columns:
        # Handle numeric columns with time-based strategy
        for col in existing_numeric_cols + existing_circular_cols:
            # First try forward fill (for small gaps)
            df[col] = df.groupby(['hour'])[col].transform(lambda x: x.ffill())
            
            # For remaining NaNs, use median by hour
            hourly_medians = df.groupby(['hour'])[col].transform('median')
            df[col] = df[col].fillna(hourly_medians)
            
            # If still NaN, use overall median
            df[col] = df[col].fillna(df[col].median())
    else:
        # Fallback to simple median imputation
        for col in existing_numeric_cols + existing_circular_cols:
            df[col] = df[col].fillna(df[col].median())
    
    # Handle zero-imputation columns
    for col in existing_zero_impute_cols:
        df[col] = df[col].fillna(0)
    
    # Handle categorical columns
    for col in existing_cat_cols:
        # First try forward fill 
        df[col] = df[col].ffill()
        
        # For remaining NaNs, use mode
        if not df[col].mode().empty:
            df[col] = df[col].fillna(df[col].mode().iloc[0])
        else:
            df[col] = df[col].fillna('unknown')
    
    return df

def prepare_time_series_data(processed_data, feature_cols, target_col='value', window_size=1, prediction_size=1):
    """
    시계열 예측을 위한 슬라이딩 윈도우 데이터 준비
    
    Args:
        processed_data: 전처리된 데이터 딕셔너리
        feature_cols: 입력 피처 칼럼 목록
        target_col: 예측할 타겟 칼럼 ('value')
        window_size: 윈도우 크기(일) - 전날 데이터
        prediction_size: 예측 대상 크기(일) - 당일 데이터
        
    Returns:
        time_series_data: 시계열 데이터 딕셔너리 {plant_id: {'X': features, 'y': targets, 'dates': dates}}
    """
    time_series_data = {}
    
    # 각 발전소별 데이터 처리
    for plant_id, df in processed_data.items():
        if 'datetime' not in df.columns or target_col not in df.columns:
            continue
        
        # 날짜별로 데이터 정렬
        df = df.sort_values('datetime')
        
        # 날짜만 추출하여 고유한 날짜 목록 생성
        df['date_only'] = df['datetime'].dt.date
        unique_dates = df['date_only'].unique()
        
        if len(unique_dates) <= window_size + prediction_size:
            print(f"Plant {plant_id} has insufficient data: {len(unique_dates)} days")
            continue
        
        # 특성과 타겟 데이터 준비
        X_data = []
        y_data = []
        dates_data = []
        
        # 각 날짜별로 24시간 데이터 그룹화
        date_groups = {}
        for date in unique_dates:
            date_df = df[df['date_only'] == date]
            if len(date_df) == 24:  # 하루에 24시간 데이터가 모두 있는 경우만 사용
                date_groups[date] = date_df
        
        # 날짜 그룹을 시간순으로 정렬
        sorted_dates = sorted(date_groups.keys())
        
        # 슬라이딩 윈도우 방식으로 데이터 생성
        for i in range(len(sorted_dates) - window_size - prediction_size + 1):
            # 입력 윈도우 (전날 데이터)
            input_dates = sorted_dates[i:i+window_size]
            input_dfs = [date_groups[date] for date in input_dates]
            
            # 예측 대상 (당일 데이터)
            output_dates = sorted_dates[i+window_size:i+window_size+prediction_size]
            output_dfs = [date_groups[date] for date in output_dates]
            
            # 입력 특성 데이터 (전날의 모든 특성)
            X = pd.concat(input_dfs)[feature_cols].values
            
            # 예측 타겟 데이터 (당일의 발전량)
            y = pd.concat(output_dfs)[target_col].values
            
            # 기록용 날짜 데이터 (예측 대상 날짜)
            dates = pd.concat(output_dfs)['datetime'].values
            
            X_data.append(X)
            y_data.append(y)
            dates_data.append(dates)
        
        if X_data:
            time_series_data[plant_id] = {
                'X': X_data,
                'y': y_data,
                'dates': dates_data
            }
    
    return time_series_data

def split_time_series_data(time_series_data, plants_info, external_test_ratio=0.2, train_ratio=0.7, valid_ratio=0.15):
    """
    시계열 데이터를 학습/검증/테스트 세트로 분할
    
    전략:
    1. 일부 발전소(데이터가 적은)를 external_test_plants로 분리 - 모델 학습에 전혀 사용하지 않음
    2. 나머지 발전소 데이터는 시간 순서대로 train/valid/test로 분할
    
    Args:
        time_series_data: 시계열 데이터 딕셔너리
        plants_info: 각 발전소 정보를 담은 딕셔너리
        external_test_ratio: 외부 테스트용 발전소 비율
        train_ratio: 학습 데이터 비율 (external test를 제외한 나머지에서)
        valid_ratio: 검증 데이터 비율 (external test를 제외한 나머지에서)
        
    Returns:
        split_data: 분할된 데이터 딕셔너리 (train, valid, test, external_test)
    """
    split_data = {
        'train': {'X': [], 'y': [], 'dates': [], 'plants': []},
        'valid': {'X': [], 'y': [], 'dates': [], 'plants': []},
        'test': {'X': [], 'y': [], 'dates': [], 'plants': []},
        'external_test': {'X': [], 'y': [], 'dates': [], 'plants': []}
    }
    
    # 데이터가 있는 발전소만 필터링
    available_plants = [plant_id for plant_id in time_series_data if len(time_series_data[plant_id]['X']) >= 3]
    
    # 발전소를 데이터 길이에 따라 정렬 (오름차순 - 데이터가 적은 순)
    sorted_plants = sorted(
        available_plants, 
        key=lambda p: plants_info[p]['data_length'] if 'data_length' in plants_info[p] else 0
    )
    
    # 외부 테스트용 발전소 선택 (데이터가 적은 발전소들)
    n_external_test = max(1, int(len(sorted_plants) * external_test_ratio))
    external_test_plants = sorted_plants[:n_external_test]
    train_valid_test_plants = sorted_plants[n_external_test:]
    
    print(f"External test plants: {external_test_plants}")
    print(f"Regular split plants: {train_valid_test_plants}")
    
    # 외부 테스트 발전소 데이터 분리
    for plant_id in external_test_plants:
        if plant_id not in time_series_data:
            continue
            
        data = time_series_data[plant_id]
        split_data['external_test']['X'].extend(data['X'])
        split_data['external_test']['y'].extend(data['y'])
        split_data['external_test']['dates'].extend(data['dates'])
        split_data['external_test']['plants'].extend([plant_id] * len(data['X']))
    
    # 나머지 발전소 데이터를 시간 순서대로 분할
    for plant_id in train_valid_test_plants:
        if plant_id not in time_series_data:
            continue
            
        data = time_series_data[plant_id]
        n_samples = len(data['X'])
        if n_samples < 3:  # 최소 3개 이상의 샘플이 필요
            continue
            
        # 인덱스 계산
        train_idx = int(n_samples * train_ratio)
        valid_idx = int(n_samples * (train_ratio + valid_ratio))
        
        # 데이터 분할 - 시간 순서대로 분할
        split_data['train']['X'].extend(data['X'][:train_idx])
        split_data['train']['y'].extend(data['y'][:train_idx])
        split_data['train']['dates'].extend(data['dates'][:train_idx])
        split_data['train']['plants'].extend([plant_id] * train_idx)
        
        split_data['valid']['X'].extend(data['X'][train_idx:valid_idx])
        split_data['valid']['y'].extend(data['y'][train_idx:valid_idx])
        split_data['valid']['dates'].extend(data['dates'][train_idx:valid_idx])
        split_data['valid']['plants'].extend([plant_id] * (valid_idx - train_idx))
        
        split_data['test']['X'].extend(data['X'][valid_idx:])
        split_data['test']['y'].extend(data['y'][valid_idx:])
        split_data['test']['dates'].extend(data['dates'][valid_idx:])
        split_data['test']['plants'].extend([plant_id] * (n_samples - valid_idx))
    
    # 배열로 변환
    for split_name in split_data:
        if split_data[split_name]['X']:  # 비어있지 않은 경우만 처리
            split_data[split_name]['X'] = np.array(split_data[split_name]['X'])
            split_data[split_name]['y'] = np.array(split_data[split_name]['y'])
            split_data[split_name]['dates'] = np.array(split_data[split_name]['dates'])
            # plants는 리스트로 유지 (식별자이므로 변환 불필요)
    
    return split_data

def save_time_series_data(split_data, output_dir='../data/modeling_data'):
    """
    시계열 데이터 저장
    
    Args:
        split_data: 분할된 시계열 데이터 딕셔너리
        output_dir: 저장할 디렉토리 경로
    """
    os.makedirs(output_dir, exist_ok=True)
    
    for split_name, data in split_data.items():
        #if not data['X']:  # 빈 데이터 세트 건너뛰기
        #    continue
            
        split_dir = os.path.join(output_dir, split_name)
        os.makedirs(split_dir, exist_ok=True)
        
        # NumPy 배열로 저장
        np.save(os.path.join(split_dir, 'X.npy'), data['X'])
        np.save(os.path.join(split_dir, 'y.npy'), data['y'])
        np.save(os.path.join(split_dir, 'dates.npy'), data['dates'])
        np.save(os.path.join(split_dir, 'plants.npy'), np.array(data['plants']))


# Define paths
META_DATA_PATH = '../data/solar_energy/meta_data.csv'
data_dir = '../data/concat_data'
output_dir = '../data/modelnig_data'

# 윈도우 크기(일), 예측 대상 크기(일)
window_size = 1
prediction_size = 1

# 외부 테스트용 발전소 비율
external_test_ratio = 0.2

# Load and preprocess data
processed_data, feature_cols, target_col, plants_info = load_and_preprocess_data(META_DATA_PATH, data_dir)

# Print summary
print(f"Processed {len(processed_data)} plants")
print(f"Feature columns ({len(feature_cols)}): {feature_cols[:10]}...")
print(f"Target column: {target_col}")

# 시계열 데이터 준비 (전날 -> 당일 예측)
time_series_data = prepare_time_series_data(
    processed_data, 
    feature_cols, 
    target_col=target_col,
    window_size=window_size,
    prediction_size=prediction_size
)

print(f"Prepared time series data for {len(time_series_data)} plants")
total_samples = sum(len(data['X']) for plant_id, data in time_series_data.items())
print(f"Total samples: {total_samples}")

# 각 발전소별 샘플 수 출력
for plant_id, data in time_series_data.items():
    print(f"Plant {plant_id}: {len(data['X'])} samples")
    if len(data['X']) > 0:
        print(f"  X shape: {data['X'][0].shape}, y shape: {data['y'][0].shape}")

# 시계열 데이터 분할 (학습/검증/테스트 + 외부 테스트)
split_data = split_time_series_data(
    time_series_data, 
    plants_info,
    external_test_ratio=external_test_ratio
)

# 결과 요약 출력
for split_name, data in split_data.items():
    if len(data['X']) > 0:
        print(f"\n{split_name.upper()} SET:")
        print(f"  Samples: {len(data['X'])}")
        print(f"  X shape: {data['X'].shape}")
        print(f"  y shape: {data['y'].shape}")
        
        # 발전소 정보
        plant_counts = {}
        for plant_id in data['plants']:
            if plant_id in plant_counts:
                plant_counts[plant_id] += 1
            else:
                plant_counts[plant_id] = 1
        
        print(f"  Plants: {len(plant_counts)}")
        print(f"  Plant distribution: {plant_counts}")

# 시계열 데이터 저장
save_time_series_data(split_data, output_dir)
print(f"\nTime series data saved to {output_dir}")


Collecting plant information...


100%|██████████| 46/46 [00:00<00:00, 60.68it/s]


Found 46 plants
Processing each plant's data...


100%|██████████| 46/46 [00:05<00:00,  7.99it/s]


Processed 46 plants
Feature columns (40): ['총량(kw)', '평균(kw)', '최대(kw)', '최소(kw)', '최대(시간별_kw)', '최소(시간별_kw)', 'temperature', 'humidity', 'rn', 'ws']...
Target column: value
Plant 익산 다송리 has insufficient data: 0 days
Prepared time series data for 45 plants
Total samples: 36068
Plant 부산운동장: 1825 samples
  X shape: (24, 40), y shape: (24,)
Plant 화촌주민참여형: 517 samples
  X shape: (24, 40), y shape: (24,)
Plant 삼척소내_2: 882 samples
  X shape: (24, 40), y shape: (24,)
Plant 영월본부: 1460 samples
  X shape: (24, 40), y shape: (24,)
Plant 하동변전소: 517 samples
  X shape: (24, 40), y shape: (24,)
Plant 신인천 북측부지: 756 samples
  X shape: (24, 40), y shape: (24,)
Plant 삼척소내_3: 882 samples
  X shape: (24, 40), y shape: (24,)
Plant 와산리: 107 samples
  X shape: (24, 40), y shape: (24,)
Plant 하동본부_1: 517 samples
  X shape: (24, 40), y shape: (24,)
Plant 하동정수장: 517 samples
  X shape: (24, 40), y shape: (24,)
Plant 인천수산정수장: 882 samples
  X shape: (24, 40), y shape: (24,)
Plant 삼척소내_1: 882 samples
  X shape: (24, 

In [77]:
split_data['external_test']['X']

array([[[1320.215, 55.009, 389.059, ..., 18.683333333333334,
         0.9396926207859083, 0.3420201433256688],
        [1320.215, 55.009, 389.059, ..., 17.683333333333334, 1.0,
         6.123233995736766e-17],
        [1320.215, 55.009, 389.059, ..., 16.683333333333334, 1.0,
         6.123233995736766e-17],
        ...,
        [1320.215, 55.009, 389.059, ..., 0.0, 0.9396926207859084,
         -0.3420201433256687],
        [1320.215, 55.009, 389.059, ..., 0.0, 1.0,
         6.123233995736766e-17],
        [1320.215, 55.009, 389.059, ..., 0.0, 1.0,
         6.123233995736766e-17]],

       [[2438.64, 101.61, 445.68, ..., 18.666666666666668,
         0.9396926207859084, -0.3420201433256687],
        [2438.64, 101.61, 445.68, ..., 17.666666666666668,
         0.9396926207859084, -0.3420201433256687],
        [2438.64, 101.61, 445.68, ..., 16.666666666666668,
         0.9396926207859084, -0.3420201433256687],
        ...,
        [2438.64, 101.61, 445.68, ..., 0.0, -0.9396926207859085,
   

### 결측률 확인

- 결측률>50%인 rn (강수량), icsr (일사량)의 Nan은 그냥 0으로 두고 돌리기로 함 
- 나머지는 imputation

In [None]:
weather_columns = ['temperature', 'humidity', 'rn', 'ws',
       'wd', 'pv', 'pa', 'ps', 'ss', 'icsr', 'dc10Tca', 'dc10LmcsCa', 'lcsCh',
       'vs', 'ts', 'sunrise', 'sunset']
air_columns = ['SO2', 'CO', 'O3', 'NO2', 'PM10',
       'PM25', '미세먼지', '초미세먼지']

total_weathers,total_airs = [],[]
for f in files:
    file = pd.read_parquet(os.path.join(final_data_path, f))
    total_weathers.append(file[weather_columns+['date','time']])
    total_airs.append(file[air_columns+['date','time']])

total_weathers = pd.concat(total_weathers, ignore_index=True).drop_duplicates()
total_airs = pd.concat(total_airs, ignore_index=True)   


In [60]:
weather_columns = ['temperature', 'humidity', 'rn', 'ws',
       'wd', 'pv', 'pa', 'ps', 'ss', 'icsr', 'dc10Tca', 'dc10LmcsCa', 'lcsCh',
       'vs', 'ts', 'sunrise', 'sunset']
len(total_weathers['wd'].value_counts())

17

In [50]:
# column별 결측률 확인 (rounded to 1 decimal place)
print("Weather data missing rates:")
missing_rates = (total_weathers.isna().mean() * 100).round(1)
print(missing_rates)

# 결측률>50%인 column print
print("결측률 >50%인 column:")
high_missing_cols = missing_rates[missing_rates > 50].index.tolist()
print(high_missing_cols)

print("\nAir data missing rates:")
missing_rates = (total_airs.isna().mean() * 100).round(1)
print(missing_rates)

# 결측률>50%인 column print
print("결측률 >50%인 column:")
print(missing_rates[missing_rates > 50].index.tolist())

Weather data missing rates:
temperature     0.0
humidity        0.0
rn             90.9
ws              0.1
wd              0.1
pv              0.0
pa              0.0
ps              0.0
ss             43.2
icsr           68.2
dc10Tca         3.9
dc10LmcsCa      2.0
lcsCh          48.9
vs              0.4
ts              0.0
sunrise         0.0
sunset          0.0
date            0.0
time            0.0
dtype: float64
결측률 >50%인 column:
['rn', 'icsr']

Air data missing rates:
SO2      3.8
CO       3.7
O3       3.4
NO2      4.0
PM10     5.6
PM25     6.1
미세먼지     0.0
초미세먼지    6.1
date     0.0
time     0.0
dtype: float64
결측률 >50%인 column:
[]
