In [38]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [42]:
# Load the raw training data. '?' entries are read as NaN and the 'DateTime'
# column is parsed as dates. `infer_datetime_format` is intentionally omitted:
# it is deprecated since pandas 2.0, where format inference is the default.
df = pd.read_csv("train.csv", skipinitialspace=True, sep=',', na_values='?',
                 parse_dates=['DateTime'], low_memory=False)

# Re-parse defensively so that any value read_csv could not parse becomes NaT
# instead of staying a string.
df['DateTime'] = pd.to_datetime(df['DateTime'], errors='coerce')
df = df.set_index('DateTime')  # use the timestamp as the index

# Cast the numeric measurement columns to float32.
cols_to_float = ['Global_active_power', 'Global_reactive_power', 'Voltage',
                 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']
df[cols_to_float] = df[cols_to_float].astype('float32')
print(df.columns.tolist())

# Weather columns: anything that is not a number is turned into NaN.
weather_cols = ['RR','NBJRR1','NBJRR5','NBJRR10','NBJBROU']
df[weather_cols] = df[weather_cols].apply(pd.to_numeric, errors='coerce')


  df = pd.read_csv("train.csv", skipinitialspace=True, sep=',', na_values='?',


['Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3', 'RR', 'NBJRR1', 'NBJRR5', 'NBJRR10', 'NBJBROU']


In [40]:
def fill_missing_values(df, max_missing_per_day=720):
    """Return a copy of ``df`` with missing values filled day by day.

    Every column is processed independently, one calendar day at a time:

    * If a day has at least ``max_missing_per_day`` missing entries, the
      missing entries are replaced by the mean of all non-missing values of
      that column recorded on the same month/day anywhere in ``df`` (0 when
      no such value exists). Observed readings of that day are kept.
    * Otherwise the gaps are filled by linear interpolation restricted to
      that day. Gaps at the start or end of the day take the nearest valid
      reading of the same day. A day without any valid reading that stays
      below the threshold is left as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by a ``DatetimeIndex``. It is not modified.
    max_missing_per_day : int, default 720
        Number of missing entries in one day from which the calendar-day mean
        is used instead of interpolation.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.
    """
    filled = df.copy()

    # Loop-invariant views of the index, computed once.
    day_of_row = df.index.normalize()
    row_month = df.index.month
    row_day = df.index.day

    for col in df.columns:
        series = df[col]

        for date, day_values in series.groupby(day_of_row):
            missing_count = int(day_values.isna().sum())
            if missing_count == 0:
                continue  # nothing to repair on this day

            if missing_count >= max_missing_per_day:
                # Too sparse to interpolate: use the mean of the same calendar
                # day (any year), but only for the rows that are missing.
                same_day_mask = (row_month == date.month) & (row_day == date.day)
                reference_values = series[same_day_mask].dropna()
                day_mean = reference_values.mean() if not reference_values.empty else 0
                repaired = day_values.fillna(day_mean)
            else:
                # Linear interpolation within the day; 'both' also fills gaps
                # that sit before the first valid reading of the day.
                repaired = day_values.interpolate(method='linear', limit_direction='both')

            filled.loc[day_values.index, col] = repaired

    return filled

# Rebind `df` to the frame returned by fill_missing_values; later cells read
# this filled frame rather than the raw load.
df = fill_missing_values(df)

In [43]:
# Remainder = scaled global active power minus the three sub-meter readings.
# NOTE(review): the *1000/60 factor presumably converts a per-minute kW reading
# to Wh so it is comparable with the sub-meters - confirm against the data spec.
scaled_active_power = df['Global_active_power'] * 1000 / 60
sub_metered_total = df['Sub_metering_1'] + df['Sub_metering_2'] + df['Sub_metering_3']
df['sub_metering_remainder'] = scaled_active_power - sub_metered_total

# From here on every column is addressed by its lower-case name.
df.columns = df.columns.str.lower()

# One resampler shared by all columns; the table gives the daily reduction
# applied to each column, listed in the output column order.
by_day = df.resample('D')
daily_rules = {
    'global_active_power': 'sum',
    'global_reactive_power': 'sum',
    'sub_metering_1': 'sum',
    'sub_metering_2': 'sum',
    'sub_metering_3': 'sum',
    'sub_metering_remainder': 'sum',
    'voltage': 'mean',
    'global_intensity': 'mean',
    'rr': 'first',
    'nbjrr1': 'first',
    'nbjrr5': 'first',
    'nbjrr10': 'first',
    'nbjbrou': 'first',
}

daily_df = pd.DataFrame()
for column, how in daily_rules.items():
    daily_df[column] = getattr(by_day[column], how)()

# The first 'rr' value of each day is divided by 10.
daily_df['rr'] = daily_df['rr'] / 10.0

In [44]:
# Write the daily training aggregates to "Processed_train.csv" in the working
# directory (to_csv defaults: the index is written as the first column).
daily_df.to_csv("Processed_train.csv")

In [49]:
# The file is read with header=None, so the column names are supplied here.
column_names = [
    "DateTime", "Global_active_power", "Global_reactive_power", "Voltage",
    "Global_intensity", "Sub_metering_1", "Sub_metering_2", "Sub_metering_3",
    "RR", "NBJRR1", "NBJRR5", "NBJRR10", "NBJBROU"
]
test_df = pd.read_csv("test.csv", header=None, names=column_names, na_values='?', skipinitialspace=True)

# Unparseable timestamps become NaT; the timestamp is then used as the index.
test_df['DateTime'] = pd.to_datetime(test_df['DateTime'], errors='coerce')
test_df = test_df.set_index('DateTime')

test_df.columns = test_df.columns.str.strip().str.lower()

# Coerce the weather columns BEFORE the float32 cast: a non-numeric token must
# become NaN here, otherwise astype('float32') raises on it and the coercion
# never gets a chance to run.
weather_cols = ['rr', 'nbjrr1', 'nbjrr5', 'nbjrr10', 'nbjbrou']
test_df[weather_cols] = test_df[weather_cols].apply(pd.to_numeric, errors='coerce')
test_df = test_df.astype('float32')

# NOTE(review): unlike the training frame, no fill_missing_values pass is
# applied to test_df in this cell - confirm that this is intended.

In [50]:

# Remainder = scaled global active power minus the three sub-meter readings.
# NOTE(review): the *1000/60 factor presumably converts a per-minute kW reading
# to Wh so it is comparable with the sub-meters - confirm against the data spec.
test_scaled_power = test_df['global_active_power'] * 1000 / 60
test_sub_metered = test_df['sub_metering_1'] + test_df['sub_metering_2'] + test_df['sub_metering_3']
test_df['sub_metering_remainder'] = test_scaled_power - test_sub_metered

# Daily reductions, grouped by kind and listed in the output column order.
test_by_day = test_df.resample('D')
test_daily_df = pd.DataFrame()

# Daily totals.
for column in ('global_active_power', 'global_reactive_power',
               'sub_metering_1', 'sub_metering_2', 'sub_metering_3',
               'sub_metering_remainder'):
    test_daily_df[column] = test_by_day[column].sum()

# Daily averages.
for column in ('voltage', 'global_intensity'):
    test_daily_df[column] = test_by_day[column].mean()

# First value of each day; 'rr' is additionally divided by 10.
test_daily_df['rr'] = test_by_day['rr'].first() / 10.0
for column in ('nbjrr1', 'nbjrr5', 'nbjrr10', 'nbjbrou'):
    test_daily_df[column] = test_by_day[column].first()


In [51]:
# Write the daily test aggregates to "Processed_test.csv" in the working
# directory (to_csv defaults: the index is written as the first column).
test_daily_df.to_csv("Processed_test.csv")