In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [2]:
# Load the raw readings: the file is ';'-delimited, and the literal
# strings 'nan' and '?' are both treated as missing values.
data = pd.read_csv(
    'household_power_consumption.txt',
    sep=';',
    na_values=['nan', '?'],
    low_memory=False,
)

FileNotFoundError: [Errno 2] No such file or directory: 'household_power_consumption.txt'

In [None]:
# Replace the separate 'Date' and 'Time' text columns with a single
# DatetimeIndex named 'datetime' (timestamps are parsed as day/month/year).
#
# The guard keeps this cell safe to re-run: after the first run the
# 'Date'/'Time' columns no longer exist, so repeating the conversion
# unconditionally would raise KeyError.
if {'Date', 'Time'}.issubset(data.columns):
    data = (
        data
        .assign(
            datetime=pd.to_datetime(
                data['Date'] + ' ' + data['Time'],
                format='%d/%m/%Y %H:%M:%S',
            )
        )
        # The raw text columns are redundant once the timestamp exists
        .drop(columns=['Date', 'Time'])
        .set_index('datetime')
    )
elif data.index.name != 'datetime':
    # Neither the raw columns nor the converted index are present:
    # fail loudly instead of silently continuing with an unexpected frame.
    raise KeyError(
        "expected 'Date' and 'Time' columns, or an index already named 'datetime'"
    )

# Report how many missing values each column contains
print(data.isna().sum())

In [None]:
# Fill each gap with the most recent earlier value in the same column
data = data.ffill()

# Columns carried into the daily aggregate
features = [
    'Global_active_power',
    'Global_reactive_power',
    'Voltage',
    'Global_intensity',
    'Sub_metering_1',
    'Sub_metering_2',
    'Sub_metering_3',
]

# Collapse the readings to one row per calendar day (mean of each column)
daily_data = data.loc[:, features].resample('D').mean()

# Preview the first rows of the daily frame
print(daily_data.head())

# Positional 80/20 split: the first 80% of rows are used for fitting,
# the remaining rows are held out for evaluation.
train_size = int(0.8 * len(daily_data))
train = daily_data.iloc[:train_size]
test = daily_data.iloc[train_size:]

# ARIMA with order (p=5, d=1, q=0) on the training 'Global_active_power' series
model = ARIMA(endog=train['Global_active_power'], order=(5, 1, 0))
model_fit = model.fit()

# Forecast as many steps ahead as there are held-out rows
predictions = model_fit.forecast(steps=test.shape[0])

In [None]:
# Overlay the held-out actual series and the ARIMA forecast on one axis
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(test.index, test['Global_active_power'], label='Actual')
ax.plot(test.index, predictions, color='red', label='Predicted')
ax.set_title('ARIMA Model Predictions')
ax.set_xlabel('Date')
ax.set_ylabel('Global Active Power (kilowatts)')
ax.legend()
plt.show()

In [None]:
# Score the forecast against the held-out 'Global_active_power' values
mae = mean_absolute_error(
    y_true=test['Global_active_power'],
    y_pred=predictions,
)
# RMSE is the square root of the mean squared error
rmse = np.sqrt(
    mean_squared_error(
        y_true=test['Global_active_power'],
        y_pred=predictions,
    )
)

print(f'MAE: {mae}')
print(f'RMSE: {rmse}')
