In [1]:
import pandas as pd
import numpy as np

# --- 1. Import the Dataset ---
# Assuming your dataset is a CSV file named 'power_outage_data.csv'
# Make sure this file is in the same directory as your script.

file_path = 'Power_outage_dataset.csv'

try:
    df = pd.read_csv(file_path)
    print("Dataset imported successfully!")
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please check the file path.")
    # Exit the script or handle the error gracefully
    exit()

# --- 2. Initial Data Inspection ---
print("\n--- Initial Dataset Info ---")
print(df.info())
print("\nFirst 5 rows of the dataset:")
print(df.head())

# --- 3. Clean and Standardize Column Names ---
# This makes your code cleaner and avoids issues with spaces.
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
print("\nStandardized columns:")
print(df.columns)

# --- 4. Combine Date and Time Columns and Convert to Datetime ---
# This is a critical step for a time-series analysis.
# Assuming the columns from your image are named 'date_event_began' etc.

try:
    # Combine date and time columns into a single datetime object
    df['start_datetime'] = pd.to_datetime(df['date_event_began'] + ' ' + df['time_event_began'])
    df['end_datetime'] = pd.to_datetime(df['date_of_restoration'] + ' ' + df['time_of_restoration'])
    
    # Drop the old, now redundant columns
    df = df.drop(columns=['date_event_began', 'time_event_began', 'date_of_restoration', 'time_of_restoration'])

    print("\nDates combined and converted to datetime objects.")
    print(df[['start_datetime', 'end_datetime']].head())

except KeyError as e:
    print(f"Error: One of the expected date/time columns was not found: {e}")
    print("Please check the column names in your CSV file and adjust the code.")
    exit()
except Exception as e:
    print(f"An unexpected error occurred during datetime conversion: {e}")
    exit()

# --- 5. Calculate Outage Duration (Your Target Variable) ---
# Calculate the duration in minutes for convenience.

# Filter out rows where end_datetime is before start_datetime, as this is a data error
df = df[df['end_datetime'] > df['start_datetime']].copy()

df['outage_duration_minutes'] = (df['end_datetime'] - df['start_datetime']).dt.total_seconds() / 60
print("\n'outage_duration_minutes' column created.")
print(df['outage_duration_minutes'].describe())

# --- 6. Handle Missing Values (Example for customers_affected) ---
# Let's say we have missing values in 'number_of_customers_affected'.
# You can fill them with 0, as it's a reasonable assumption for a missing count.
df['number_of_customers_affected'] = df['number_of_customers_affected'].fillna(0)
print("\nFilled missing values in 'number_of_customers_affected' with 0.")

# --- 7. Save the Cleaned Dataset ---
# You might want to save the preprocessed data to a new file to avoid re-running the above code.
df.to_csv('cleaned_power_outage_data.csv', index=False)
print("\nCleaned data saved to 'cleaned_power_outage_data.csv'.")
print("\n--- Preprocessing Complete ---")
print(f"Final dataset shape: {df.shape}")

Dataset imported successfully!

--- Initial Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Event Description             1652 non-null   object
 1   Year                          1652 non-null   int64 
 2   Date Event Began              1652 non-null   object
 3   Time Event Began              1643 non-null   object
 4   Date of Restoration           1638 non-null   object
 5   Time of Restoration           1632 non-null   object
 6   Respondent                    1652 non-null   object
 7   Geographic Areas              1651 non-null   object
 8   NERC Region                   1650 non-null   object
 9   Demand Loss (MW)              1246 non-null   object
 10  Number of Customers Affected  1434 non-null   object
 11  Tags                          1651 non-null   object
dtypes: int64(1), ob

  df['start_datetime'] = pd.to_datetime(df['date_event_began'] + ' ' + df['time_event_began'])


KeyError: 'end_datetime'