In [2]:
import pandas as pd
import numpy as np


In [3]:
# Import the Dataset
# Path of the raw outage CSV, relative to the notebook's working directory.
file_path = 'Power_outage_dataset.csv'

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please check the file path.")
    # Re-raise instead of calling exit(): inside a Jupyter kernel exit() requests
    # a kernel shutdown, whereas an exception simply stops execution at this cell
    # and keeps the traceback visible.
    raise
else:
    # Only report success when read_csv actually returned a frame.
    print("Dataset imported successfully!")


Dataset imported successfully!


In [6]:
# Initial Data Inspection
# Show the column dtypes / non-null counts and a short preview of the frame.
print("\nInitial Dataset Info")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nFirst 5 rows of the dataset:")
# head() defaults to 5 rows, matching the label printed above.
print(df.head())

# Clean and Standardize Column Names
# Trim surrounding whitespace, lowercase, turn spaces into underscores and drop
# parentheses so columns can be addressed with plain snake_case names.
# regex=False makes every replacement literal: '(' and ')' are regex
# metacharacters and the default of the `regex` argument differs between
# pandas releases, so being explicit keeps the result version-independent.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
print("\nStandardized columns:")
print(df.columns)

# Combine Date and Time Columns and Convert to Datetime
# Each timestamp is built by joining the date and time strings with a space and
# parsing the result. errors='coerce' turns unparseable values into NaT instead
# of raising.
# Note: the four source columns are dropped at the end of this step, so running
# it again on the same `df` raises KeyError; reload the dataset first.
try:
    df['start_datetime'] = pd.to_datetime(
        df['date_event_began'] + ' ' + df['time_event_began'], errors='coerce'
    )
    df['end_datetime'] = pd.to_datetime(
        df['date_of_restoration'] + ' ' + df['time_of_restoration'], errors='coerce'
    )
except KeyError as e:
    print(f"Error: One of the expected date/time columns was not found: {e}")
    print("Please check the column names in your CSV file and adjust the code.")
    # Re-raise rather than exit(): exit() shuts the Jupyter kernel down, while an
    # exception stops the run at this cell and keeps the traceback. Any other
    # unexpected error is likewise left to propagate with its full traceback.
    raise

# Make the effect of errors='coerce' visible instead of losing rows silently later.
unparsed_counts = df[['start_datetime', 'end_datetime']].isna().sum()
print(
    f"\nUnparseable timestamps (NaT): start={unparsed_counts['start_datetime']}, "
    f"end={unparsed_counts['end_datetime']}"
)

df = df.drop(columns=['date_event_began', 'time_event_began', 'date_of_restoration', 'time_of_restoration'])

print("\nDates combined and converted to datetime objects.")
print(df[['start_datetime', 'end_datetime']].head())

# Calculate Outage Duration
# Keep only rows whose restoration time is strictly after the start time.
# Comparisons involving NaT evaluate to False, so rows with a missing start or
# end timestamp are removed by this filter as well. The number of discarded
# rows is reported so the data loss is not silent.
rows_before_filter = len(df)
df = df[df['end_datetime'] > df['start_datetime']].copy()
print(f"\nDropped {rows_before_filter - len(df)} rows with missing or non-positive outage duration.")

# Duration in minutes: timedelta -> seconds -> minutes.
df['outage_duration_minutes'] = (df['end_datetime'] - df['start_datetime']).dt.total_seconds() / 60
print("\n'outage_duration_minutes' column created.")
print(df['outage_duration_minutes'].describe())

# Handle Missing Values
# Replace missing customer counts with 0; values that are present are untouched.
# NOTE(review): the captured df.info() output lists this column as object dtype,
# so it presumably still holds non-numeric text alongside the filled 0 - confirm
# whether a numeric conversion is expected before aggregating on it.
customers_col = 'number_of_customers_affected'
df[customers_col] = df[customers_col].fillna(0)
print("\nFilled missing values in 'number_of_customers_affected' with 0.")

# Save the Cleaned Dataset
# Write the processed frame to CSV without the index column, then print the
# completion banner and the final (rows, columns) shape.
cleaned_csv_path = 'cleaned_power_outage_data.csv'
df.to_csv(cleaned_csv_path, index=False)
print(
    "\nCleaned data saved to 'cleaned_power_outage_data.csv'.",
    "\n--- Preprocessing Complete ---",
    f"Final dataset shape: {df.shape}",
    sep="\n",
)


Initial Dataset Info
<class 'pandas.core.frame.DataFrame'>
Index: 1504 entries, 0 to 1651
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   event_description             1504 non-null   object        
 1   year                          1504 non-null   int64         
 2   respondent                    1504 non-null   object        
 3   geographic_areas              1504 non-null   object        
 4   nerc_region                   1502 non-null   object        
 5   demand_loss_mw                1134 non-null   object        
 6   number_of_customers_affected  1504 non-null   object        
 7   tags                          1503 non-null   object        
 8   start_datetime                1504 non-null   datetime64[ns]
 9   end_datetime                  1504 non-null   datetime64[ns]
 10  outage_duration_minutes       1504 non-null   float64       
dtypes: datetime64