In [2]:
import pandas as pd
import numpy as np
import warnings

# [Previous loading code remains the same]
# ...
try:
    df_noaa = pd.read_csv(noaa_file_path, low_memory=False)
    print("NOAA data loaded successfully.")
except FileNotFoundError:
    print(f"Error: File not found at {noaa_file_path}")
    exit()
except Exception as e:
    print(f"Error loading NOAA data: {e}")
    exit()

print("Performing initial cleaning and type conversions...")

# --- Datetime Conversion ---
noaa_datetime_format = '%d-%b-%y %H:%M:%S'
df_noaa['BEGIN_DT'] = pd.to_datetime(df_noaa['BEGIN_DATE_TIME'], format=noaa_datetime_format, errors='coerce')
df_noaa['END_DT'] = pd.to_datetime(df_noaa['END_DATE_TIME'], format=noaa_datetime_format, errors='coerce')

begin_nat_count = df_noaa['BEGIN_DT'].isna().sum()
end_nat_count = df_noaa['END_DT'].isna().sum()
# ... (rest of initial datetime parsing checks) ...

# --- *** INSPECT TIMEZONES *** ---
print("\n--- Unique Timezones Found in CZ_TIMEZONE ---")
unique_timezones = df_noaa['CZ_TIMEZONE'].unique()
print(unique_timezones)
print(f"Number of unique timezone entries: {len(unique_timezones)}")
# --- *** END INSPECTION *** ---


NOAA data loaded successfully.
Performing initial cleaning and type conversions...

--- Unique Timezones Found in CZ_TIMEZONE ---
['EST-5' 'CST-6' 'PST-8' 'MST-7' 'HST-10' 'AKST-9' 'AST-4' 'GST10'
 'SST-11' 'PDT-7' 'CDT-5' 'EDT-4']
Number of unique timezone entries: 12


In [6]:
import pandas as pd
import numpy as np
import warnings

# Define the path to the NOAA data file
noaa_file_path = 'data/NOAA_StormEvents/StormEvents_2014_2024.csv'

# Define columns that are likely numeric but might have issues during load
numeric_cols_to_check = [
    'BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME', 'END_YEARMONTH', 'END_DAY', 'END_TIME',
    'EPISODE_ID', 'EVENT_ID', 'STATE_FIPS', 'CZ_FIPS',
    'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT',
    'MAGNITUDE', 'TOR_LENGTH', 'TOR_WIDTH',
    'BEGIN_RANGE', 'END_RANGE', 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON'
]

# Define the expected datetime format
# Using errors='coerce' will turn unparseable dates into NaT (Not a Time)
noaa_datetime_format = '%d-%b-%y %H:%M:%S'

print(f"Loading NOAA data from: {noaa_file_path}")

# --- Load the data ---
# Consider using low_memory=False if dtype warnings appear, or specify dtypes more precisely
# For very large files, consider chunking or libraries like Dask/Polars
try:
    df_noaa = pd.read_csv(noaa_file_path, low_memory=False)
    print("NOAA data loaded successfully.")
except FileNotFoundError:
    print(f"Error: File not found at {noaa_file_path}")
    # Exit or handle error appropriately
    exit()
except Exception as e:
    print(f"Error loading NOAA data: {e}")
    exit()

print("Performing initial cleaning and type conversions...")

# --- Datetime Conversion ---
# Convert BEGIN_DATE_TIME and END_DATE_TIME
# errors='coerce' handles unparseable formats by setting them to NaT
df_noaa['BEGIN_DT'] = pd.to_datetime(df_noaa['BEGIN_DATE_TIME'], format=noaa_datetime_format, errors='coerce')
df_noaa['END_DT'] = pd.to_datetime(df_noaa['END_DATE_TIME'], format=noaa_datetime_format, errors='coerce')

# Check for parsing errors (NaT values)
begin_nat_count = df_noaa['BEGIN_DT'].isna().sum()
end_nat_count = df_noaa['END_DT'].isna().sum()
if begin_nat_count > 0 or end_nat_count > 0:
    print(f"Warning: Found {begin_nat_count} NaT values in BEGIN_DT after parsing.")
    print(f"Warning: Found {end_nat_count} NaT values in END_DT after parsing.")
    # Consider dropping or investigating rows with NaT datetimes if they are critical
    # df_noaa.dropna(subset=['BEGIN_DT', 'END_DT'], inplace=True)


# --- Timezone Handling ---
# Map common timezone abbreviations to standard Olson names usable by pandas
# This might need expansion based on unique values in CZ_TIMEZONE
tz_map = {
    # Standard US Timezones (using Olson names that handle DST)
    'EST-5': 'America/New_York',    # Eastern Time
    'EDT-4': 'America/New_York',    # Eastern Time (Daylight)
    'CST-6': 'America/Chicago',     # Central Time
    'CDT-5': 'America/Chicago',     # Central Time (Daylight)
    'MST-7': 'America/Denver',      # Mountain Time (most areas)
    'MDT-6': 'America/Denver',      # Mountain Time (most areas - Daylight) - Added MDT just in case although not in list
    'PST-8': 'America/Los_Angeles', # Pacific Time
    'PDT-7': 'America/Los_Angeles', # Pacific Time (Daylight)
    'AKST-9': 'America/Anchorage',  # Alaska Time
    'AKDT-8': 'America/Anchorage',  # Alaska Time (Daylight) - Added AKDT just in case
    'HST-10': 'Pacific/Honolulu',   # Hawaii Standard Time (no DST)

    # Atlantic & Territories
    'AST-4': 'America/Puerto_Rico', # Atlantic Standard Time (no DST in PR)
    'GST10': 'Pacific/Guam',        # Guam Standard Time (UTC+10)
    'SST-11': 'Pacific/Pago_Pago',   # Samoa Standard Time (UTC-11)

    # Add mappings for any potential NaN or empty strings if they exist
    '': None, # Map empty string explicitly if needed
    # np.nan: None # pd.isna() check in function should handle actual NaN objects
}

# Function to apply timezone localization
def localize_datetime(row):
    tz_str = row['CZ_TIMEZONE']
    dt = row['datetime_col']
    if pd.isna(dt) or pd.isna(tz_str):
        return pd.NaT

    tz_name = tz_map.get(tz_str)
    if tz_name:
        try:
            # Localize the naive datetime
            return dt.tz_localize(tz_name, ambiguous='NaT', nonexistent='NaT')
        except Exception as e:
            # Log warning for specific row/error if needed
            # warnings.warn(f"Could not localize timezone '{tz_str}' for datetime {dt}: {e}")
            return pd.NaT # Failed to localize
    else:
        # Log warning for unmapped timezone if needed
        # warnings.warn(f"Timezone '{tz_str}' not found in tz_map.")
        return pd.NaT # Timezone not in map

# Apply localization - requires iterating or a more complex apply
# Create temporary column for the function
df_noaa['datetime_col'] = df_noaa['BEGIN_DT']
df_noaa['BEGIN_DT_LOC'] = df_noaa.apply(localize_datetime, axis=1)

df_noaa['datetime_col'] = df_noaa['END_DT']
df_noaa['END_DT_LOC'] = df_noaa.apply(localize_datetime, axis=1)

unique_dt = df_noaa['BEGIN_DT_LOC'].unique()

print(unique_dt)

Loading NOAA data from: data/NOAA_StormEvents/StormEvents_2014_2024.csv
NOAA data loaded successfully.
Performing initial cleaning and type conversions...
[Timestamp('2014-02-18 10:00:00-0500', tz='America/New_York')
 Timestamp('2014-03-30 08:31:00-0400', tz='America/New_York')
 Timestamp('2014-04-27 23:06:00-0500', tz='America/Chicago') ...
 Timestamp('2024-05-09 12:53:00-0500', tz='America/Chicago')
 Timestamp('2024-05-22 18:09:00-0400', tz='America/New_York')
 Timestamp('2024-08-06 07:52:00-0400', tz='America/New_York')]


In [8]:
df_noaa['BEGIN_DT_LOC']

0         2014-02-18 10:00:00-05:00
1         2014-03-30 08:31:00-04:00
2         2014-04-27 23:06:00-05:00
3         2014-04-27 23:03:00-05:00
4         2014-02-15 13:00:00-08:00
                    ...            
691429    2024-05-26 11:48:00-04:00
691430    2024-05-22 18:09:00-04:00
691431    2024-05-22 17:57:00-04:00
691432    2024-06-23 17:45:00-04:00
691433    2024-08-06 07:52:00-04:00
Name: BEGIN_DT_LOC, Length: 691434, dtype: object

In [4]:
import pandas as pd
import numpy as np
import warnings

# Define the path to the NOAA data file
noaa_file_path = 'data/NOAA_StormEvents/StormEvents_2014_2024.csv'

# Define columns that are likely numeric but might have issues during load
numeric_cols_to_check = [
    'BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME', 'END_YEARMONTH', 'END_DAY', 'END_TIME',
    'EPISODE_ID', 'EVENT_ID', 'STATE_FIPS', 'CZ_FIPS',
    'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT',
    'MAGNITUDE', 'TOR_LENGTH', 'TOR_WIDTH',
    'BEGIN_RANGE', 'END_RANGE', 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON'
]

# Define the expected datetime format
# Using errors='coerce' will turn unparseable dates into NaT (Not a Time)
noaa_datetime_format = '%d-%b-%y %H:%M:%S'

print(f"Loading NOAA data from: {noaa_file_path}")

# --- Load the data ---
# Consider using low_memory=False if dtype warnings appear, or specify dtypes more precisely
# For very large files, consider chunking or libraries like Dask/Polars
try:
    df_noaa = pd.read_csv(noaa_file_path, low_memory=False)
    print("NOAA data loaded successfully.")
except FileNotFoundError:
    print(f"Error: File not found at {noaa_file_path}")
    # Exit or handle error appropriately
    exit()
except Exception as e:
    print(f"Error loading NOAA data: {e}")
    exit()

print("Performing initial cleaning and type conversions...")

# --- Datetime Conversion ---
# Convert BEGIN_DATE_TIME and END_DATE_TIME
# errors='coerce' handles unparseable formats by setting them to NaT
df_noaa['BEGIN_DT'] = pd.to_datetime(df_noaa['BEGIN_DATE_TIME'], format=noaa_datetime_format, errors='coerce')
df_noaa['END_DT'] = pd.to_datetime(df_noaa['END_DATE_TIME'], format=noaa_datetime_format, errors='coerce')

# Check for parsing errors (NaT values)
begin_nat_count = df_noaa['BEGIN_DT'].isna().sum()
end_nat_count = df_noaa['END_DT'].isna().sum()
if begin_nat_count > 0 or end_nat_count > 0:
    print(f"Warning: Found {begin_nat_count} NaT values in BEGIN_DT after parsing.")
    print(f"Warning: Found {end_nat_count} NaT values in END_DT after parsing.")
    # Consider dropping or investigating rows with NaT datetimes if they are critical
    # df_noaa.dropna(subset=['BEGIN_DT', 'END_DT'], inplace=True)


# --- Timezone Handling ---
# Map common timezone abbreviations to standard Olson names usable by pandas
# This might need expansion based on unique values in CZ_TIMEZONE
tz_map = {
    # Standard US Timezones (using Olson names that handle DST)
    'EST-5': 'America/New_York',    # Eastern Time
    'EDT-4': 'America/New_York',    # Eastern Time (Daylight)
    'CST-6': 'America/Chicago',     # Central Time
    'CDT-5': 'America/Chicago',     # Central Time (Daylight)
    'MST-7': 'America/Denver',      # Mountain Time (most areas)
    'MDT-6': 'America/Denver',      # Mountain Time (most areas - Daylight) - Added MDT just in case although not in list
    'PST-8': 'America/Los_Angeles', # Pacific Time
    'PDT-7': 'America/Los_Angeles', # Pacific Time (Daylight)
    'AKST-9': 'America/Anchorage',  # Alaska Time
    'AKDT-8': 'America/Anchorage',  # Alaska Time (Daylight) - Added AKDT just in case
    'HST-10': 'Pacific/Honolulu',   # Hawaii Standard Time (no DST)

    # Atlantic & Territories
    'AST-4': 'America/Puerto_Rico', # Atlantic Standard Time (no DST in PR)
    'GST10': 'Pacific/Guam',        # Guam Standard Time (UTC+10)
    'SST-11': 'Pacific/Pago_Pago',   # Samoa Standard Time (UTC-11)

    # Add mappings for any potential NaN or empty strings if they exist
    '': None, # Map empty string explicitly if needed
    # np.nan: None # pd.isna() check in function should handle actual NaN objects
}

# Function to apply timezone localization
def localize_datetime(row):
    tz_str = row['CZ_TIMEZONE']
    dt = row['datetime_col']
    if pd.isna(dt) or pd.isna(tz_str):
        return pd.NaT

    tz_name = tz_map.get(tz_str)
    if tz_name:
        try:
            # Localize the naive datetime
            return dt.tz_localize(tz_name, ambiguous='NaT', nonexistent='NaT')
        except Exception as e:
            # Log warning for specific row/error if needed
            # warnings.warn(f"Could not localize timezone '{tz_str}' for datetime {dt}: {e}")
            return pd.NaT # Failed to localize
    else:
        # Log warning for unmapped timezone if needed
        # warnings.warn(f"Timezone '{tz_str}' not found in tz_map.")
        return pd.NaT # Timezone not in map

# Apply localization - requires iterating or a more complex apply
# Create temporary column for the function
df_noaa['datetime_col'] = df_noaa['BEGIN_DT']
df_noaa['BEGIN_DT_LOC'] = df_noaa.apply(localize_datetime, axis=1)

df_noaa['datetime_col'] = df_noaa['END_DT']
df_noaa['END_DT_LOC'] = df_noaa.apply(localize_datetime, axis=1)

df_noaa.drop(columns=['datetime_col'], inplace=True) # remove temporary column

# Convert localized datetimes to UTC
df_noaa['BEGIN_DT_UTC'] = df_noaa['BEGIN_DT_LOC'].dt.tz_convert('UTC')
df_noaa['END_DT_UTC'] = df_noaa['END_DT_LOC'].dt.tz_convert('UTC')

# Check for NaTs introduced during timezone conversion
begin_tz_nat_count = df_noaa['BEGIN_DT_UTC'].isna().sum()
end_tz_nat_count = df_noaa['END_DT_UTC'].isna().sum()
if begin_tz_nat_count > begin_nat_count or end_tz_nat_count > end_nat_count:
     print(f"Warning: {begin_tz_nat_count - begin_nat_count} additional NaTs in BEGIN_DT_UTC after timezone conversion.")
     print(f"Warning: {end_tz_nat_count - end_nat_count} additional NaTs in END_DT_UTC after timezone conversion.")
     # Consider dropping rows where timezone conversion failed if UTC time is essential
     # df_noaa.dropna(subset=['BEGIN_DT_UTC', 'END_DT_UTC'], inplace=True)


# --- Numeric Conversions ---
for col in numeric_cols_to_check:
    if col in df_noaa.columns:
        # Convert to numeric, coercing errors to NaN
        df_noaa[col] = pd.to_numeric(df_noaa[col], errors='coerce')
        # Optionally fill NaNs created by coercion if appropriate (e.g., injuries/deaths)
        if 'INJURIES' in col or 'DEATHS' in col:
            df_noaa[col].fillna(0, inplace=True)


# --- FIPS Code to String ---
# Convert FIPS codes to string for future consistent merging
# Padding will be handled later when merging with Eaglei
if 'STATE_FIPS' in df_noaa.columns:
    df_noaa['STATE_FIPS'] = df_noaa['STATE_FIPS'].astype(str)
if 'CZ_FIPS' in df_noaa.columns:
    df_noaa['CZ_FIPS'] = df_noaa['CZ_FIPS'].astype(str)


# --- Display Info and Head ---
print("\n--- NOAA DataFrame Info after initial cleaning ---")
df_noaa.info(verbose=True, show_counts=True)

print("\n--- NOAA DataFrame Head ---")
print(df_noaa[['EVENT_ID', 'STATE', 'CZ_TYPE', 'CZ_FIPS', 'BEGIN_DATE_TIME', 'CZ_TIMEZONE', 'BEGIN_DT', 'BEGIN_DT_LOC', 'BEGIN_DT_UTC']].head())

# --- Optional: Check unique timezones ---
# print("\n--- Unique Timezones Found ---")
# print(df_noaa['CZ_TIMEZONE'].value_counts())

print("\nStep 1 (NOAA Load/Clean) Complete.")
# df_noaa now contains the loaded and initially cleaned NOAA data
# Key new columns: BEGIN_DT, END_DT (original parsed),
#                  BEGIN_DT_LOC, END_DT_LOC (localized),
#                  BEGIN_DT_UTC, END_DT_UTC (converted to UTC)


Loading NOAA data from: data/NOAA_StormEvents/StormEvents_2014_2024.csv
NOAA data loaded successfully.
Performing initial cleaning and type conversions...


AttributeError: Can only use .dt accessor with datetimelike values

In [9]:
import pandas as pd
import numpy as np
import warnings

# Define the path to the NOAA data file
noaa_file_path = 'data/NOAA_StormEvents/StormEvents_2014_2024.csv'

# Define columns that are likely numeric but might have issues during load
numeric_cols_to_check = [
    'BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME', 'END_YEARMONTH', 'END_DAY', 'END_TIME',
    'EPISODE_ID', 'EVENT_ID', 'STATE_FIPS', 'CZ_FIPS',
    'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT',
    'MAGNITUDE', 'TOR_LENGTH', 'TOR_WIDTH',
    'BEGIN_RANGE', 'END_RANGE', 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON'
]

# Define the expected datetime format
# Using errors='coerce' will turn unparseable dates into NaT (Not a Time)
noaa_datetime_format = '%d-%b-%y %H:%M:%S'

print(f"Loading NOAA data from: {noaa_file_path}")

# --- Load the data ---
try:
    df_noaa = pd.read_csv(noaa_file_path, low_memory=False)
    print("NOAA data loaded successfully.")
except FileNotFoundError:
    print(f"Error: File not found at {noaa_file_path}")
    exit()
except Exception as e:
    print(f"Error loading NOAA data: {e}")
    exit()

print("Performing initial cleaning and type conversions...")

# --- Datetime Conversion (Initial Parsing) ---
print("Parsing original datetime strings...")
df_noaa['BEGIN_DT'] = pd.to_datetime(df_noaa['BEGIN_DATE_TIME'], format=noaa_datetime_format, errors='coerce')
df_noaa['END_DT'] = pd.to_datetime(df_noaa['END_DATE_TIME'], format=noaa_datetime_format, errors='coerce')

# Check for initial parsing errors (NaT values)
begin_nat_count = df_noaa['BEGIN_DT'].isna().sum()
end_nat_count = df_noaa['END_DT'].isna().sum()
if begin_nat_count > 0 or end_nat_count > 0:
    print(f"Warning: Found {begin_nat_count} NaT values in BEGIN_DT after initial parsing.")
    print(f"Warning: Found {end_nat_count} NaT values in END_DT after initial parsing.")

# --- Timezone Handling ---
print("Mapping timezones...")
# Using the tz_map confirmed from your data
tz_map = {
    'EST-5': 'America/New_York',
    'EDT-4': 'America/New_York',
    'CST-6': 'America/Chicago',
    'CDT-5': 'America/Chicago',
    'MST-7': 'America/Denver',
    'MDT-6': 'America/Denver', # Keep just in case
    'PST-8': 'America/Los_Angeles',
    'PDT-7': 'America/Los_Angeles',
    'AKST-9': 'America/Anchorage',
    'AKDT-8': 'America/Anchorage', # Keep just in case
    'HST-10': 'Pacific/Honolulu',
    'AST-4': 'America/Puerto_Rico',
    'GST10': 'Pacific/Guam',
    'SST-11': 'Pacific/Pago_Pago',
    '': None, # Map empty string explicitly if needed
}

# Function to apply timezone localization
def localize_datetime(row):
    tz_str = row['CZ_TIMEZONE']
    dt = row['datetime_col']
    if pd.isna(dt) or pd.isna(tz_str):
        return pd.NaT

    # Ensure tz_str is string and strip whitespace for lookup
    tz_str = str(tz_str).strip()
    tz_name = tz_map.get(tz_str)

    if tz_name:
        try:
            return dt.tz_localize(tz_name, ambiguous='NaT', nonexistent='NaT')
        except Exception as e:
            # warnings.warn(f"Could not localize timezone '{tz_str}' for datetime {dt}: {e}")
            return pd.NaT
    else:
        if tz_str: # Avoid warning for known blanks mapped to None
             warnings.warn(f"Timezone '{tz_str}' not found in tz_map.", UserWarning)
        return pd.NaT

# Apply localization using the helper column
print("Applying timezone localization (this may take time)...")
df_noaa['datetime_col'] = df_noaa['BEGIN_DT']
df_noaa['BEGIN_DT_LOC'] = df_noaa.apply(localize_datetime, axis=1)

df_noaa['datetime_col'] = df_noaa['END_DT']
df_noaa['END_DT_LOC'] = df_noaa.apply(localize_datetime, axis=1)

df_noaa.drop(columns=['datetime_col'], inplace=True)
print("Timezone localization applied.")

# --- Convert Mixed Timezone 'Object' Columns Directly to UTC ---
# This step addresses the 'dtype: object' issue after localization
print("Attempting to convert localized columns directly to UTC...")
try:
    # Record NaNs before conversion
    original_loc_nan_begin = df_noaa['BEGIN_DT_LOC'].isna().sum()
    original_loc_nan_end = df_noaa['END_DT_LOC'].isna().sum()

    # Use pd.to_datetime with utc=True to handle the object column containing tz-aware objects
    df_noaa['BEGIN_DT_UTC'] = pd.to_datetime(df_noaa['BEGIN_DT_LOC'], errors='coerce', utc=True)
    df_noaa['END_DT_UTC'] = pd.to_datetime(df_noaa['END_DT_LOC'], errors='coerce', utc=True)

    print("Direct UTC conversion attempted.")

    # Check for new NaNs potentially introduced
    begin_utc_nat_count = df_noaa['BEGIN_DT_UTC'].isna().sum()
    end_utc_nat_count = df_noaa['END_DT_UTC'].isna().sum()
    if begin_utc_nat_count > original_loc_nan_begin or end_utc_nat_count > original_loc_nan_end:
         print(f"Warning: Additional NaTs potentially introduced during UTC conversion.")
         print(f"         ({original_loc_nan_begin} -> {begin_utc_nat_count} NaTs in BEGIN_DT_UTC)")
         print(f"         ({original_loc_nan_end} -> {end_utc_nat_count} NaTs in END_DT_UTC)")

except Exception as e:
    print(f"ERROR: Failed to convert localized object columns to UTC. Error: {e}")
    # Assign NaT if conversion fails catastrophically
    df_noaa['BEGIN_DT_UTC'] = pd.NaT
    df_noaa['END_DT_UTC'] = pd.NaT


# --- Numeric Conversions ---
print("Converting numeric columns...")
for col in numeric_cols_to_check:
    if col in df_noaa.columns:
        df_noaa[col] = pd.to_numeric(df_noaa[col], errors='coerce')
        if 'INJURIES' in col or 'DEATHS' in col:
            # Fill NaNs only for specific columns where 0 makes sense
            df_noaa[col].fillna(0, inplace=True)


# --- FIPS Code to String ---
print("Converting FIPS codes to string...")
if 'STATE_FIPS' in df_noaa.columns:
    # Use .astype(str).str.split('.').str[0] to handle potential floats before converting to string
    df_noaa['STATE_FIPS'] = df_noaa['STATE_FIPS'].astype(str).str.split('.').str[0]
if 'CZ_FIPS' in df_noaa.columns:
    df_noaa['CZ_FIPS'] = df_noaa['CZ_FIPS'].astype(str).str.split('.').str[0]


# --- Display Info and Head ---
print("\n--- NOAA DataFrame Info after initial cleaning ---")
df_noaa.info(verbose=True, show_counts=True)

print("\n--- NOAA DataFrame Head (focus on datetimes) ---")
print(df_noaa[['EVENT_ID', 'CZ_TIMEZONE', 'BEGIN_DT', 'BEGIN_DT_LOC', 'BEGIN_DT_UTC', 'END_DT_UTC']].head())

print("\n--- Data Types of Final Datetime Columns ---")
print(df_noaa[['BEGIN_DT_UTC', 'END_DT_UTC']].dtypes)

print("\nStep 1 (NOAA Load/Clean) Complete.")
# df_noaa now contains the loaded and initially cleaned NOAA data
# Key final columns: BEGIN_DT_UTC, END_DT_UTC (should be datetime64[ns, UTC])

Loading NOAA data from: data/NOAA_StormEvents/StormEvents_2014_2024.csv
NOAA data loaded successfully.
Performing initial cleaning and type conversions...
Parsing original datetime strings...
Mapping timezones...
Applying timezone localization (this may take time)...
Timezone localization applied.
Attempting to convert localized columns directly to UTC...
Direct UTC conversion attempted.
Converting numeric columns...
Converting FIPS codes to string...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_noaa[col].fillna(0, inplace=True)



--- NOAA DataFrame Info after initial cleaning ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 691434 entries, 0 to 691433
Data columns (total 57 columns):
 #   Column              Non-Null Count   Dtype              
---  ------              --------------   -----              
 0   BEGIN_YEARMONTH     691434 non-null  int64              
 1   BEGIN_DAY           691434 non-null  int64              
 2   BEGIN_TIME          691434 non-null  int64              
 3   END_YEARMONTH       691434 non-null  int64              
 4   END_DAY             691434 non-null  int64              
 5   END_TIME            691434 non-null  int64              
 6   EPISODE_ID          691434 non-null  int64              
 7   EVENT_ID            691434 non-null  int64              
 8   STATE               691434 non-null  object             
 9   STATE_FIPS          691434 non-null  object             
 10  YEAR                691434 non-null  int64              
 11  MONTH_NAME          691434

In [10]:
df_noaa.to_csv("NOAA_timezone_cleaned.csv",index=False)

Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,EPISODE_ID,EVENT_ID,STATE,STATE_FIPS,...,END_LON,EPISODE_NARRATIVE,EVENT_NARRATIVE,DATA_SOURCE,BEGIN_DT,END_DT,BEGIN_DT_LOC,END_DT_LOC,BEGIN_DT_UTC,END_DT_UTC
0,201402,18,1000,201402,18,2000,83473,503953,NEW HAMPSHIRE,33,...,,Low pressure developing south of Long Island a...,Eight to twelve inches of snow fell across eas...,CSV,2014-02-18 10:00:00,2014-02-18 20:00:00,2014-02-18 10:00:00-05:00,2014-02-18 20:00:00-05:00,2014-02-18 15:00:00+00:00,2014-02-19 01:00:00+00:00
1,201403,30,831,201403,30,931,83971,507163,MASSACHUSETTS,25,...,-71.3469,A stacked low pressure system passed south and...,Boston Road was closed near Brian Road due to ...,CSV,2014-03-30 08:31:00,2014-03-30 09:31:00,2014-03-30 08:31:00-04:00,2014-03-30 09:31:00-04:00,2014-03-30 12:31:00+00:00,2014-03-30 13:31:00+00:00
2,201404,27,2306,201404,27,2306,83517,506236,MISSOURI,29,...,-92.6600,A powerful storm system and a dry line produce...,,CSV,2014-04-27 23:06:00,2014-04-27 23:06:00,2014-04-27 23:06:00-05:00,2014-04-27 23:06:00-05:00,2014-04-28 04:06:00+00:00,2014-04-28 04:06:00+00:00
3,201404,27,2303,201404,27,2303,83517,506237,MISSOURI,29,...,-92.6600,A powerful storm system and a dry line produce...,Several power poles snapped and trees blown down.,CSV,2014-04-27 23:03:00,2014-04-27 23:03:00,2014-04-27 23:03:00-05:00,2014-04-27 23:03:00-05:00,2014-04-28 04:03:00+00:00,2014-04-28 04:03:00+00:00
4,201402,15,1300,201402,15,2100,83132,501499,WASHINGTON,53,...,,A strong cold front produced strong winds for ...,Two stations measured strong wind gusts in the...,CSV,2014-02-15 13:00:00,2014-02-15 21:00:00,2014-02-15 13:00:00-08:00,2014-02-15 21:00:00-08:00,2014-02-15 21:00:00+00:00,2014-02-16 05:00:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
691429,202405,26,1148,202405,26,1148,192532,1188957,KENTUCKY,21,...,-84.7200,A strong storm system moved across the Ohio an...,A trained spotter estimated 60 mph wind gusts ...,CSV,2024-05-26 11:48:00,2024-05-26 11:48:00,2024-05-26 11:48:00-04:00,2024-05-26 11:48:00-04:00,2024-05-26 15:48:00+00:00,2024-05-26 15:48:00+00:00
691430,202405,22,1809,202405,22,1809,192530,1188234,INDIANA,18,...,-85.7364,A cold front moved into the Ohio Valley during...,A tree was down at Lovers Lane and Prewitt Lane.,CSV,2024-05-22 18:09:00,2024-05-22 18:09:00,2024-05-22 18:09:00-04:00,2024-05-22 18:09:00-04:00,2024-05-22 22:09:00+00:00,2024-05-22 22:09:00+00:00
691431,202405,22,1757,202405,22,1757,192530,1188232,INDIANA,18,...,-86.7247,A cold front moved into the Ohio Valley during...,A tree was reported down over Chestnut Grove R...,CSV,2024-05-22 17:57:00,2024-05-22 17:57:00,2024-05-22 17:57:00-04:00,2024-05-22 17:57:00-04:00,2024-05-22 21:57:00+00:00,2024-05-22 21:57:00+00:00
691432,202406,23,1745,202406,23,1750,191388,1192879,NEW HAMPSHIRE,33,...,-70.8400,A supercell thunderstorm developed across sout...,A supercell thunderstorm dropped hail the size...,CSV,2024-06-23 17:45:00,2024-06-23 17:50:00,2024-06-23 17:45:00-04:00,2024-06-23 17:50:00-04:00,2024-06-23 21:45:00+00:00,2024-06-23 21:50:00+00:00
