In [2]:
import pandas as pd
import numpy as np

# Read the data (assuming it's in a CSV format, adjust accordingly)
df = pd.read_csv('Comprehensive_Earthquake_Dataset.csv')

# Preview the data
print(df.head())

# 1. Save the original 'Time of Occurrence' column before converting to datetime
df['Original Time of Occurrence'] = df['Time of Occurrence']

# 2. Convert the 'Time of Occurrence' column to datetime for analysis
df['Time of Occurrence'] = pd.to_datetime(df['Time of Occurrence'], format='%d-%m-%Y', errors='coerce')

# 3. Handling missing values:
# Replace missing numeric values with the median of the column
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())

# Replace missing categorical values with a placeholder, e.g., 'Unknown'
categorical_columns = df.select_dtypes(include=[object]).columns
df[categorical_columns] = df[categorical_columns].fillna('Unknown')

# 4. Convert the Latitude and Longitude columns to numeric, handling any errors
df['Latitude'] = pd.to_numeric(df['Latitude'], errors='coerce')
df['Longitude'] = pd.to_numeric(df['Longitude'], errors='coerce')

# 5. Convert 'Cascading Event' to a binary numeric value
df['Cascading Event'] = df['Cascading Event'].map({'Yes': 1, 'No': 0})

# 6. If 'Cascading Event Type' is a key feature, one-hot encode it
df = pd.get_dummies(df, columns=['Cascading Event Type'], prefix='CascadingEvent')

# 7. Check for any remaining NaNs and handle them
df = df.fillna(df.median())  # Replace remaining NaNs with the median

# 8. Data type conversion (if needed for specific columns)
df['Fatalities'] = df['Fatalities'].astype(int)
df['Building Failure Rate'] = pd.to_numeric(df['Building Failure Rate'], errors='coerce')

# 9. Ensure all numeric columns are properly formatted
df = df.apply(pd.to_numeric, errors='ignore')

# Final Check: Preview the processed data
print(df.info())  # Check data types and null values
print(df.head())  # View the first few rows of the processed data


   Event ID  Magnitude  Depth (km) Time of Occurrence   Latitude   Longitude  \
0         1        6.1          21         06-04-2023  33.996715 -118.345031   
1         2        7.9          15         02-08-2023  34.051518 -117.751207   
2         3        7.2          11         21-10-2023  33.796659 -118.695034   
3         4        6.8          28         28-07-2023  34.020960 -118.171458   
4         5        5.5          18         09-12-2023  34.211914 -118.667428   

0                            0.32                     35   
1                            0.33                     29   
2                            0.38                     48   
3                            0.30                     25   
4                            0.43                     48   

   EEWS False Alarm Rate Cascading Event Cascading Event Type  \
0                   0.08             Yes            Landslide   
1                   0.04              No                    -   
2                   0.0

TypeError: could not convert string to float: '06-04-2023'

In [4]:
# Create DataFrame

df.to_csv("cleaned123456.csv", index=False)
print("Dataset saved as 'cleaned123456.csv'")

Dataset saved as 'cleaned123456.csv'
