In [1]:
import pandas as pd
import numpy as np

# Load the dataset 
file_path = 'Comprehensive_Earthquake_Dataset.csv'  
df = pd.read_csv(file_path)

In [2]:
print(df.head())

   Event ID  Magnitude  Depth (km) Time of Occurrence   Latitude   Longitude  \
0         1        6.1          21         06-04-2023  33.996715 -118.345031   
1         2        7.9          15         02-08-2023  34.051518 -117.751207   
2         3        7.2          11         21-10-2023  33.796659 -118.695034   
3         4        6.8          28         28-07-2023  34.020960 -118.171458   
4         5        5.5          18         09-12-2023  34.211914 -118.667428   

0                            0.32                     35   
1                            0.33                     29   
2                            0.38                     48   
3                            0.30                     25   
4                            0.43                     48   

   EEWS False Alarm Rate Cascading Event Cascading Event Type  \
0                   0.08             Yes            Landslide   
1                   0.04              No                    -   
2                   0.0

In [3]:
df_cleaned = df.dropna()

In [4]:
df_cleaned = df_cleaned.drop_duplicates()

In [5]:
# Filling missing numerical values with 0 or using forward fill/backward fill depending on the context
df['Building Failure Rate'] = df['Building Failure Rate'].fillna(0)
df['Landslide Probability'] = df['Landslide Probability'].fillna(0)
df['Infrastructure Damage Cost (in USD)'] = df['Infrastructure Damage Cost (in USD)'].fillna(0)
df['Fatalities'] = df['Fatalities'].fillna(0)


In [6]:
# If cascading event is 'No', 'Cascading Event Type' can be safely filled with 'None'
df['Cascading Event Type'] = df['Cascading Event Type'].fillna('None')

In [7]:
# 2. Convert 'Time of Occurrence' to datetime format
df['Time of Occurrence'] = pd.to_datetime(df['Time of Occurrence'], format='%d-%m-%Y')

In [8]:
# 3. Convert necessary columns to numeric (if they are not)
df['Magnitude'] = pd.to_numeric(df['Magnitude'], errors='coerce')
df['Depth (km)'] = pd.to_numeric(df['Depth (km)'], errors='coerce')
df['PGA (Peak Ground Acceleration)'] = pd.to_numeric(df['PGA (Peak Ground Acceleration)'], errors='coerce')
df['Latitude'] = pd.to_numeric(df['Latitude'], errors='coerce')
df['Longitude'] = pd.to_numeric(df['Longitude'], errors='coerce')

In [9]:
# 4. Feature Engineering: Extract necessary columns for further analysis
features = df[['Magnitude', 'PGA (Peak Ground Acceleration)', 'Depth (km)', 'Latitude', 'Longitude', 'Time of Occurrence']]

In [10]:
# Display the cleaned and processed data
print("\nCleaned and Extracted Data:")
print(features.head())



Cleaned and Extracted Data:
   Magnitude  PGA (Peak Ground Acceleration)  Depth (km)   Latitude  \
0        6.1                            0.32          21  33.996715   
1        7.9                            0.33          15  34.051518   
2        7.2                            0.38          11  33.796659   
3        6.8                            0.30          28  34.020960   
4        5.5                            0.43          18  34.211914   

    Longitude Time of Occurrence  
0 -118.345031         2023-04-06  
1 -117.751207         2023-08-02  
2 -118.695034         2023-10-21  
3 -118.171458         2023-07-28  
4 -118.667428         2023-12-09  


In [11]:
# Save the cleaned dataset to a new CSV 
features.to_csv('cleaned_earthquake_data.csv', index=False)

In [15]:
#features['Year'] = df['Time of Occurrence'].dt.year
#features['Month'] = df['Time of Occurrence'].dt.month
#features['Day'] = df['Time of Occurrence'].dt.day


In [16]:
features['Log Building Failure Rate'] = np.log1p(df['Building Failure Rate'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['Log Building Failure Rate'] = np.log1p(df['Building Failure Rate'])


In [14]:
# Save the cleaned dataset to a new CSV (optional)
features.to_csv('cleaned.csv', index=False)