# Earthquake prediction

In [2]:
import kagglehub
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Download dataset

In [16]:
# Fetch the latest version of the USGS earthquake dataset into the kagglehub cache
path = kagglehub.dataset_download("usgs/earthquake-database")
dest_path = "./dataset"

# Copy into the repo instead of moving:
# - shutil.move() places the source *inside* dest_path once ./dataset already exists,
#   so a second run would produce ./dataset/1 instead of ./dataset/database.csv;
# - copying leaves the kagglehub cache intact.
# dirs_exist_ok=True makes the cell safe to re-run.
shutil.copytree(path, dest_path, dirs_exist_ok=True)
print(f"Files copied from {path} to {dest_path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/usgs/earthquake-database?dataset_version_number=1...


100%|██████████| 590k/590k [00:00<00:00, 1.00MB/s]

Extracting files...
File moved from C:\Users\ameli\.cache\kagglehub\datasets\usgs\earthquake-database\versions\1 to ./dataset





# Neural network model (SSN)

In [15]:
# Read the earthquake catalogue from the local dataset folder
df = pd.read_csv('./dataset/database.csv')

# Report how many values are missing in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Preview the first rows (rendered as the cell's last expression)
df.head()

Date                              0
Time                              0
Latitude                          0
Longitude                         0
Type                              0
Depth                             0
Depth Error                   18950
Depth Seismic Stations        16314
Magnitude                         0
Magnitude Type                    3
Magnitude Error               23084
Magnitude Seismic Stations    20847
Azimuthal Gap                 16112
Horizontal Distance           21807
Horizontal Error              22255
Root Mean Square               6059
ID                                0
Source                            0
Location Source                   0
Magnitude Source                  0
Status                            0
dtype: int64


Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,...,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status
0,01/02/1965,13:44:18,19.246,145.616,Earthquake,131.6,,,6.0,MW,...,,,,,,ISCGEM860706,ISCGEM,ISCGEM,ISCGEM,Automatic
1,01/04/1965,11:29:49,1.863,127.352,Earthquake,80.0,,,5.8,MW,...,,,,,,ISCGEM860737,ISCGEM,ISCGEM,ISCGEM,Automatic
2,01/05/1965,18:05:58,-20.579,-173.972,Earthquake,20.0,,,6.2,MW,...,,,,,,ISCGEM860762,ISCGEM,ISCGEM,ISCGEM,Automatic
3,01/08/1965,18:49:43,-59.076,-23.557,Earthquake,15.0,,,5.8,MW,...,,,,,,ISCGEM860856,ISCGEM,ISCGEM,ISCGEM,Automatic
4,01/09/1965,13:32:50,11.938,126.427,Earthquake,15.0,,,5.8,MW,...,,,,,,ISCGEM860890,ISCGEM,ISCGEM,ISCGEM,Automatic


In [41]:
# Load and preprocess data
# The CSV is re-read here so the cell always starts from the raw string columns
# and can be re-run without depending on earlier in-place edits.
df = pd.read_csv('./dataset/database.csv')

print(df.head())

# Expected raw formats. The sample rows (01/02/1965, 01/04/1965, 01/05/1965, ...)
# read as consecutive January days, i.e. dates are MM/DD/YYYY; times are HH:MM:SS.
date_pattern = r'^\d{2}/\d{2}/\d{4}$'
time_pattern = r'^\d{2}:\d{2}:\d{2}$'

# Rows whose value does not match the expected pattern (missing values count as invalid).
# The masks are computed once and reused for both reporting and repair.
bad_date_mask = ~df['Date'].str.match(date_pattern, na=False)
bad_time_mask = ~df['Time'].str.match(time_pattern, na=False)
invalid_dates = df[bad_date_mask]
invalid_times = df[bad_time_mask]

# Display invalid rows before the repair
print("Invalid Dates:")
print(invalid_dates)

print("Invalid Times:")
print(invalid_times)

# Re-parse the offending date strings and write them back in the same MM/DD/YYYY
# layout as the rest of the column (writing DD/MM/YYYY would silently swap day
# and month relative to every other row). Unparseable values become NaN.
df.loc[bad_date_mask, 'Date'] = (
    pd.to_datetime(df.loc[bad_date_mask, 'Date'], errors='coerce')
    .dt.strftime('%m/%d/%Y')
)

# Same for the time column: keep only the HH:MM:SS component so the repaired
# rows match time_pattern.
df.loc[bad_time_mask, 'Time'] = (
    pd.to_datetime(df.loc[bad_time_mask, 'Time'], errors='coerce')
    .dt.strftime('%H:%M:%S')
)

# Re-check: anything still listed here could not be parsed at all
invalid_dates_after = df[~df['Date'].str.match(date_pattern, na=False)]
invalid_times_after = df[~df['Time'].str.match(time_pattern, na=False)]

print("Invalid Dates:")
print(invalid_dates_after)

print("Invalid Times:")
print(invalid_times_after)

         Date      Time  Latitude  Longitude        Type  Depth  Depth Error  \
0  01/02/1965  13:44:18    19.246    145.616  Earthquake  131.6          NaN   
1  01/04/1965  11:29:49     1.863    127.352  Earthquake   80.0          NaN   
2  01/05/1965  18:05:58   -20.579   -173.972  Earthquake   20.0          NaN   
3  01/08/1965  18:49:43   -59.076    -23.557  Earthquake   15.0          NaN   
4  01/09/1965  13:32:50    11.938    126.427  Earthquake   15.0          NaN   

   Depth Seismic Stations  Magnitude Magnitude Type  ...  \
0                     NaN        6.0             MW  ...   
1                     NaN        5.8             MW  ...   
2                     NaN        6.2             MW  ...   
3                     NaN        5.8             MW  ...   
4                     NaN        5.8             MW  ...   

   Magnitude Seismic Stations  Azimuthal Gap  Horizontal Distance  \
0                         NaN            NaN                  NaN   
1                   

KeyboardInterrupt: 

In [None]:

# Parse the string columns into real date / time values.
# NOTE: expects `df` with string 'Date' (MM/DD/YYYY) and 'Time' (HH:MM:SS) columns,
# as produced by the previous cell; values in any other layout are coerced to NaT.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y', errors='coerce')

# '%H:%M:%S' matches the HH:MM:SS strings in the column; keep only the time component
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S', errors='coerce').dt.time

# Rows where 'Date' or 'Time' could not be parsed
invalid_dates = df[df['Date'].isna()]
invalid_times = df[df['Time'].isna()]

# Display invalid dates and times
print("Invalid Dates:")
print(invalid_dates)

print("Invalid Times:")
print(invalid_times)

# Drop the rows that have no usable date or time
df = df.dropna(subset=['Date', 'Time'])

# Display the first few rows of the cleaned dataset
print(df.head())

# Check the data types to confirm the changes
print(df.dtypes)

In [None]:
# Load and preprocess data for the regression model
data = pd.read_csv('./dataset/database.csv')

# Display the first few rows of the dataset
print(data.head())

# Column the network learns to predict. The earlier 'target_column' placeholder
# is not a column of this CSV and raised KeyError; change this constant to
# predict a different numeric column.
TARGET_COLUMN = 'Magnitude'

# StandardScaler and the Dense layers only accept numbers, so the text columns
# (Date, Time, Type, ID, Source, ...) are left out of the feature matrix.
numeric_data = data.select_dtypes(include='number')

# Forward-fill, then back-fill: ffill alone leaves NaN in the leading rows
# (the first rows of this file have no 'Depth Error' etc.), and a single NaN
# feature turns the training loss into NaN. Columns with no values at all
# are dropped.
numeric_data = numeric_data.ffill().bfill().dropna(axis=1)

# Split the data into features and target
X = numeric_data.drop(columns=TARGET_COLUMN)
y = numeric_data[TARGET_COLUMN]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the neural network model
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))  # single linear output: regression

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model, holding out 20% of the training split for validation
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model on the untouched test split
loss = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')

# Make predictions
predictions = model.predict(X_test)

# Plot the training and validation loss
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()