Import Libraries

In [1]:
import pandas as pd
import numpy as np
import sqlite3 as sq
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

Load Data From Database

In [2]:
# Load every row of the `earthquakes` table into a DataFrame.
retrieve_df_query = 'SELECT * FROM earthquakes;' #Query to retrieve all rows

conn = sq.connect('../data/earthquakes.db') #Connect to database
try:
    loaded_earthquakes_df = pd.read_sql(retrieve_df_query, conn) #Run query to retrieve dataframe
finally:
    # Close in `finally` so the handle is released even if the query fails;
    # using the sqlite3 connection as a context manager would not close it.
    conn.close()

loaded_earthquakes_df #Output dataframe

Unnamed: 0,time,latitude,longitude,depth,mag,magType,place,region,type
0,2025-11-13T00:31:55.930Z,59.728500,-152.003300,48.900,1.60,ml,"11 km WSW of Anchor Point, Alaska",Alaska,earthquake
1,2025-11-13T00:24:29.126Z,64.738100,-149.119300,16.700,1.50,ml,"15 km N of Four Mile Road, Alaska",Alaska,earthquake
2,2025-11-13T00:21:27.261Z,64.929100,-147.755700,0.000,1.20,ml,"3 km NW of Farmers Loop, Alaska",Alaska,earthquake
3,2025-11-13T00:19:21.141Z,59.719100,-151.184600,25.700,2.20,ml,"6 km ESE of Fritz Creek, Alaska",Alaska,earthquake
4,2025-11-13T00:15:44.040Z,38.824833,-122.801498,2.580,0.70,md,"7 km NW of The Geysers, CA",CA,earthquake
...,...,...,...,...,...,...,...,...,...
7150,2025-10-14T00:47:40.463Z,13.950400,-91.225700,62.751,4.00,mb,"23 km SW of La Gomera, Guatemala",Guatemala,earthquake
7151,2025-10-14T00:44:26.502Z,10.798300,123.994700,10.000,4.40,mb,"2 km SSE of Lugo, Philippines",Philippines,earthquake
7152,2025-10-14T00:40:29.080Z,38.627666,-119.795670,-0.710,2.37,md,"7 km S of Markleeville, CA",CA,earthquake
7153,2025-10-14T00:39:35.300Z,39.482333,-111.390333,1.730,1.05,ml,"9 km E of Spring City, Utah",Utah,earthquake


Define Features and Target Variable

In [3]:
# Feature matrix: the latitude, longitude and depth columns.
X = loaded_earthquakes_df.loc[:, ['latitude', 'longitude', 'depth']]
# Target vector: the `mag` column.
y = loaded_earthquakes_df.loc[:, 'mag']

Split Into Training and Test Sets

In [4]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)


Create, Train and Save the Model

In [5]:
# Build a 100-tree random forest (fixed seed for repeatable results) and fit it
# on the training split.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Predictions for the held-out rows, kept for scoring in the evaluation cell.
predictions = rf_model.predict(X_test)

# Persist the fitted model as a pickle file.
with open('../data/model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)


Evaluate Model Error (MAE)

In [6]:
# Baseline: predict the training-set mean of the target for every test row and
# measure the mean absolute error (MAE) of that constant guess.
mean_pred = np.full(len(y_test), y_train.mean(), dtype=float)
baseline_mae = mean_absolute_error(y_test, mean_pred)
# The printed value is an error metric, not a mean of the data, so label it as such.
print(f'Baseline MAE (predicting the training mean): {baseline_mae}')

# Model: MAE of the stored test-set predictions against the true targets.
model_mae = mean_absolute_error(y_test, predictions)
print(f'Model MAE: {model_mae}')

Mean value from training data: 0.9683194019725212
Mean value from model: 0.3692798096568162
