In [1]:
import pandas as pd
import numpy as np
# Load the raw earthquake catalogue from a Parquet file in the working directory.
df = pd.read_parquet('earthquakes.parquet')

In [2]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,...,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,2025-07-31T09:10:55.974Z,52.0439,160.4564,10.0,5.3,mb,105.0,77.0,1.476,1.26,...,2025-07-31T10:04:36.040Z,"168 km SE of Petropavlovsk-Kamchatsky, Russia",earthquake,8.19,1.81,0.027,465.0,reviewed,us,us
1,2025-07-31T08:18:36.686Z,-5.4288,146.0565,59.026,4.6,mb,33.0,136.0,3.608,0.54,...,2025-07-31T09:28:57.040Z,"37 km SE of Madang, Papua New Guinea",earthquake,11.03,7.853,0.066,68.0,reviewed,us,us
2,2025-07-31T07:42:38.478Z,49.8822,157.446,10.0,5.6,mb,132.0,58.0,3.234,0.95,...,2025-07-31T10:14:38.776Z,"128 km SE of Severo-Kuril’sk, Russia",earthquake,8.24,1.787,0.02,862.0,reviewed,us,us
3,2025-07-31T07:21:40.909Z,49.2332,157.8546,10.0,5.2,mb,113.0,61.0,3.828,0.94,...,2025-07-31T07:43:21.040Z,"202 km SE of Severo-Kuril’sk, Russia",earthquake,6.1,1.804,0.02,797.0,reviewed,us,us
4,2025-07-31T06:59:47.894Z,52.6423,158.8071,78.377,5.3,mb,103.0,104.0,0.393,0.87,...,2025-07-31T07:20:11.040Z,"42 km SE of Vilyuchinsk, Russia",earthquake,8.28,5.212,0.02,874.0,reviewed,us,us


In [3]:
# Work on a copy so the loaded frame `df` is left unmodified.
df1 = df.copy()

In [4]:
# Remove pandas' column-display limit so wide frames are shown in full
# rather than being elided with '...'.
pd.set_option('display.max_columns', None)

In [5]:
# Preview the working copy, now with every column visible.
df1.head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,id,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,2025-07-31T09:10:55.974Z,52.0439,160.4564,10.0,5.3,mb,105.0,77.0,1.476,1.26,us,usd0013iej,2025-07-31T10:04:36.040Z,"168 km SE of Petropavlovsk-Kamchatsky, Russia",earthquake,8.19,1.81,0.027,465.0,reviewed,us,us
1,2025-07-31T08:18:36.686Z,-5.4288,146.0565,59.026,4.6,mb,33.0,136.0,3.608,0.54,us,us6000qwvk,2025-07-31T09:28:57.040Z,"37 km SE of Madang, Papua New Guinea",earthquake,11.03,7.853,0.066,68.0,reviewed,us,us
2,2025-07-31T07:42:38.478Z,49.8822,157.446,10.0,5.6,mb,132.0,58.0,3.234,0.95,us,us6000qwvb,2025-07-31T10:14:38.776Z,"128 km SE of Severo-Kuril’sk, Russia",earthquake,8.24,1.787,0.02,862.0,reviewed,us,us
3,2025-07-31T07:21:40.909Z,49.2332,157.8546,10.0,5.2,mb,113.0,61.0,3.828,0.94,us,us6000qwv7,2025-07-31T07:43:21.040Z,"202 km SE of Severo-Kuril’sk, Russia",earthquake,6.1,1.804,0.02,797.0,reviewed,us,us
4,2025-07-31T06:59:47.894Z,52.6423,158.8071,78.377,5.3,mb,103.0,104.0,0.393,0.87,us,us6000qwv0,2025-07-31T07:20:11.040Z,"42 km SE of Vilyuchinsk, Russia",earthquake,8.28,5.212,0.02,874.0,reviewed,us,us


In [7]:
# List the available column names before choosing which ones to keep.
print(df1.columns)

Index(['time', 'latitude', 'longitude', 'depth', 'mag', 'magType', 'nst',
       'gap', 'dmin', 'rms', 'net', 'id', 'updated', 'place', 'type',
       'horizontalError', 'depthError', 'magError', 'magNst', 'status',
       'locationSource', 'magSource'],
      dtype='object')


In [8]:
# Count rows per event type to see which kinds of events the catalogue contains.
print(df1["type"].value_counts())

type
earthquake    73419
Name: count, dtype: int64


In [11]:
# Keep only the location, depth and magnitude columns used in the cells below.
needed_columns = ['latitude', 'longitude', 'depth', 'mag']
df1 = df1[needed_columns]

In [12]:
# Report the number of missing values in every column of the working frame.
# The counts are read from `df1` (the frame being prepared), not from the raw
# `df`, so the check stays valid if `df1` is filtered or transformed earlier.
for col in df1.columns:
    print(f"column_name: {col}")
    print(df1[col].isnull().sum())

column_name: latitude
0
column_name: longitude
0
column_name: depth
0
column_name: mag
0


In [13]:
# Separate copy for scaling and clustering, so `df1` keeps the original
# depth and mag values.
df2 = df1.copy()

In [14]:
from sklearn.preprocessing import StandardScaler

# Standardise depth and mag in df2; latitude/longitude are left in degrees.
# NOTE(review): the clustering cell visible in this notebook uses only
# latitude/longitude, so these scaled columns do not feed DBSCAN here.
scaled_columns = ['depth', 'mag']
scaler = StandardScaler()
df2[scaled_columns] = scaler.fit_transform(df2[scaled_columns])

In [15]:
# Check the result of scaling: depth and mag are now standardised values.
print (df2.head())

   latitude  longitude     depth       mag
0   52.0439   160.4564 -0.458333  1.336499
1   -5.4288   146.0565 -0.029300 -0.549858
2   49.8822   157.4460 -0.458333  2.144938
3   49.2332   157.8546 -0.458333  1.067020
4   52.6423   158.8071  0.140043  1.336499


In [16]:
# Build an (n_rows, 2) array of [latitude, longitude] and convert it from
# degrees to radians for the haversine-metric DBSCAN run further down.
geo_columns = ['latitude', 'longitude']
coords = df2[geo_columns].to_numpy()
coords_radians = np.radians(coords)

In [17]:
# Kilometres spanned by one radian of arc on the sphere used for distances.
kms_per_radian = 6371.0088
# Neighbourhood radius for clustering: 50 km, expressed in radians so it is
# comparable with distances computed on radian coordinates.
search_radius_km = 50
epsilon = search_radius_km / kms_per_radian

In [18]:
from sklearn.cluster import DBSCAN

# Density-based clustering on the radian coordinates: a point needs at least
# 10 neighbours within `epsilon` (great-circle distance) to seed a cluster.
dbscan_params = dict(eps=epsilon, min_samples=10, metric='haversine')
db = DBSCAN(**dbscan_params).fit(coords_radians)

In [19]:
# Attach each row's DBSCAN label; labels_ follows the row order of
# coords_radians, which was built from df2. Label -1 appears for rows that
# were not assigned to any cluster.
df2['cluster'] = db.labels_

In [20]:
# Preview the scaled frame with its new cluster column.
df2.head()

Unnamed: 0,latitude,longitude,depth,mag,cluster
0,52.0439,160.4564,-0.458333,1.336499,0
1,-5.4288,146.0565,-0.0293,-0.549858,1
2,49.8822,157.446,-0.458333,2.144938,0
3,49.2332,157.8546,-0.458333,1.06702,0
4,52.6423,158.8071,0.140043,1.336499,0


In [21]:
# Copy the cluster labels onto the unscaled frame. The assignment aligns on
# the index; df2 was created as a copy of df1, so rows match one-to-one.
df1['cluster'] = df2['cluster']
df1.head()

Unnamed: 0,latitude,longitude,depth,mag,cluster
0,52.0439,160.4564,10.0,5.3,0
1,-5.4288,146.0565,59.026,4.6,1
2,49.8822,157.446,10.0,5.6,0
3,49.2332,157.8546,10.0,5.2,0
4,52.6423,158.8071,78.377,5.3,0


In [23]:
# Number of distinct label values; this count includes the -1 label, so it is
# one higher than the number of actual clusters when unassigned rows exist.
print(df1['cluster'].nunique())

451


In [24]:
# Number of rows carrying each cluster label (the -1 group included).
df1.groupby('cluster')['cluster'].count()

cluster
-1       7089
 0       7189
 1       3966
 2      10170
 3       5265
        ...  
 445       10
 446       13
 447       14
 448       15
 449       10
Name: cluster, Length: 451, dtype: int64

In [25]:
# Summarise the clustering: label -1 is tallied separately as noise and is
# therefore excluded from the cluster count.
labels = db.labels_
is_noise = labels == -1
n_clusters = len(set(labels)) - int(bool(is_noise.any()))
n_noise = int(np.count_nonzero(is_noise))

print(n_clusters)
print(n_noise)

450
7089


In [26]:
# Save the unscaled rows together with their cluster label; index=False
# leaves the row index out of the CSV.
df1.to_csv('output_data.csv', index=False)