In [1]:
import os
from sqlite3 import connect

import numpy as np
import pandas as pd

In [2]:
# Print the working directory; relative paths such as "../datasets/..." resolve against it.
!pwd

/Users/Alenka/code/Tim-Frith/moonster_quakes/notebooks


## Import data for earthquakes

In [3]:
# Dataset path relative to the notebook's working directory (the repo's notebooks/ folder),
# matching the "../datasets/..." convention used for the SQLite database further down.
# A user-specific absolute path would break on any other machine.
path = "../datasets/significant_earthquakes.csv"

In [4]:
# Load the earthquake CSV located at `path` into a DataFrame.
earthquakes_df = pd.read_csv(path)

In [5]:
# Sanity check: confirm that read_csv returned a DataFrame.
type(earthquakes_df)

pandas.core.frame.DataFrame

In [6]:
# (rows, columns) of the full dataset.
earthquakes_df.shape

(97395, 23)

In [7]:
# How many quakes have a magnitude above 7?
earthquakes_df.loc[earthquakes_df['mag'] > 7].shape

(1260, 23)

In [8]:
# Preview the first quakes with a magnitude above 7.
earthquakes_df.loc[earthquakes_df['mag'] > 7].head()

Unnamed: 0.1,Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,...,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,0,1900-10-09T12:25:00.000Z,57.09,-153.48,,7.86,mw,,,,...,2022-05-09T14:44:17.838Z,"16 km SW of Old Harbor, Alaska",earthquake,,,,,reviewed,ushis,pt
16,16,1904-04-04T10:26:00.880Z,41.758,23.249,15.0,7.02,mw,,,,...,2022-04-26T14:54:31.433Z,"7 km SE of Stara Kresna, Bulgaria",earthquake,,4.8,0.4,,reviewed,iscgem,iscgem
18,18,1904-06-25T21:00:38.720Z,52.763,160.277,30.0,7.7,mw,,,,...,2022-04-25T20:22:48.406Z,"115 km ESE of Petropavlovsk-Kamchatsky, Russia",earthquake,,10.3,0.4,,reviewed,iscgem,iscgem
19,19,1904-06-25T14:45:39.140Z,51.424,161.638,15.0,7.5,mw,,,,...,2022-05-09T22:48:24.972Z,"274 km SE of Petropavlovsk-Kamchatsky, Russia",earthquake,,25.0,0.4,,reviewed,iscgem,iscgem
20,20,1904-08-30T11:43:20.850Z,30.684,100.608,15.0,7.09,mw,,,,...,2022-04-25T20:23:00.657Z,"150 km WNW of Kangding, China",earthquake,,25.0,0.4,,reviewed,iscgem,iscgem


Two earthquakes (index rows 16 and 18) are selected for a test calculation.

In [9]:
# Label slice 16:16 keeps the result a one-row DataFrame (not a Series).
earthquakes_df.loc[16:16]

Unnamed: 0.1,Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,...,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
16,16,1904-04-04T10:26:00.880Z,41.758,23.249,15.0,7.02,mw,,,,...,2022-04-26T14:54:31.433Z,"7 km SE of Stara Kresna, Bulgaria",earthquake,,4.8,0.4,,reviewed,iscgem,iscgem


In [10]:
# Label slice 18:18 keeps the result a one-row DataFrame (not a Series).
earthquakes_df.loc[18:18]

Unnamed: 0.1,Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,...,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
18,18,1904-06-25T21:00:38.720Z,52.763,160.277,30.0,7.7,mw,,,,...,2022-04-25T20:22:48.406Z,"115 km ESE of Petropavlovsk-Kamchatsky, Russia",earthquake,,10.3,0.4,,reviewed,iscgem,iscgem


## Test function on 2 earthquakes

In [11]:
# Extract the latitude of row 16 as a scalar: first element of the one-row slice's column.
earthquakes_df.loc[16:16]['latitude'].values[0]

41.758

In [12]:
# Coordinates of the two test quakes (index labels 16 and 18), read directly as scalars.
first_lat = earthquakes_df.loc[16, 'latitude']
first_lon = earthquakes_df.loc[16, 'longitude']
second_lat = earthquakes_df.loc[18, 'latitude']
second_lon = earthquakes_df.loc[18, 'longitude']

In [13]:
# Check the scalar type that will be passed to the distance function.
type(first_lat)

numpy.float64

In [14]:
def distances_vectorized(first_lat: float, first_lon: float, second_lat: float, second_lon: float):
    """
    Haversine and Manhattan distances, in km, between two points given in decimal degrees.

    Only NumPy ufuncs and arithmetic are used, so the arguments may be scalars
    or array-likes (e.g. pandas Series) and the results follow the same shape.

    Returns a dict with the keys ``haversine_in_km`` and ``manhattan_in_km``.
    """
    radius_km = 6371  # Earth radius, converts an angle in radians to kilometres

    phi_1, phi_2 = np.radians(first_lat), np.radians(second_lat)
    lam_1, lam_2 = np.radians(first_lon), np.radians(second_lon)

    delta_phi = phi_2 - phi_1
    delta_lam = lam_2 - lam_1

    # Haversine formula: central angle of the great-circle arc between the points.
    hav_lat = np.sin(delta_phi / 2.0) ** 2
    hav_lon = np.sin(delta_lam / 2.0) ** 2
    hav = hav_lat + np.cos(phi_1) * np.cos(phi_2) * hav_lon
    central_angle = 2 * np.arcsin(np.sqrt(hav))

    # "Manhattan" distance: sum of the absolute angular differences along each axis.
    angular_l1 = np.abs(delta_lam) + np.abs(delta_phi)

    return {
        "haversine_in_km": central_angle * radius_km,
        "manhattan_in_km": angular_l1 * radius_km,
    }

In [15]:
# Run the distance function on the two test quakes; keyword arguments make the
# latitude/longitude ordering explicit.
distances_check = distances_vectorized(
    first_lat=first_lat,
    first_lon=first_lon,
    second_lat=second_lat,
    second_lon=second_lon,
)

In [16]:
# Display both distances (in km) for the test pair.
distances_check

{'haversine_in_km': 8725.08781194689, 'manhattan_in_km': 16460.518575973965}

## Adding new columns

In [107]:
# Open the project SQLite database (`connect` is imported in the top import cell).
# `c` is the shared cursor that the moon-lookup helpers below read from.
conn = connect("../datasets/database.db")
c = conn.cursor()

In [108]:
# Load every row of moons_table into a DataFrame.
moons_df = pd.read_sql('SELECT * FROM moons_table', conn)

In [109]:
# Display the moon table (pandas truncates the middle rows).
moons_df

Unnamed: 0,moon_ID,time
0,0,356
1,1,713
2,2,1071
3,3,1428
4,4,1785
...,...,...
3730,3730,1322140
3731,3731,1322494
3732,3732,1322848
3733,3733,1323203


In [111]:
# Load every row of quake_table into a DataFrame and display it.
quake_df = pd.read_sql('SELECT * FROM quake_table', conn)
quake_df

Unnamed: 0,quake_ID,time,latitude,longitude,depth,mag
0,0,6756,57.090000,-153.480000,,7.86
1,16,37306,41.758000,23.249000,15.00,7.02
2,17,37306,41.802000,23.108000,15.00,6.84
3,18,39285,52.763000,160.277000,30.00,7.70
4,19,39279,51.424000,161.638000,15.00,7.50
...,...,...,...,...,...,...
24255,96941,1081579,40.377500,-125.562833,10.00,5.58
24256,97003,1081354,40.196000,-121.099833,6.06,5.16
24257,97006,1081343,40.204167,-121.109500,5.85,5.48
24258,97073,1081579,40.377500,-125.562833,10.00,5.58


In [112]:
def last_moon(quake_time):
    """
    Return the time of the most recent moon strictly before `quake_time`.

    The lookup runs against `moons_table` through the notebook-level cursor `c`.

    Parameters
    ----------
    quake_time : number (Python or NumPy scalar)
        Quake timestamp, compared directly against `moons_table.time`.

    Returns
    -------
    The `time` value of the latest row with `time < quake_time`.

    Raises
    ------
    ValueError
        If no row in `moons_table` precedes `quake_time`.
    """
    # sqlite3 cannot bind NumPy scalars, so unwrap them to plain Python numbers.
    if hasattr(quake_time, "item"):
        quake_time = quake_time.item()

    # Bind the value as a parameter rather than formatting it into the SQL text.
    query = """
        SELECT time
        FROM moons_table
        WHERE time < ?
        ORDER BY time DESC
        LIMIT 1
      """
    c.execute(query, (quake_time,))
    row = c.fetchone()
    if row is None:
        raise ValueError(f"No moon recorded before time {quake_time}")

    return row[0]

# Spot-check with the timestamps of the first two quakes in quake_df.
last_moon(6756), last_moon(37306)

(6734, 37214)

In [113]:
def next_moon(quake_time):
    """
    Return the time of the first moon strictly after `quake_time`.

    The lookup runs against `moons_table` through the notebook-level cursor `c`.

    Parameters
    ----------
    quake_time : number (Python or NumPy scalar)
        Quake timestamp, compared directly against `moons_table.time`.

    Returns
    -------
    The `time` value of the earliest row with `time > quake_time`.

    Raises
    ------
    ValueError
        If no row in `moons_table` follows `quake_time`.
    """
    # sqlite3 cannot bind NumPy scalars, so unwrap them to plain Python numbers.
    if hasattr(quake_time, "item"):
        quake_time = quake_time.item()

    # Bind the value as a parameter rather than formatting it into the SQL text.
    query = """
        SELECT time
        FROM moons_table
        WHERE time > ?
        ORDER BY time ASC
        LIMIT 1
      """
    c.execute(query, (quake_time,))
    row = c.fetchone()
    if row is None:
        raise ValueError(f"No moon recorded after time {quake_time}")

    return row[0]

# Spot-check with the timestamps of the first two quakes in quake_df.
next_moon(6756), next_moon(37306)

(7087, 37567)

In [114]:
# Gap from each quake to the surrounding moons, computed in one vectorised pass.
# A sorted search over the moon times replaces two SQL round-trips per quake and
# keeps the same strict "before" / "after" semantics as last_moon / next_moon.
moon_times = np.sort(moons_df['time'].to_numpy())
quake_times = quake_df['time'].to_numpy()

# side='left'  -> number of moons strictly before each quake
# side='right' -> position of the first moon strictly after each quake
n_before = np.searchsorted(moon_times, quake_times, side='left')
first_after = np.searchsorted(moon_times, quake_times, side='right')

# Without this guard, index -1 would silently wrap around to the last moon.
if (n_before == 0).any() or (first_after == len(moon_times)).any():
    raise ValueError("Some quakes fall outside the time span covered by the moon table")

quake_df['time_since_last_moon'] = quake_times - moon_times[n_before - 1]
quake_df['time_to_next_moon'] = moon_times[first_after] - quake_times

In [122]:
# Check dtypes and non-null counts after adding the new columns.
quake_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24260 entries, 0 to 24259
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   quake_ID              24260 non-null  int64  
 1   time                  24260 non-null  int64  
 2   latitude              24260 non-null  float64
 3   longitude             24260 non-null  float64
 4   depth                 24253 non-null  float64
 5   mag                   24260 non-null  float64
 6   time_since_last_moon  24260 non-null  int64  
 7   time_to_next_moon     24260 non-null  int64  
dtypes: float64(4), int64(4)
memory usage: 1.5 MB


In [128]:
# Time gap to whichever moon is closer: the row-wise minimum of the two gaps.
# DataFrame.min(axis=1) is vectorised, so no per-row Python lambda is needed.
quake_df['time_to_nearest_moon'] = quake_df[["time_since_last_moon", "time_to_next_moon"]].min(axis=1)

In [129]:
# Display the quake table with the three moon-gap columns added.
quake_df

Unnamed: 0,quake_ID,time,latitude,longitude,depth,mag,time_since_last_moon,time_to_next_moon,time_to_nearest_moon
0,0,6756,57.090000,-153.480000,,7.86,22,331,22
1,16,37306,41.758000,23.249000,15.00,7.02,92,261,92
2,17,37306,41.802000,23.108000,15.00,6.84,92,261,92
3,18,39285,52.763000,160.277000,30.00,7.70,305,48,48
4,19,39279,51.424000,161.638000,15.00,7.50,299,54,54
...,...,...,...,...,...,...,...,...,...
24255,96941,1081579,40.377500,-125.562833,10.00,5.58,30,323,30
24256,97003,1081354,40.196000,-121.099833,6.06,5.16,158,195,158
24257,97006,1081343,40.204167,-121.109500,5.85,5.48,147,206,147
24258,97073,1081579,40.377500,-125.562833,10.00,5.58,30,323,30


Unnamed: 0,quake_ID,time,latitude,longitude,depth,mag,time_since_last_moon,time_to_next_moon,time_to_nearest_moon
0,0,6756,57.09,-153.48,,7.86,22,331,22
91,113,55176,32.9,-115.5,,6.28,233,119,119
199,223,67634,34.2,-117.1,,5.3,308,45,45
304,344,90880,33.7,-117.4,,5.3,149,207,149
590,646,135629,32.8,-115.5,,5.5,255,96,96
591,647,135628,32.8,-115.5,,5.5,254,97,97
683,749,147339,34.9,-118.9,,5.96,283,72,72


In [None]:
def last_quakes(quake_time):
    """
    Return the most recent `moons_table` row strictly before `quake_time` as a dict.

    The dict maps each column name of the result set to its value in that row.
    The lookup runs through the notebook-level cursor `c`.

    NOTE(review): despite its name this function queries `moons_table`, not the
    quake table. The previous body returned an undefined name
    (`dict_last_quakes`) and so always raised NameError; returning the matched
    row as a dict is the assumed intent. TODO: confirm.

    Parameters
    ----------
    quake_time : number (Python or NumPy scalar)
        Timestamp compared directly against `moons_table.time`.

    Raises
    ------
    ValueError
        If no row in `moons_table` precedes `quake_time`.
    """
    # sqlite3 cannot bind NumPy scalars, so unwrap them to plain Python numbers.
    if hasattr(quake_time, "item"):
        quake_time = quake_time.item()

    # Bind the value as a parameter rather than formatting it into the SQL text.
    query = """
        SELECT *
        FROM moons_table
        WHERE time < ?
        ORDER BY time DESC
        LIMIT 1
      """
    c.execute(query, (quake_time,))
    row = c.fetchone()
    if row is None:
        raise ValueError(f"No moon recorded before time {quake_time}")

    # cursor.description holds one tuple per result column; item 0 is the column name.
    column_names = [column[0] for column in c.description]
    dict_last_quakes = dict(zip(column_names, row))

    return dict_last_quakes



In [43]:
# Timestamp of the quake at position 100, used as a sample value.
quake_df['time'].iloc[100]

55734

In [34]:
# Moons occurring at or before time 6756 (the timestamp of the first quake in quake_df).
mask = moons_df['time'].le(6756)
moons_df.loc[mask]

Unnamed: 0,moon_ID,time
0,0,356
1,1,713
2,2,1071
3,3,1428
4,4,1785
5,5,2142
6,6,2498
7,7,2853
8,8,3209
9,9,3563


Unnamed: 0,quake_ID,time,latitude,longitude,depth,mag,time_since_last_moon,time_to_next_moon
0,0,6756,57.090000,-153.480000,,7.86,22,331
1,16,37306,41.758000,23.249000,15.00,7.02,92,261
2,17,37306,41.802000,23.108000,15.00,6.84,92,261
3,18,39285,52.763000,160.277000,30.00,7.70,305,48
4,19,39279,51.424000,161.638000,15.00,7.50,299,54
...,...,...,...,...,...,...,...,...
24255,96941,1081579,40.377500,-125.562833,10.00,5.58,30,323
24256,97003,1081354,40.196000,-121.099833,6.06,5.16,158,195
24257,97006,1081343,40.204167,-121.109500,5.85,5.48,147,206
24258,97073,1081579,40.377500,-125.562833,10.00,5.58,30,323


-------
Things noticed – TODOs
 - Quake depth is missing for 7 quakes (the rows with NaN depth are listed below)

In [138]:
# Rows with no recorded depth; isna() already yields the boolean mask.
mask = quake_df['depth'].isna()
quake_df.loc[mask]

Unnamed: 0,quake_ID,time,latitude,longitude,depth,mag,time_since_last_moon,time_to_next_moon,time_to_nearest_moon
0,0,6756,57.09,-153.48,,7.86,22,331,22
91,113,55176,32.9,-115.5,,6.28,233,119,119
199,223,67634,34.2,-117.1,,5.3,308,45,45
304,344,90880,33.7,-117.4,,5.3,149,207,149
590,646,135629,32.8,-115.5,,5.5,255,96,96
591,647,135628,32.8,-115.5,,5.5,254,97,97
683,749,147339,34.9,-118.9,,5.96,283,72,72


## Functions from past challenges

Kept commented out for reference; these may be reused when we transform the DataFrame.

In [None]:
# def transform_lonlat_features(X: pd.DataFrame) -> pd.DataFrame:
#     lonlat_features = ["pickup_latitude", "pickup_longitude", "dropoff_latitude", "dropoff_longitude"]
#     assert isinstance(X, pd.DataFrame)
#     res = distances_vectorized(X, *lonlat_features)

#     return pd.DataFrame(res)

In [None]:
# def distances_vectorized(df: pd.DataFrame, start_lat: str, start_lon: str, end_lat: str, end_lon: str) -> dict:
#     """
#     Calculate the haversine and Manhattan distances between two points (specified in decimal degrees).
#     Vectorized version for pandas df
#     Computes distance in Km
#     """
#     earth_radius = 6371

#     lat_1_rad, lon_1_rad = np.radians(df[start_lat]), np.radians(df[start_lon])
#     lat_2_rad, lon_2_rad = np.radians(df[end_lat]), np.radians(df[end_lon])

#     dlon_rad = lon_2_rad - lon_1_rad
#     dlat_rad = lat_2_rad - lat_1_rad

#     manhattan_rad = np.abs(dlon_rad) + np.abs(dlat_rad)
#     manhattan_km = manhattan_rad * earth_radius

#     a = (np.sin(dlat_rad / 2.0)**2 + np.cos(lat_1_rad) * np.cos(lat_2_rad) * np.sin(dlon_rad / 2.0)**2)
#     haversine_rad = 2 * np.arcsin(np.sqrt(a))
#     haversine_km = haversine_rad * earth_radius

#     return dict(
#         haversine = haversine_km,
#         manhattan = manhattan_km
#     )