In [95]:
import requests, re, time, os, json, pickle, shutil, pdfplumber
from bs4 import BeautifulSoup
import pandas as pd
import seaborn as sns

In [96]:
# Load the backup CSV; its "index" column becomes the DataFrame index.
first=pd.read_csv("backup_in_case_copy.csv", index_col="index")

In [97]:
# Drop the stray "Unnamed: 2" column.
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been removed.
first = first.drop(columns="Unnamed: 2", errors="ignore")
first.head(5)

Unnamed: 0_level_0,data
index,Unnamed: 1_level_1
0,01-01-2011 00:57:28.6 37.317°N 08.522°W 15 1.1...
1,01-01-2011 02:43:51.3 38.669°N 08.486°W 0* 0.9...
2,01-01-2011 06:50:57.8 36.538°N 07.663°W 31* 1....
3,01-01-2011 06:58:33.7 36.355°N 09.698°W 31* 1....
4,01-01-2011 07:08:21.1 37.036°N 04.850°W 0* 1.9...


In [98]:
# Raw text lines, one string per row of the "data" column.
l1 = list(first["data"])
# Each line cut at commas; only the piece before the first comma is used below.
l2 = [raw_line.split(",") for raw_line in l1]
# Glue a detached " ml" suffix back onto its number, then tokenise on whitespace.
l3 = [pieces[0].replace(" ml", "ml").split() for pieces in l2]

KeyboardInterrupt: 

In [None]:
# For every tokenised line, keep the text that follows the first token
# containing "ml"; lines without such a token contribute a single space.
merged_results = []

for tokens in l3:
    for position, token in enumerate(tokens):
        if "ml" in token:
            # Everything after the "ml" token, re-joined with single spaces
            merged_results.append(' '.join(tokens[position + 1:]))
            break
    else:
        # Loop finished without a break: no token contained "ml"
        merged_results.append(" ")

In [None]:
# First "digit.digit" value in the string (group 1) plus whatever follows it
# up to the end of the string (group 2).
pattern = r'(\d\.\d)(.*)$'

# Parallel lists, one entry per element of merged_results
magnitudes, parameters = [], []

for tail in merged_results:
    found = re.search(pattern, tail)
    if found is None:
        # No decimal value present: keep both lists aligned with placeholders
        number_text, trailing_text = " ", " "
    else:
        number_text = found.group(1)
        trailing_text = found.group(2).strip()
    magnitudes.append(number_text)
    parameters.append(trailing_text)

In [None]:
# Keep at most the first six tokens of every line (shorter lines stay shorter).
l3_first_six_columns = [row[:6] for row in l3]

In [None]:
# Build the core frame from the six leading tokens, then show its shape and a preview.
l3_df = pd.DataFrame(l3_first_six_columns, columns=["date","time", "lat","lon","depth","mag"])
display(l3_df.shape)
l3_df.head(5)

(2556354, 6)

Unnamed: 0,date,time,lat,lon,depth,mag
0,01-01-2011,00:57:28.6,37.317°N,08.522°W,15,1.1ml
1,01-01-2011,02:43:51.3,38.669°N,08.486°W,0*,0.9ml
2,01-01-2011,06:50:57.8,36.538°N,07.663°W,31*,1.5ml
3,01-01-2011,06:58:33.7,36.355°N,09.698°W,31*,1.7ml
4,01-01-2011,07:08:21.1,37.036°N,04.850°W,0*,1.9ml


In [None]:
# `magnitudes` holds the first decimal found after the "ml" token of each line;
# despite the list's name, it is stored here under the column name "Rms".
rms_df = pd.DataFrame(magnitudes, columns=["Rms"])
display(rms_df.shape)
rms_df.head(5)

(2556354, 1)

Unnamed: 0,Rms
0,0.4
1,0.3
2,0.3
3,0.3
4,0.4


In [None]:
# `parameters` holds the text that followed the captured decimal on each line;
# it is stored here under the column name "Int".
int_df = pd.DataFrame(parameters, columns=["Int"])
display(int_df.shape)
int_df.head(5)

(2556354, 1)

Unnamed: 0,Int
0,
1,
2,
3,
4,


In [None]:
# Column-wise concat. The three frames were built from per-line lists in the
# same order, so rows line up by their default positional index.
together = pd.concat([l3_df, rms_df, int_df], axis=1)

In [None]:
# Parse the day-month-year strings and the clock-time strings separately
parsed_dates = pd.to_datetime(together['date'], format='%d-%m-%Y')
parsed_offsets = pd.to_timedelta(together['time'])

# Store the parsed values back, then add the combined timestamp column
together['date'] = parsed_dates
together['time'] = parsed_offsets
together['datetime'] = parsed_dates + parsed_offsets

In [None]:
# Order rows chronologically by the combined "datetime" column.
together2 = together.sort_values(by="datetime")

In [None]:
# Remove rows that are identical in every column, then renumber the index from 0.
together2 = together2.drop_duplicates().reset_index(drop=True)

In [None]:
# "date" and "time" are now redundant: "datetime" was computed as their sum.
together2 = together2.drop(columns=["date", "time"])

In [None]:
# Strip the textual markers: the "ml" suffix on 'mag' and the "*" flag on 'depth'
for column, marker in (('mag', 'ml'), ('depth', '*')):
    together2[column] = together2[column].str.replace(marker, '', regex=False)

# Convert the three measurement columns to numbers; unparseable values become NaN
for column in ('mag', 'Rms', 'depth'):
    together2[column] = pd.to_numeric(together2[column], errors='coerce')

In [None]:
# Use the "datetime" column as the index.
together3 = together2.set_index("datetime")

In [None]:
# Summarise index range, dtypes and non-null counts after cleaning.
together3.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 36135 entries, 2005-01-01 22:42:36.500000 to 2023-10-31 23:29:24.500000
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   lat     36135 non-null  object 
 1   lon     36135 non-null  object 
 2   depth   36130 non-null  float64
 3   mag     35960 non-null  float64
 4   Rms     35573 non-null  float64
 5   Int     36135 non-null  object 
dtypes: float64(3), object(3)
memory usage: 1.9+ MB


In [None]:
# Flag rows whose "Int" entry carries real content: not null, not the empty
# string and not the single-space placeholder. Stored as 1/0.
int_values = together3['Int']
has_intensity = int_values.notnull() & ~int_values.isin(["", " "])
together3['sensed'] = has_intensity.astype(int)

In [None]:
# Show only the rows flagged sensed == 1.
together3[together3["sensed"]==1]

Unnamed: 0_level_0,lat,lon,depth,mag,Rms,Int,sensed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005-01-03 11:34:16.200,36.694°N,07.609°W,19.0,4.1,0.3,II/III,1
2005-01-11 09:29:11.200,38.545°N,08.271°W,15.0,3.8,0.3,IV,1
2005-01-12 17:40:37.700,41.709°N,07.565°W,0.0,1.8,0.3,II/III,1
2005-01-23 06:15:37.800,38.941°N,09.220°W,22.0,2.8,0.3,III/IV,1
2005-02-13 13:16:00.900,38.160°N,08.406°W,22.0,2.6,0.3,II/III,1
...,...,...,...,...,...,...,...
2023-07-27 11:18:23.100,39.630°N,08.199°W,10.0,3.0,0.4,IV,1
2023-08-05 01:37:04.700,39.803°N,08.641°W,11.0,2.7,0.4,III/IV,1
2023-08-13 21:16:46.800,38.649°N,08.213°W,3.0,2.7,0.4,IV,1
2023-08-13 21:24:33.000,38.661°N,08.213°W,4.0,2.9,0.4,IV,1


In [None]:
# Function to convert latitude and longitude to decimal
def convert_lat_lon(lat, lon):
    """Convert hemisphere-suffixed coordinate strings to signed decimal degrees.

    Parameters
    ----------
    lat : str
        Latitude such as "37.317°N"; a trailing "S" makes the value negative.
    lon : str
        Longitude such as "08.522°W"; a trailing "W" makes the value negative.

    Returns
    -------
    tuple[float, float]
        (latitude, longitude) as signed floats.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed as a float.

    Notes
    -----
    The suffix is parsed explicitly rather than by dropping the last two
    characters, so a missing degree sign ("37.317N") no longer silently loses
    a digit, and surrounding whitespace or a lowercase letter is tolerated.
    """
    def _signed_degrees(coord, negative_hemisphere):
        # Parse one coordinate; negate it only for the given hemisphere letter.
        text = coord.strip()
        hemisphere = text[-1:].upper()
        if hemisphere in ("N", "S", "E", "W"):
            text = text[:-1]
        # Whatever degree marker / padding sits between number and letter
        degrees = float(text.rstrip("°º \t"))
        return -degrees if hemisphere == negative_hemisphere else degrees

    return _signed_degrees(lat, "S"), _signed_degrees(lon, "W")

# Run the converter on every row and store the two results as new columns
def _row_to_decimal_pair(row):
    """Return the row's decimal (lat, lon) wrapped in a Series so apply expands it."""
    return pd.Series(convert_lat_lon(row['lat'], row['lon']))


together3[['lat_decimal', 'lon_decimal']] = together3.apply(_row_to_decimal_pair, axis=1)

In [None]:
# Inspect the frame with the new lat_decimal / lon_decimal columns.
together3

Unnamed: 0_level_0,lat,lon,depth,mag,Rms,Int,sensed,lat_decimal,lon_decimal
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005-01-01 22:42:36.500,35.367°N,04.122°W,5.0,2.0,0.3,,0,35.367,-4.122
2005-01-02 02:03:51.000,35.109°N,05.403°W,5.0,1.6,0.2,,0,35.109,-5.403
2005-01-03 02:08:09.400,42.994°N,09.270°W,12.0,2.1,0.3,,0,42.994,-9.270
2005-01-03 11:34:16.200,36.694°N,07.609°W,19.0,4.1,0.3,II/III,1,36.694,-7.609
2005-01-04 00:50:51.900,35.928°N,04.862°W,34.0,3.3,0.4,,0,35.928,-4.862
...,...,...,...,...,...,...,...,...,...
2023-10-31 18:59:47.400,36.657°N,11.057°W,9.0,1.7,0.4,,0,36.657,-11.057
2023-10-31 19:31:01.700,36.729°N,07.581°W,26.0,0.6,0.4,,0,36.729,-7.581
2023-10-31 19:50:13.000,36.753°N,07.509°W,22.0,0.7,0.1,,0,36.753,-7.509
2023-10-31 21:40:52.400,33.717°N,05.461°W,22.0,2.4,0.4,,0,33.717,-5.461


In [None]:
# Checkpoint writes are disabled. Uncomment to regenerate the checkpoint files;
# "checkpoint1.csv" is the file the following cell reads back.
#together3.to_csv("checkpoint1.csv")
#together3.to_pickle("checkpoint1.pkl")


In [99]:
# Reload the saved checkpoint instead of recomputing the cells above.
# NOTE(review): without index_col/parse_dates, "datetime" comes back as a plain
# object column and the index is a fresh RangeIndex — confirm that is intended.
together3= pd.read_csv("checkpoint1.csv")

In [102]:
# Check dtypes and non-null counts of the reloaded checkpoint.
together3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36135 entries, 0 to 36134
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   datetime     36135 non-null  object 
 1   lat          36135 non-null  object 
 2   lon          36135 non-null  object 
 3   depth        36130 non-null  float64
 4   mag          35960 non-null  float64
 5   Rms          35573 non-null  float64
 6   Int          876 non-null    object 
 7   sensed       36135 non-null  int64  
 8   lat_decimal  36135 non-null  float64
 9   lon_decimal  36135 non-null  float64
dtypes: float64(5), int64(1), object(4)
memory usage: 2.8+ MB


In [108]:
import math

def convert_lat_lon(coord):
    """Convert one hemisphere-suffixed coordinate string to signed decimal degrees.

    Note: this single-argument definition replaces the earlier two-argument
    ``convert_lat_lon(lat, lon)`` for every cell executed after this one.

    Parameters
    ----------
    coord : str
        A latitude or longitude such as "37.317°N" or "08.522°W".

    Returns
    -------
    float
        The value in decimal degrees, negative for the S and W hemispheres.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed as a float.
    """
    text = coord.strip()
    hemisphere = text[-1:].upper()
    if hemisphere in ("N", "S", "E", "W"):
        text = text[:-1]
    # Parse explicitly instead of slicing off two characters, so a missing
    # degree sign ("08.522W") does not silently drop the last digit.
    c_value = float(text.rstrip("°º \t"))

    # South and West are the negative hemispheres
    if hemisphere in ("S", "W"):
        c_value = -c_value

    return c_value

def haversine(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Great-circle distance between two points given in decimal degrees.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.
    radius_km : float, optional
        Sphere radius; defaults to Earth's mean radius, 6371.0 km.

    Returns
    -------
    float
        Distance along the sphere's surface, in the units of ``radius_km``.
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt/asin raise a math domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * math.asin(math.sqrt(a))

    return radius_km * c

# MORF station position
latitude, longitude = 37.304321, -8.652672

# Sample event coordinates left over from the usage example; the distance
# computation below does not read them
event_lat, event_lon = 34.05, -118.25

# Station aliases consumed by the distance computation
station_lat, station_lon = latitude, longitude


def _distance_to_station(row):
    """Haversine distance from one row's lat/lon strings to the station."""
    return haversine(
        convert_lat_lon(row["lat"]),
        convert_lat_lon(row["lon"]),
        station_lat,
        station_lon,
    )


together3["dist_MORF"] = together3.apply(_distance_to_station, axis=1)

In [109]:
# Inspect the frame with the new dist_MORF column.
together3

Unnamed: 0,datetime,lat,lon,depth,mag,Rms,Int,sensed,lat_decimal,lon_decimal,dist_MORF
0,2005-01-01 22:42:36.500,35.367°N,04.122°W,5.0,2.0,0.3,,0,35.367,-4.122,459.383777
1,2005-01-02 02:03:51.000,35.109°N,05.403°W,5.0,1.6,0.2,,0,35.109,-5.403,380.218037
2,2005-01-03 02:08:09.400,42.994°N,09.270°W,12.0,2.1,0.3,,0,42.994,-9.270,634.829841
3,2005-01-03 11:34:16.200,36.694°N,07.609°W,19.0,4.1,0.3,II/III,1,36.694,-7.609,114.871794
4,2005-01-04 00:50:51.900,35.928°N,04.862°W,34.0,3.3,0.4,,0,35.928,-4.862,371.283993
...,...,...,...,...,...,...,...,...,...,...,...
36130,2023-10-31 18:59:47.400,36.657°N,11.057°W,9.0,1.7,0.4,,0,36.657,-11.057,225.363778
36131,2023-10-31 19:31:01.700,36.729°N,07.581°W,26.0,0.6,0.4,,0,36.729,-7.581,114.653314
36132,2023-10-31 19:50:13.000,36.753°N,07.509°W,22.0,0.7,0.1,,0,36.753,-7.509,118.596364
36133,2023-10-31 21:40:52.400,33.717°N,05.461°W,22.0,2.4,0.4,,0,33.717,-5.461,492.438444


In [None]:
# Disabled: uncommenting would overwrite "checkpoint1.csv" with the current
# together3, which now also contains the dist_MORF column.
#together3.to_csv("checkpoint1.csv")