In [None]:
import requests, re, time, os, json, pickle, shutil, pdfplumber, zipfile, math
from bs4 import BeautifulSoup
import pandas as pd
import seaborn as sns

In [None]:
start = 2005

# Site root prepended to the hrefs found on the listing pages.
b_url = "https://www.ipma.pt"

item_list = []

for year in range(start, 2025):
    print(year)
    url = f"https://www.ipma.pt/pt/publicacoes/boletins.jsp?cmbDep=sis&cmbTema=bsi&cmbAno={year}&idDep=sis&idTema=bsi&curAno={year}"
    response = requests.get(url, timeout=30)
    stat = response.status_code

    time.sleep(1)  # pause between consecutive listing requests

    if not response.ok:
        # Do not parse an error page as if it were a listing.
        print(f"  skipping {year}: HTTP {stat}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # The previous call was find_all("td", "class"=="apli_sat_img"): that comparison
    # evaluates to False, so no class filter was ever passed. Every <td> is scanned
    # here on purpose to keep the set of links that the notebook has been using.
    # NOTE(review): if the class filter is wanted, use class_="apli_sat_img" after
    # confirming it against the page markup.
    for cell in soup.find_all("td"):
        link = cell.find("a")
        if link is None or not link.get("href"):
            continue
        # Each link is appended exactly once (the old nested loop re-appended all
        # links gathered so far on every table cell).
        item_list.append(b_url + link["href"])

# Keep only the links containing "bsi_mm_pm"; dict.fromkeys drops repeats while
# preserving first-seen order.
item_list = list(dict.fromkeys(item for item in item_list if "bsi_mm_pm" in item))

In [None]:
# Peek at the first link: take its last "/"-separated segment and cut it at the first "."
item_list[0].rsplit('/', 1)[-1].partition(".")[0]

In [None]:
## download to /data

def download_file(url):
    """Download `url` into the `data` folder two levels above the working directory.

    The file is saved under the last "/"-separated segment of `url`; the
    `data` folder is created when missing.

    Parameters
    ----------
    url : str
        Address of the file to fetch.

    Returns
    -------
    str
        The bare file name used for the saved file (not its full path).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Nothing is written in
        that case, so an error page is never stored as if it were the file.
    """
    local_filename = url.split('/')[-1]
    root_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    save_path = os.path.join(root_dir, 'data', local_filename)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(save_path, 'wb') as f:
            # iter_content yields the body with any transfer encoding already
            # decoded, unlike copying the raw socket stream.
            for chunk in r.iter_content(chunk_size=64 * 1024):
                f.write(chunk)

    return local_filename

# Fetch every link collected in item_list
for archive_url in item_list:
    download_file(archive_url)

In [None]:
# Download folder: <two levels above the working directory>/data
current_dir = os.getcwd()
root_dir = os.path.abspath(os.path.join(current_dir, '..', '..'))
directory_path = os.path.join(root_dir, 'data')

# Full path of every entry in that folder whose name ends in ".zip" (any letter case)
zip_files = []
for entry in os.listdir(directory_path):
    if entry.lower().endswith('.zip'):
        zip_files.append(os.path.join(directory_path, entry))

def unzip_pdfs(zip_files, extract_to_folder):
    """Extract every PDF member of the given zip archives into one flat folder.

    Members are written directly into `extract_to_folder` under their base
    name, without the folder structure stored inside the archive, so that a
    plain directory listing of `extract_to_folder` finds all of them.

    Parameters
    ----------
    zip_files : iterable of str
        Paths of the archives to read.
    extract_to_folder : str
        Destination folder; created when missing.

    Notes
    -----
    Files that are not valid zip archives are reported and skipped instead of
    aborting the whole run. Two members with the same base name overwrite each
    other (last one wins).
    """
    os.makedirs(extract_to_folder, exist_ok=True)
    for zip_file in zip_files:
        try:
            archive = zipfile.ZipFile(zip_file, 'r')
        except zipfile.BadZipFile:
            print(f"Skipping unreadable archive: {zip_file}")
            continue

        with archive as zip_ref:
            for member in zip_ref.infolist():
                if member.is_dir() or not member.filename.lower().endswith('.pdf'):
                    continue
                # Archive member names use "/" as separator; keep the last part only.
                flat_name = member.filename.rsplit('/', 1)[-1]
                target_path = os.path.join(extract_to_folder, flat_name)
                with zip_ref.open(member) as source, open(target_path, 'wb') as target:
                    shutil.copyfileobj(source, target)

# Extract the PDFs of every archive in zip_files into a 'pdfs' subfolder of directory_path
extract_to_folder = os.path.join(directory_path, 'pdfs')  # Folder to save the extracted PDFs
unzip_pdfs(zip_files, extract_to_folder)

In [None]:
# Specify the pdfs folder
current_dir = os.getcwd()
root_dir = os.path.abspath(os.path.join(current_dir, '..', '..'))
directory_path = os.path.join(root_dir, 'data', 'pdfs')

# Only regular files ending in ".pdf" are parsed; sorting makes the row order of
# `table` independent of the order in which the OS lists the folder.
file_names = sorted(
    file for file in os.listdir(directory_path)
    if file.lower().endswith('.pdf') and os.path.isfile(os.path.join(directory_path, file))
)

# A table row starts with "dd-mm-yyyy hh:mm:ss?d" followed by two space-separated
# numeric fields ("?" is any character, as in the pattern below).
# Compiled once instead of once per file.
pattern = r"\d\d-\d\d-\d\d\d\d \d\d:\d\d:\d\d.\d \d\d.\d\d\d.\S \d\d.\d\d\d"
row_regex = re.compile(pattern)

table = []
for file in file_names:
    pdf_file_path = os.path.join(directory_path, file)
    print(pdf_file_path)

    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            # `or ""` keeps the split safe when a page yields no text at all.
            text = page.extract_text() or ""
            table.extend(line for line in text.split("\n") if row_regex.match(line))
        

In [None]:
# Break every captured row on commas (one list per row)
table2 = [raw_row.split(",") for raw_row in table]

In [None]:
# Split the first comma-piece of every row into its space-separated fields.
# (list(j) was used before, which breaks the string into single characters
# rather than fields.)
table3 = [parts[0].replace(" ", ",").split(",") for parts in table2]

In [None]:
# Turn the first piece of every row into a list of its space-separated tokens
converted_data = [item[0].replace(" ", ",").split(",") for item in table2]

# For rows holding a standalone "ml" token: move the row's last token to the
# slot right after "ml" (the row list is modified in place) and store the row
# re-joined with single spaces. Rows without that token are left out.
processed_data = []

for tokens in converted_data:
    if 'ml' not in tokens:
        continue
    anchor = tokens.index('ml')
    tokens.insert(anchor + 1, tokens.pop())
    processed_data.append(' '.join(tokens))



In [None]:
# Show a small sample only; displaying the whole list floods the cell output.
display(processed_data[:5])

In [None]:
# Wrap the split rows in a frame with a single column named "data" and preview it
df = pd.DataFrame(data=table2, columns=["data"])
df.head()

In [None]:
# Back to a plain Python list of the "data" values
l1 = df["data"].tolist()

In [None]:
# Comma-split every string of l1
l2 = [text.split(",") for text in l1]

In [None]:
# Glue " ml" onto the preceding token ("x ml" -> "xml"), then split on whitespace
l3 = [pieces[0].replace(" ml", "ml").split() for pieces in l2]

In [None]:
# For every row keep what follows the first token containing "ml", re-joined
# with single spaces; rows without such a token contribute a single space.
merged_results = []

for tokens in l3:
    ml_position = next(
        (position for position, token in enumerate(tokens) if "ml" in token),
        None,
    )
    if ml_position is None:
        merged_results.append(" ")
    else:
        merged_results.append(' '.join(tokens[ml_position + 1:]))

In [None]:
# Size check: exactly one entry is appended per row of l3
len(merged_results)

In [None]:
# First "digit.digit" occurrence in the string (group 1) and everything that
# follows it up to the end of the string (group 2)
pattern = r'(\d\.\d)(.*)$'

# One value per entry of merged_results; a single space marks "no match"
magnitudes, parameters = [], []

for entry in merged_results:
    found = re.search(pattern, entry)
    if found is None:
        number, trailing = " ", " "
    else:
        number, trailing = found.group(1), found.group(2).strip()
    magnitudes.append(number)
    parameters.append(trailing)



In [None]:
# Inspect the tokens of the first row
l3[0]

In [None]:
# Keep at most the six leading tokens of every row
l3_first_six_columns = [tokens[:6] for tokens in l3]

In [None]:
# Name the six leading tokens and load them into a frame
event_columns = ["date", "time", "lat", "lon", "depth", "mag"]
l3_df = pd.DataFrame(data=l3_first_six_columns, columns=event_columns)
display(l3_df.shape)
l3_df.head()

In [None]:
# Single-column frame ("Rms") built from the values captured in `magnitudes`
rms_df = pd.DataFrame(data=magnitudes, columns=["Rms"])
display(rms_df.shape)
rms_df.head()


In [None]:
# Single-column frame ("Int") built from the trailing text captured in `parameters`
int_df = pd.DataFrame(data=parameters, columns=["Int"])
display(int_df.shape)
int_df.head()

In [None]:
# Place the three frames side by side
together = pd.concat([l3_df, rms_df, int_df], axis="columns")

In [None]:
# (rows, columns) of the combined frame
together.shape

In [None]:
# Parse the day-month-year strings of 'date' into timestamps
date_format = '%d-%m-%Y'
together['date'] = pd.to_datetime(together['date'], format=date_format)

# Parse 'time' into a duration
together['time'] = pd.to_timedelta(together['time'])

# 'datetime' is the parsed date plus the parsed duration
together['datetime'] = together['date'].add(together['time'])

In [None]:
# Summary statistics of the combined frame
together.describe()

In [None]:
# Preview the first five rows
together.head(5)

In [None]:
# Count the rows that exactly repeat an earlier row
together.duplicated().sum()

In [None]:
# 'date' and 'time' are dropped here; 'datetime' stays
together = together.drop(["date", "time"], axis="columns")

In [None]:
# Independent copy, so the cleaning below leaves `together` untouched
together2 = together.copy(deep=True)

In [None]:
# Strip the literal text markers before converting: "ml" from 'mag', "*" from 'depth'
together2['mag'] = together2['mag'].str.replace('ml', '', regex=False)
together2['depth'] = together2['depth'].str.replace('*', '', regex=False)

# Convert 'mag', 'Rms' and 'depth' to numbers; values that cannot be parsed become NaN
for numeric_column in ('mag', 'Rms', 'depth'):
    together2[numeric_column] = pd.to_numeric(together2[numeric_column], errors='coerce')

In [None]:
# Index the cleaned frame by its 'datetime' column
together3 = together2.set_index(keys="datetime")

In [None]:
# Column dtypes and non-null counts of the indexed frame
together3.info()

In [None]:
# 'sensed' is 1 when 'Int' holds something other than null, "" or " ", else 0
has_intensity = together3['Int'].notnull() & ~together3['Int'].isin(["", " "])
together3['sensed'] = has_intensity.astype(int)

In [None]:
# Latitude/longitude text -> signed decimal numbers
def convert_lat_lon(lat, lon):
    """Convert a latitude string and a longitude string to signed floats.

    The last two characters of each string are discarded and the rest is
    parsed as a float. The latitude is negated when its string ends in 'S',
    the longitude when its string ends in 'W'.

    Returns a `(lat_value, lon_value)` tuple.
    """
    def _signed(text, negative_suffix):
        magnitude = float(text[:-2])
        return -magnitude if text.endswith(negative_suffix) else magnitude

    return _signed(lat, 'S'), _signed(lon, 'W')

# Run the conversion row by row and store both results as new columns
def _row_to_decimal(row):
    return pd.Series(convert_lat_lon(row['lat'], row['lon']))

together3[['lat_decimal', 'lon_decimal']] = together3.apply(_row_to_decimal, axis=1)

In [None]:
def convert_lat_lon(coord, lon=None):
    """Convert coordinate text to signed decimal numbers.

    The last two characters of a coordinate string are discarded and the rest
    is parsed as a float; the value is negated when the string contains 'S'
    or 'W'.

    Parameters
    ----------
    coord : str
        Coordinate to convert (the latitude when `lon` is also given).
    lon : str, optional
        Second coordinate. Accepting it keeps two-argument calls
        `convert_lat_lon(lat, lon)` working after this definition replaces an
        earlier two-argument function of the same name.

    Returns
    -------
    float or tuple of float
        A single value when only `coord` is given, otherwise the pair
        `(converted coord, converted lon)`.
    """
    def _to_signed(text):
        c_value = float(text[:-2])  # drop the two trailing characters
        if 'S' in text or 'W' in text:
            c_value = -c_value
        return c_value

    if lon is None:
        return _to_signed(coord)
    return _to_signed(coord), _to_signed(lon)

def haversine(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Great-circle distance between two points given in decimal degrees.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.
    radius_km : float, optional
        Sphere radius; the default is the 6371.0 km used so far, so the
        result is in kilometres.

    Returns
    -------
    float
        Distance along the sphere, in the unit of `radius_km`.
    """
    lat1, lon1, lat2, lon2 = map(math.radians, (lat1, lon1, lat2, lon2))

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2

    # Rounding can leave `a` a hair outside [0, 1] for (near-)antipodal points,
    # which would make asin raise ValueError; clamp before taking the root.
    a = min(1.0, max(0.0, a))
    c = 2 * math.asin(math.sqrt(a))

    return radius_km * c

# MORF coordinates in decimal degrees
latitude = 37.304321
longitude  = -8.652672

# Sample event coordinates; nothing in this cell reads them
event_lat = 34.05
event_lon = -118.25

# The station used for the distance is MORF
station_lat, station_lon = latitude, longitude


def _distance_to_station(row):
    # Distance from the row's converted 'lat'/'lon' to the station
    return haversine(convert_lat_lon(row["lat"]), convert_lat_lon(row["lon"]), station_lat, station_lon)


together3["dist_MORF"] = together3.apply(_distance_to_station, axis=1)

In [None]:
# Output location: <two levels above the working directory>/dbs/checkpoint1.csv
current_dir = os.getcwd()
root_dir = os.path.abspath(os.path.join(current_dir, '..', '..'))
directory_path = os.path.join(root_dir, 'dbs')
file_path = os.path.join(directory_path, "checkpoint1.csv")

# Create the folder when missing, then write the frame (index included) as CSV
os.makedirs(directory_path, exist_ok=True)
together3.to_csv(path_or_buf=file_path)