In [None]:
import os
import sys
import requests
import time
import gc
import pandas as pd
import cloudscraper

from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from dotenv import load_dotenv

sys.path.append("../")

from scripts import preprocessing

# Load variables from a .env file (when one is found) so os.getenv can read them below
load_dotenv()
# Apply matplotlib's 'Solarize_Light2' style sheet as the default plotting style
plt.style.use('Solarize_Light2')

# Default figure DPI: taken from the DPI environment variable (which dotenv may
# have populated), falling back to 100 when it is unset or not a valid integer.
def read_dpi(default: int = 100) -> int:
    """
    Return the DPI configured through the ``DPI`` environment variable.

    Args:
        default (int): Value returned when ``DPI`` is unset or cannot be parsed as an integer.

    Returns:
        int: The parsed DPI, or ``default``.
    """
    try:
        # int(None) raises TypeError (variable unset);
        # int("abc") / int("") raise ValueError (variable set but unusable).
        return int(os.getenv('DPI'))
    except (TypeError, ValueError):
        return default


pc_dpi = read_dpi()


# Purpose

The primary objectives of this project include:

- **Feasibility of Database Recreation:** Evaluating the practicality of recreating a comprehensive meteorite database by aggregating data from a single, extensive request.
- **Establishment of Initial Search Step:** Setting up the foundational step in the meteorite property search process, where one request retrieves all necessary URLs, types, names, places of fall, etc. This is aimed at reducing the overall impact on server resources compared to performing a new search request each time a user initiates a search. Instead, the user will search through a JSON file extracted from the webpage, not the page itself.
- **Reproducibility:** Ensuring the process can be automatically updated with tools like GitHub Actions or other automation tools configured similarly to CRON jobs. The goal is for this notebook to pave the way for a script that facilitates the "get-links" pipeline, making the dataset self-updating and reliable over time.
- **Handling Large Requests:** Recognizing that the request involves parsing over 4 million lines of HTML, which introduces potential for errors such as server timeouts or connection interruptions. Implementing checks is crucial, for example, verifying the end of the file for a closing body tag or closing HTML tag (`</html>`, `</body>`) to ensure the complete dataset is captured.
- **Export Format:** The data should be exported as a JSON file with fields that are straightforward to navigate and search, making the dataset accessible and usable for various applications.

This structured approach aims to minimize the load on the source server, streamline the data collection process, and ensure the sustainability and usability of the meteorite database.


In [None]:
t_zero = time.perf_counter()

scraper = cloudscraper.create_scraper()

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0",
}

# Single search URL asking for every record (sea=* with lrec=100000) on one page.
# The trailing backslashes keep the multi-line literal free of newlines.
behemoth = """https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=\
&nwas=&falls=&valids=&stype=contains&lrec=100000&map=ge&browse=&country=\
All&srt=name&categ=All&mblist=All&rect=\
&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=1"""

r = scraper.get(url=behemoth, headers=headers, timeout=10)
# Fail fast on a 4xx/5xx answer instead of silently parsing an error page.
r.raise_for_status()

# Stop the request timer before parsing so "request time" only covers the download.
t_end = time.perf_counter()

soup = BeautifulSoup(r.content, "html.parser")

t_parsed = time.perf_counter()

print(f"request time : {round(t_end - t_zero, 4)}")
print(f"parse time : {round(t_parsed - t_end, 4)}")


### Let's test that it has a closing body tag and/or closing html tag :

(Let's only look at the tail of the HTML, because it's a rather large file.)

In [None]:
# Preview only the last bytes of the payload: echoing the whole response would
# flood the cell output and bloat the saved notebook.
r.content[-300:]

In [None]:
def check_closing_tags_bytes(html_bytes: bytes, tail_size: int = 100) -> tuple:
    """
    Check the last bytes of the HTML content for </body> and </html> tags.
    This function is to be launched on the request.content bytes object.

    The match is case-insensitive and only looks at the final ``tail_size``
    bytes, so the whole document never has to be decoded.

    Args:
        html_bytes (bytes): The HTML content as a bytes object.
        tail_size (int): Number of trailing bytes to inspect. Defaults to 100.
            Increase it when the document ends with long trailing content
            (whitespace, comments, scripts) after the closing tags.

    Returns:
        tuple: A tuple containing two booleans indicating the presence of </body> and </html> tags.

    Raises:
        ValueError: If ``tail_size`` is not strictly positive.
    """
    if tail_size <= 0:
        # html_bytes[-0:] would silently select the whole payload.
        raise ValueError("tail_size must be a positive integer")

    # errors="ignore" drops undecodable bytes, e.g. a multi-byte character
    # cut in half by the slice boundary.
    tail_content = html_bytes[-tail_size:].decode("utf-8", errors="ignore").lower()

    closing_body_tag = "</body>" in tail_content
    closing_html_tag = "</html>" in tail_content

    return closing_body_tag, closing_html_tag



In [None]:
# Run the tail check on the raw response bytes and report each flag.
closing_flags = check_closing_tags_bytes(r.content)
body_tag, html_tag = closing_flags

for tag_label, tag_found in (("body", body_tag), ("html", html_tag)):
    print(f"Closing {tag_label} tag found : {tag_found}")


## Selecting the main table (we just need the main table)
- We have confirmed correct EOF (end tags)
- We just need the maintable id of the html doc
- We'll free memory by deleting the initial `soup` name (we'll keep the `r` object just in case)
- We'll force a `gc.collect()` so that any unreachable objects are reclaimed right away

In [None]:
# So we want the names, classes, and codes to get the URL
main_table = soup.find("table", id="maintable")

# soup.find returns None when nothing matches; stop here with a clear message
# rather than hitting an AttributeError in a later cell.
if main_table is None:
    raise ValueError('No <table id="maintable"> element found in the response')

# Drop the notebook-level reference to the full document, then force a collection pass.
del soup
gc.collect()


## Now to extract the information from the HTML:
- We know:
    - Each meteorite is registered as a span: `("span", class_="mname")` for soup
    - Field order: Name|Abbrev|Status|Fall|Year|Place|Type|Mass|MetBull|GoogleEarth|Notes
    - We need fields [0, 4, 5, 6, 7] (Name, Year, Place, Type, Mass)
    - Each meteorite has a code, found in the link of its name; with it we can rebuild the meteorite's URL


In [None]:
# Every meteorite name is wrapped in a <span class="mname">; count them as a sanity check.
meteorites = main_table.find_all("span", class_="mname")
print(f"There are {len(meteorites)} meteorites in the html doc")


## Let's build a dataframe out of this:

In [None]:
base_url = "https://www.lpi.usra.edu/meteor/metbull.php?"
meteorite_data = []

for row in main_table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) <= 8:
        # Rows without enough cells cannot hold the fields read below
        # (presumably header or layout rows; verify against the page).
        continue

    # Rebuild the meteorite URL from the "code" query parameter of the name link.
    # The URL stays None when there is no link, no "code=" parameter, or an empty code.
    met_url = None
    anchor = cells[0].find("a")
    if anchor and "href" in anchor.attrs:
        _, code_marker, after_code = anchor["href"].partition("code=")
        met_code = after_code.split("&", 1)[0]
        if code_marker and met_code:
            met_url = f"{base_url}code={met_code}"

    # Extracting the required fields (indices 0, 4, 5, 6 and 7 of the row)
    name = preprocessing.handle_name(cells[0].text.strip())
    year = preprocessing.handle_year(cells[4].text.strip())
    country = preprocessing.handle_country(cells[5].text.strip())
    met_type = preprocessing.handle_types(cells[6].text.strip())
    mass = preprocessing.handle_mass(cells[7].text.strip())

    # Append this meteorite's info as a dict
    meteorite_data.append({
        "name": name,
        "year": year,
        "country": country,
        "type": met_type,
        "mass": mass,
        "URL": met_url
    })

# Convert the list of dicts to a pandas DataFrame (built once, not row by row)
df = pd.DataFrame(meteorite_data)


In [None]:
# handling NA years: the nullable "Int64" dtype keeps whole-number years while
# allowing missing entries, so no separate fillna step is needed afterwards
df["year"] = df["year"].astype("Int64")

# handling NA types and countries: turn the "Unknown" placeholder into a real missing value
for column in ("type", "country"):
    df[column] = df[column].replace("Unknown", pd.NA)

## Okay we have a dataset, let's try some simple commands :

In [None]:
# Exact match on the classification string
is_iron_iie_an = df["type"] == "Iron, IIE-an"
display(df[is_iron_iie_an])


In [None]:
# Case-insensitive substring search on the name; missing names count as no match
name_has_catalina = df["name"].str.lower().str.contains("catalina", na=False)
display(df[name_has_catalina])


In [None]:
# Case-insensitive substring search on the type; missing types count as no match
type_has_l3 = df["type"].str.lower().str.contains("l3", na=False)
display(df[type_has_l3])


In [None]:
# Case-insensitive substring search on the country; missing countries count as no match
country_has_france = df["country"].str.lower().str.contains("france", na=False)
display(df[country_has_france])


In [None]:
# Per-column dtypes of the assembled dataframe
column_dtypes = df.dtypes
display(column_dtypes)
