# LUNAR ECLIPSES

In [1]:
import pandas as pd
import numpy as np
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

In [2]:
#open a chrome web browser
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service = service)

In [3]:
#determine the url to be scrapped
url = 'https://eclipse.gsfc.nasa.gov/lunar.html'
driver.get(url)

In [4]:
#inspect the element 1928 in the table in the bottom of the website and get the xpath and click on it
driver.find_element(By.XPATH, '//*[@id="singlecolumn"]/table[2]/tbody/tr[2]/td[3]/a').click()

In [5]:
#years = list(1921, 1931, 1941, 1951, 1961, 1971, 1981, 1991, 2001, 2011, 2021)
years = [1921, 1931, 1941, 1951, 1961, 1971, 1981, 1991, 2001, 2011, 2021]

#create an empty list to store the tables for each month of each year
table = []

#loop all the years and months to collect the tables
for year in years:
    # Manipulate the URL to get the desired year and month for each table
    url = f'https://eclipse.gsfc.nasa.gov/LEdecade/LEdecade{year}.html'
    try:
        #read the html table from the URL and treat the columns
        df_ecl = pd.read_html(url, header= [0])[0]
        df_ecl.columns = df_ecl.iloc[0]
        df_ecl = df_ecl[1:]
        df_ecl = df_ecl.drop(['TD of Greatest Eclipse', 'Saros Series', 'Umbral Magnitude', 'Eclipse Duration',
                            'Geographic Region of Eclipse Visibility'], axis = 1)
        df_ecl = df_ecl.rename(columns = {'Calendar Date': 'Date', 'Eclipse Type': 'Lunar Eclipse Intensity Type'})
        df_ecl = df_ecl.dropna(axis=1, how='all')
        df_ecl = df_ecl[df_ecl['Date'] >= '1928-01-01']
        
        table.append(df_ecl)
    
        df_ecl['Date'] = pd.to_datetime(df_ecl['Date'], format='%Y %b %d')
        
    except Exception as e:
        print(f"Error: Unable to read {url} erro: {e}")

#concatenate the table
df_concat_ecl = pd.concat(table, ignore_index = True)        

In [6]:
df_concat_ecl

Unnamed: 0,Date,Lunar Eclipse Intensity Type
0,1929-05-23,Penumbral
1,1929-11-17,Penumbral
2,1930-04-13,Partial
3,1930-10-07,Partial
4,1931-04-02,Total
...,...,...
228,2028-12-31,Total
229,2029-06-26,Total
230,2029-12-20,Total
231,2030-06-15,Partial


In [7]:
date_nan = df_concat_ecl['Date'].isna().any()
ecl_nan = df_concat_ecl['Lunar Eclipse Intensity Type'].isna().any()
print(f"Date column has NaN: {date_nan}\nLunar Eclipse Intensity Type column has NaN: {ecl_nan}")

#check if there are numeric digits in the columns
ecl_digits = df_concat_ecl['Lunar Eclipse Intensity Type'].str.contains(r'\d').any()
print(f"Lunar Eclipse Intensity Type column has numeric digits: {ecl_digits}")

#check if there are spcial characters in the columns
ecl_special = df_concat_ecl['Lunar Eclipse Intensity Type'].str.contains(r'[!@#$%^&*(),.?":{}|<>]').any()
print(f"Lunar Eclipse Intensity Type column has special characters: {ecl_special}")

Date column has NaN: False
Lunar Eclipse Intensity Type column has NaN: False
Lunar Eclipse Intensity Type column has numeric digits: False
Lunar Eclipse Intensity Type column has special characters: False


In [8]:
df_concat_ecl.to_csv(r'C:\Users\PC\Desktop\Ironhack\WR_Ironhack_Projects\Stock&Moon\lunar_eclipses.csv', index = False)