# SOLAR ECLIPSES

In [1]:
import pandas as pd
import numpy as np
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

In [2]:
#open a chrome web browser
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service = service)

In [3]:
#determine the url to be scrapped
url = 'https://eclipse.gsfc.nasa.gov/solar.html'
driver.get(url)

In [4]:
#inspect the element 1928 in the table in the bottom of the website and get the xpath and click on it
driver.find_element(By.XPATH, '//*[@id="singlecolumn"]/table[3]/tbody/tr[2]/td[3]/a').click()

In [5]:
#list of twenty year table links
years = [1921, 1931, 1941, 1951, 1961, 1971, 1981, 1991, 2001, 2011, 2021]

#create an empty list to store the tables for each month of each year
table = []

#loop all the years and months to collect the tables
for year in years:
    # Manipulate the URL to get the desired year and month for each table
    url = f'https://eclipse.gsfc.nasa.gov/SEdecade/SEdecade{year}.html'
    try:
        #read the html table from the URL and treat the columns
        df_ecl_s = pd.read_html(url, header= [0])[0]
        df_ecl_s.columns = df_ecl_s.iloc[0]
        df_ecl_s = df_ecl_s[1:]
        df_ecl_s = df_ecl_s.drop(index = 1)
        df_ecl_s = df_ecl_s.drop(['TD of Greatest Eclipse', 'Saros Series', 'Eclipse Magnitude', 'Central Duration',
                                 'Geographic Region of Eclipse Visibility'], axis = 1)
        df_ecl_s = df_ecl_s.rename(columns = {'Calendar Date': 'Date', 'Eclipse Type': 'Solar Eclipse Intensity Type'})
        df_ecl_s = df_ecl_s.dropna(axis=1, how = 'all')
        df_ecl_s = df_ecl_s[df_ecl_s['Date'] >= '1928-01-01']
        
        table.append(df_ecl_s)
    
        df_ecl_s['Date'] = pd.to_datetime(df_ecl_s['Date'], format='%Y %b %d')
        
    except Exception as e:
        print(f"Error: Unable to read {url} erro: {e}")

#concatenate the table
df_concat_ecl_s = pd.concat(table, ignore_index = True)    

In [6]:
df_concat_ecl_s

Unnamed: 0,Date,Solar Eclipse Intensity Type
0,1929-05-09,Total
1,1929-11-01,Annular
2,1930-04-28,Hybrid
3,1930-10-21,Total
4,1931-04-18,Partial
...,...,...
223,2029-06-12,Partial
224,2029-07-11,Partial
225,2029-12-05,Partial
226,2030-06-01,Annular


In [7]:
date_nan = df_concat_ecl_s['Date'].isna().any()
ecl_s_nan = df_concat_ecl_s['Solar Eclipse Intensity Type'].isna().any()
print(f"Date column has NaN: {date_nan}\nSolar Eclipse Intensity Type column has NaN: {ecl_s_nan}")

#check if there are numeric digits in the columns
ecl_s_digits = df_concat_ecl_s['Solar Eclipse Intensity Type'].str.contains(r'\d').any()
print(f"Solar Eclipse Intensity Type column has numeric digits: {ecl_s_digits}")

#check if there are spcial characters in the columns
ecl_s_special = df_concat_ecl_s['Solar Eclipse Intensity Type'].str.contains(r'[!@#$%^&*(),.?":{}|<>]').any()
print(f"Solar Eclipse Intensity Type column has special characters: {ecl_s_special}")

Date column has NaN: False
Solar Eclipse Intensity Type column has NaN: False
Solar Eclipse Intensity Type column has numeric digits: False
Solar Eclipse Intensity Type column has special characters: False


In [8]:
df_concat_ecl_s.to_csv(r'C:\Users\PC\Desktop\Ironhack\WR_Ironhack_Projects\Stock&Moon\solar_eclipses.csv', index = False)