# Moon Phases

In [None]:
import pandas as pd
import numpy as np
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

In [None]:
#start a Chrome browser session; the path returned by ChromeDriverManager().install()
#is handed to the selenium Service as the chromedriver executable
chromedriver_path = ChromeDriverManager().install()
service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(service=service)

In [None]:
#set the url to be scraped (the moon phases calendar page for January 2023)
#and load it in the browser session
url = 'https://mooncalendar.astro-seek.com/moon-phases-calendar-january-2023'
driver.get(url)

In [None]:
#click the 1928 entry of the table at the bottom of the website; its xpath was obtained by inspecting that element
#**note that this step is not automated: the user must copy the xpath from the browser and paste it here every time the code is run
year_link_xpath = '/html/body/div[2]/div[1]/div[4]/div[1]/div[5]/div[7]/div[2]/div/div[2]/a[29]'
driver.find_element(By.XPATH, year_link_xpath).click()

In [None]:
#find the table
#table_moon = driver.find_element('xpath', '//*[@id="tab1"]/div/table')

In [None]:
#month names, written the way they appear in the calendar urls
months_url = pd.Series(['january','february','march','april','may','june','july','august','september','october','november','december'],name = 'month')

#create a series with the range of years that will be extracted
years_url = pd.Series(list(range(1928,2024)), name = 'year')

#cross join both series: one row per (month, year) combination
df = pd.merge(months_url, years_url, how = 'cross')

#manipulate the url to get the desired year and month from each table
lista_url = ('https://mooncalendar.astro-seek.com/moon-phases-calendar-' + df['month']+'-'+df['year'].astype('str')).to_list()

#empty list to store the cleaned table of each page
lista_df =[]

#columns of the scraped table that are not kept in the final dataset
drop_columns = ['Date.1', 'Moon Phase(Lunar Phase)', 'Organs influenced by the Moon Sign ------Surgery', 'Organs influenced by the Moon Sign ------Surgery.1']

#loop to run the same treatment for all collected tables
for url in lista_url:
    #only the first table found on the page is used
    df_moon = pd.read_html(url, header = [0])[0]
    #every url ends with its four-digit year; it is appended to the 'Date.1' text so the date can be parsed
    df_moon['Date'] = pd.to_datetime(df_moon['Date.1'] + ' ' + url[-4:])
    df_moon = df_moon.drop(drop_columns, axis = 1)
    df_moon = df_moon.rename(columns = {'Moon Phase(Lunar Phase).1': 'Moon Phase'})
    #regex = True is passed explicitly: since pandas 2.0 str.replace treats the pattern as a literal string by default,
    #which would leave these columns uncleaned
    #remove the ' at ...' suffix from the phase text
    df_moon['Moon Phase'] = df_moon['Moon Phase'].str.replace(r'\s*at.*', '', regex = True)
    #discard everything before the first digit, then remove the digits and colons themselves
    df_moon['Moon Sign'] = df_moon['Moon Sign'].str.replace(r'^.*?(\d.*)', r'\1', regex = True)
    df_moon['Moon Sign'] = df_moon['Moon Sign'].str.replace(r'[\d:]+', '', regex = True)
    df_moon[['Moon Phase', 'Moon Sign']] = df_moon[['Moon Phase', 'Moon Sign']].apply(lambda x: x.str.lower())
    df_moon = df_moon.sort_values(by = 'Date', ignore_index = True)

    lista_df.append(df_moon)

#concat the list of dataframes into a single dataframe
df_concat = pd.concat(lista_df, ignore_index = True)

#organize the df by Date column (sort df_concat: df_concat_moon does not exist before this line)
df_concat_moon = df_concat.sort_values(by = 'Date', ignore_index = True)



In [None]:
#preview the first 50 rows of the combined, date-sorted table
df_concat_moon.head(50)

In [None]:
#check if there are NaN in the columns
date_nan = df_concat_moon['Date'].isna().any()
mp_nan = df_concat_moon['Moon Phase'].isna().any()
#the Moon Sign check must read the 'Moon Sign' column, not 'Moon Phase'
ms_nan = df_concat_moon['Moon Sign'].isna().any()
print(f"Date column has NaN: {date_nan}\nMoon Phase column has NaN: {mp_nan}\nMoon Sign column has NaN: {ms_nan}")

#check if there are numeric digits in the columns
mp_digits = df_concat_moon['Moon Phase'].str.contains(r'\d').any()
ms_digits = df_concat_moon['Moon Sign'].str.contains(r'\d').any()
print(f"Moon Phase column has numeric digits: {mp_digits}\nMoon Sign column has numeric digits: {ms_digits}")

#check if there are special characters in the columns
mp_special = df_concat_moon['Moon Phase'].str.contains(r'[!@#$%^&*(),.?":{}|<>]').any()
ms_special = df_concat_moon['Moon Sign'].str.contains(r'[!@#$%^&*(),.?":{}|<>]').any()
print(f"Moon Phase column has special characters: {mp_special}\nMoon Sign column has special characters: {ms_special}")

In [None]:
#write the combined table to disk as csv, leaving the row index out of the file
df_concat_moon.to_csv(
    path_or_buf=r'C:\Users\PC\Desktop\Ironhack\WR_Ironhack_Projects\Stock&Moon\moon_phases.csv',
    index=False,
)

In [None]:
#reload the saved csv; the with-block closes the file handle as soon as pandas has read it
with open(r'C:\Users\PC\Desktop\Ironhack\WR_Ironhack_Projects\Stock&Moon\moon_phases.csv') as file:
    df_moon = pd.read_csv(file)

In [None]:
# walk the frame in steps of seven rows and, at each step, remove the (up to) two rows
# located right after the current group of seven. The step positions come from the length
# measured before the loop starts, while each slice is taken on the already shortened frame.
for row in range(0, len(df_moon), 7):
    rows_after_group = df_moon.index[row + 7:row + 9]
    df_moon = df_moon.drop(index=rows_after_group, errors='ignore')

# renumber the remaining rows from 0
df_moon = df_moon.reset_index(drop=True)