# In [57]:
# Main notebook for the project
import requests
from bs4 import BeautifulSoup
import os
import re
import pandas as pd

def extract_station_links(url, timeout=30):
    """Collect the targets of every "View data" link on a page.

    Args:
        url: Address of the HTML page to scan.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with the ``href`` value of each ``<a>`` element whose text is
        exactly ``View data``, in document order. The list is empty when the
        request fails or the page answers with an error status; in both
        cases a message is printed instead of raising.
    """
    try:
        # Without a timeout a stalled server would block the run forever.
        # requests.Timeout is a RequestException, so it is reported below.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error during requests to {url} : {str(e)}")
        return []

    if not response.ok:
        print(f"Error accessing page: Status code {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    # ``string=`` is the current name of the deprecated ``text=`` filter;
    # ``href=True`` skips anchors that have no target, which would otherwise
    # raise KeyError on the lookup.
    return [
        link['href']
        for link in soup.find_all('a', string='View data', href=True)
    ]

def create_folder_if_not_exists(folder_name):
    """Create ``folder_name`` (and any missing parents) unless it exists.

    Args:
        folder_name: Path of the directory to ensure.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    # exist_ok=True avoids the race between checking for the folder and
    # creating it that a separate os.path.exists() test would leave open.
    os.makedirs(folder_name, exist_ok=True)


def download_file(url, folder, timeout=30):
    """Download ``url`` into ``folder``, named after the last URL path segment.

    Failures (no usable file name, network error, non-200 status) are
    reported with a printed message and the function returns without
    writing anything, so one bad link does not abort a batch of downloads.

    Args:
        url: Address of the file to fetch.
        folder: Existing directory that receives the file.
        timeout: Seconds to wait for the server before giving up.
    """
    from urllib.parse import urlsplit

    # Use only the path component so a query string or fragment does not
    # leak into the file name.
    file_name = urlsplit(url).path.rsplit('/', 1)[-1]
    if not file_name:
        print(f"Failed to download the file. No file name in URL: {url}")
        return

    # Create the full path for the file
    file_path = os.path.join(folder, file_name)

    # Download and save the file
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error during requests to {url} : {str(e)}")
        return

    if response.status_code == 200:
        with open(file_path, 'wb') as file:
            file.write(response.content)
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")

def find_data_start_row(file_path):
    """Return the zero-based index of the first line starting with three spaces.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        The index of the first line that begins with ``"   "``, or 0 when
        no line does.
    """
    with open(file_path, 'r') as handle:
        # The generator is consumed while the file is still open; the second
        # argument to next() is the fallback when no line matches.
        return next(
            (row for row, text in enumerate(handle) if text.startswith("   ")),
            0,
        )

def txt_to_csv(file_path, output_folder):
    """Convert one whitespace-delimited text file to a CSV file.

    Lines before the row reported by ``find_data_start_row`` are skipped,
    and only the first seven columns are kept. The CSV is written to
    ``output_folder`` under the input's base name with a ``.csv`` extension.

    Args:
        file_path: Path of the text file to convert.
        output_folder: Existing directory that receives the CSV.

    Returns:
        ``output_folder``, unchanged.
    """
    start_row = find_data_start_row(file_path)

    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True flag.
    df = pd.read_csv(file_path, skiprows=start_row, sep=r'\s+', usecols=[0, 1, 2, 3, 4, 5, 6])
    df.reset_index(drop=True, inplace=True)

    # Swap only the extension; str.replace would also rewrite a '.txt' that
    # appears elsewhere in the name.
    stem, _ = os.path.splitext(os.path.basename(file_path))
    output_file = os.path.join(output_folder, stem + '.csv')

    df.to_csv(output_file, index=False)

    return output_folder

def download_main_dataset_1(station_url, output_folder_txt):
    """Download every file linked as "View data" on the station page.

    Args:
        station_url: Page handed to ``extract_station_links``.
        output_folder_txt: Folder that receives the downloads; it is created
            when it does not exist yet.
    """
    # Gather the download targets from the station page first.
    station_file_urls = extract_station_links(station_url)

    # Make sure the destination exists before anything is written to it.
    create_folder_if_not_exists(output_folder_txt)

    # Fetch each linked file into the destination folder.
    for file_url in station_file_urls:
        download_file(file_url, output_folder_txt)

def convert_txts_to_csvs(input_folder, output_folder):
    """Run ``txt_to_csv`` on every ``.txt`` entry of ``input_folder``.

    Args:
        input_folder: Directory whose entries are scanned.
        output_folder: Directory that receives the CSVs; created when missing.
    """
    create_folder_if_not_exists(output_folder)

    for entry in os.listdir(input_folder):
        source_path = os.path.join(input_folder, entry)
        # Anything that is not a .txt file is left alone.
        if not source_path.endswith('.txt'):
            continue
        txt_to_csv(source_path, output_folder)

def clean_and_rename_csv_in_folder(input_folder, output_folder):
    """Write a cleaned copy of every ``.csv`` in ``input_folder``.

    For each file the first data row is dropped, the seven columns are given
    fixed names, and every ``'---'`` value is replaced with ``pd.NA``. The
    result is saved under the same file name in ``output_folder``.

    Args:
        input_folder: Directory holding the CSVs to clean.
        output_folder: Directory that receives the cleaned CSVs; created
            when missing.
    """
    column_names = ['yyyy', 'mm', 'tmax (degC)', 'tmin (degC)', 'af (days)', 'rain (mm)', 'sun (hours)']

    create_folder_if_not_exists(output_folder)

    for entry in os.listdir(input_folder):
        source_path = os.path.join(input_folder, entry)
        if not source_path.endswith('.csv'):
            continue

        frame = pd.read_csv(source_path)
        # Discard the row labelled 0 and renumber the remaining rows.
        frame = frame.drop(0).reset_index(drop=True)
        frame.columns = column_names
        frame = frame.replace('---', pd.NA)

        target_path = os.path.join(output_folder, entry)
        frame.to_csv(target_path, index=False)


# Pipeline configuration and entry point: download -> convert -> clean.
# Folder for the downloaded station text files.
output_folder_txt = '../data/stations_txt'
# Folder for the CSVs produced from the text files.
output_folder_csv = '../data/stations_csv'
# Folder for the cleaned CSVs.
# NOTE(review): "csc" in this name looks like a typo for "csv"; it is left
# unchanged because code outside this view may reference the name.
output_folder_csc_clean = '../data/stations_csv_clean'
# Page that is scanned for "View data" links.
station_url = 'https://www.metoffice.gov.uk/research/climate/maps-and-data/historic-station-data'

# The download and conversion stages are disabled; presumably their outputs
# already exist on disk -- TODO confirm. Uncomment to re-run them.
#download_main_dataset_1(station_url, output_folder_txt)
#convert_txts_to_csvs(output_folder_txt, output_folder_csv)
# Only the cleaning stage runs when this code is executed.
clean_and_rename_csv_in_folder(output_folder_csv, output_folder_csc_clean)