In [None]:
# !pip install requests
# ftplib is part of the Python standard library, so it does not need to be installed with pip

In [None]:
### import library
import datetime
import time
from ftplib import FTP
import pandas as pd
import io
import os

In [None]:
# Credentials used for the FTP login (anonymous access; the password is an email address)
username = 'anonymous'
password = 'your email address'

# Set start and end time, to filter last 5 years of files
today = datetime.datetime.now()
starttime = today.replace(day=1) - datetime.timedelta(days=1825)  # last 5 years
endtime = today.replace(day=1) - datetime.timedelta(days=1)  # last day of the previous month

# Generate the list of YYYYMM strings, one per calendar month from the month of
# `starttime` through the month of `endtime` (both inclusive).
date_list = []
# Anchor the cursor on day 1 so that every step lands exactly one month later.
# Stepping by a fixed 31 days from an arbitrary day drifts forward and
# eventually jumps over a whole month (e.g. Jan 30 + 31 days = Mar 2).
current_datetime = starttime.replace(day=1)
while current_datetime <= endtime:
    date_list.append(current_datetime.strftime('%Y%m'))
    # Day 1 + 31 days always falls inside the next month; snap back to day 1.
    current_datetime = (current_datetime + datetime.timedelta(days=31)).replace(day=1)

print("List of YYYY-MM dates between start and end time:")
print(date_list)

# Function to get observation locations from FTP server
def get_observation_location(ftp_server, ftp_directory):
    """List the entries found in ``ftp_directory`` on ``ftp_server``.

    Opens an FTP session, logs in with the module-level ``username`` and
    ``password``, changes into ``ftp_directory`` and returns the result of
    ``FTP.nlst()`` for that directory. The session is closed on exit.
    """
    connection = FTP(ftp_server)
    with connection:
        connection.login(username, password)
        connection.cwd(ftp_directory)
        return connection.nlst()

# Function to download files from FTP server
def download_files_from_ftp(ftp_server, ftp_directory, local_directory, observe_location, date_list):
    """Download the monthly CSV files of one observation location.

    Logs in to ``ftp_server`` with the module-level ``username`` and
    ``password``, changes into ``ftp_directory + observe_location + '/'`` and
    tries to fetch ``"<observe_location>-<date>.csv"`` for every entry of
    ``date_list`` into ``local_directory``.

    Failures are reported with ``print`` and never raised: if the remote
    directory cannot be entered the function returns immediately, and a file
    that cannot be downloaded is skipped so the remaining files are still tried.

    A local file is only written after its download has completed, so a failed
    transfer neither leaves an empty file behind nor truncates a file that was
    downloaded successfully on an earlier run.

    Returns:
        None
    """
    with FTP(ftp_server) as ftp:
        ftp.login(username, password)
        ftp_directory_location = ftp_directory + observe_location + '/'
        print(f"Attempting to change directory to: {ftp_directory_location}")

        try:
            ftp.cwd(ftp_directory_location)
            print(f"Successfully changed directory to: {ftp_directory_location}")
        except Exception as e:
            print(f"Error: Failed to change directory to {ftp_directory_location}. Exception: {e}")
            return  # Exit if the directory cannot be changed

        for date in date_list:
            filename = f"{observe_location}-{date}.csv"
            remote_filepath = filename
            local_filepath = os.path.join(local_directory, filename)

            # Collect the payload in memory first; the local file is opened
            # only once the transfer has succeeded.
            payload = io.BytesIO()
            try:
                ftp.retrbinary(f"RETR {remote_filepath}", payload.write)
                with open(local_filepath, "wb") as local_file:
                    local_file.write(payload.getvalue())
                print(f"File '{remote_filepath}' downloaded to '{local_filepath}'")
            except Exception as e:
                print(f"Error downloading file '{remote_filepath}': {e}")

# Remote source: FTP host and the directory whose entries are the observation locations
ftp_server = "ftp.bom.gov.au"
ftp_directory = "/anon/gen/clim_data/IDCKWCDEA0/tables/sa/"

# Local destination for the downloaded files; created if it is missing
local_directory = "C://Users//rines//Capstone//Datasets"
os.makedirs(local_directory, exist_ok=True)

# Every entry listed under the remote directory is treated as one location
observe_locations = get_observation_location(ftp_server, ftp_directory)

# Fetch the files of each location in turn, pausing 20 seconds after every location
for observe_location in observe_locations:
    download_files_from_ftp(
        ftp_server=ftp_server,
        ftp_directory=ftp_directory,
        local_directory=local_directory,
        observe_location=observe_location,
        date_list=date_list,
    )
    time.sleep(20)


In [None]:
# Merge the downloaded monthly CSV files into one CSV file per location.

# Define the local directory containing the CSV files
local_directory = "C://Users//rines//Capstone//Datasets"
output_directory = "C://Users//rines//Capstone//silver"

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Sort the listing: os.listdir() returns entries in arbitrary order, while the
# "<location>-<YYYYMM>.csv" names sort chronologically within each location.
filelist = sorted(os.listdir(local_directory))

# Column names applied to every monthly file (the files are read without a header row)
columnname = ['station_name', 'date', 'ev_transpiration', 'rain', 'pan_ev', 'max_temp', 'min_temp', 'max_humid', 'min_humid', 'wind', 'solar']

# Monthly frames collected per location; concatenated once after the loop
# instead of re-copying the accumulated frame for every file.
location_frames = {}

# Iterate over each file in the directory
for filename in filelist:
    # Ignore files that don't match the expected pattern (e.g., avoid non-CSV files)
    if not filename.endswith('.csv'):
        print(f"Skipping non-CSV file: {filename}")
        continue

    # Strip the trailing "-YYYYMM.csv" (11 characters) to get the location name
    locationname = filename[:-11]

    # Read the CSV file, skipping its first 13 lines
    try:
        temp = pd.read_csv(os.path.join(local_directory, filename), skiprows=13, encoding='utf-8', header=None)
    except pd.errors.EmptyDataError:
        # An empty or truncated file (e.g. left by a failed download) must not abort the whole run
        print(f"Skipping empty file: {filename}")
        continue

    temp.columns = columnname

    # Filter out rows that contain 'totals'
    temp = temp[~temp['station_name'].str.contains('Totals:', case=False, na=False)]

    location_frames.setdefault(locationname, []).append(temp)

# One merged DataFrame per location
location_dfs = {
    name: pd.concat(frames, ignore_index=True)
    for name, frames in location_frames.items()
}

# Save the merged DataFrame for each location to the output directory
for locationname, df in location_dfs.items():
    output_filepath = os.path.join(output_directory, f"{locationname}.csv")
    df.to_csv(output_filepath, sep=',', encoding='utf-8', index=False)
    print(f"Processed and saved file: {output_filepath}")


In [None]:
## Combine every per-location CSV file into a single CSV file
# Read the files through their full path rather than calling os.chdir(), which
# would change the working directory for every later cell as well.
silver_directory = "C://Users//rines//Capstone//silver"

# Only CSV files are read; sorting makes the row order reproducible between runs
filelist = sorted(name for name in os.listdir(silver_directory) if name.endswith('.csv'))

# Collect the frames and concatenate once instead of growing `df` inside the loop
frames = []
for i in filelist:
    print(i)
    # The first column is used as the index, so it is written back as the first column below
    frames.append(pd.read_csv(os.path.join(silver_directory, i), encoding='utf-8', index_col=0))

# pd.concat() raises on an empty list, so fall back to an empty frame
df = pd.concat(frames, ignore_index=False) if frames else pd.DataFrame()

filepath = 'C://Users//rines//Capstone//All_Sa_weather_5_years.csv'

df.to_csv(filepath, sep=',', encoding='utf-8')