In [1]:
import os
import zipfile
import os
import pandas as pd


In [2]:
# #FUNCTION TO EXTRACT ZIP FOLDERS
# def extract_all_zips(folder_path, extract_to=None):
#     """
#     Extracts all ZIP files in the specified folder.
    
#     Args:
#         folder_path (str): Path to the folder containing ZIP files
#         extract_to (str, optional): Folder to extract to. Defaults to same as ZIP file location.
#     """
#     if extract_to is None:
#         extract_to = folder_path
    
#     # Create extract_to directory if it doesn't exist
#     os.makedirs(extract_to, exist_ok=True)
    
#     # Loop through all files in the folder
#     for filename in os.listdir(folder_path):
#         if filename.endswith('.zip'):
#             zip_path = os.path.join(folder_path, filename)
            
#             try:
#                 with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#                     print(f"Extracting {filename}...")
#                     zip_ref.extractall(extract_to)
#                     print(f"Successfully extracted {filename}")
#             except Exception as e:
#                 print(f"Failed to extract {filename}: {str(e)}")

# if __name__ == "__main__":
#     # Get folder path from user input
#     folder_path = input("Enter the path to the folder containing ZIP files: ").strip()
    
#     # Validate path exists
#     if not os.path.exists(folder_path):
#         print("Error: The specified folder does not exist.")
#     else:
#         extract_all_zips(folder_path)

In [3]:
# Loads a Met Éireann weather CSV, locating the header row automatically
def read_met_eireann_csv(file_path):
    """Load a Met Éireann CSV into a DataFrame, skipping any preamble lines.

    The file is scanned for the first line that, once spaces are removed,
    starts with ``year,month,``. Every line before it is skipped and that
    line becomes the header. When no such line exists, only the first line
    of the file is skipped.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame in which single-space and empty fields are read as
        NaN, in addition to pandas' default NA markers.
    """
    header_prefix = 'year,month,'
    with open(file_path, 'r') as handle:
        # Index of the first header-like line; falls back to 1 when absent
        header_row = next(
            (
                line_no
                for line_no, text in enumerate(handle)
                if text.replace(' ', '').startswith(header_prefix)
            ),
            1,
        )

    return pd.read_csv(
        file_path,
        skiprows=header_row,
        na_values=[' ', ''],
        keep_default_na=True,
    )

In [4]:
# Gathers all CSV file paths from the Weather_csv_counties directory and subfolders.
# The input folder is built with os.path.join so the separator matches the host OS
# instead of being hard-coded to a Windows backslash.
path = os.getcwd()
input_folder = os.path.join(path, 'Data', 'Weather_csv_counties')
# os.walk yields entries in filesystem order, so sort to keep the file list
# (and the row order of the combined data) the same on every run.
csv_files = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(input_folder)
    for file in files
    if file.endswith(".csv")
)

In [5]:
# Extracts the parent folder name from the first CSV file path.
# os.path is used instead of splitting on '\\' so this works with any path separator.
os.path.basename(os.path.dirname(csv_files[0]))

'Carlow_csv'

In [6]:
# Reads every CSV file and tags each DataFrame with its parent folder name in a
# "county" column, collecting the results in a list of DataFrames.
# The folder name is taken with os.path rather than by splitting on '\\', so it
# does not depend on the Windows path separator.
dfs = []
for csv_file in csv_files:
    df = read_met_eireann_csv(csv_file)
    df["county"] = os.path.basename(os.path.dirname(csv_file))
    dfs.append(df)

In [7]:
# Stacks the per-file DataFrames row-wise into a single table and shows (rows, columns)
final_df = pd.concat(dfs, axis=0)
final_df.shape

(169675, 17)

In [8]:
# Displays the first rows of final_df (head() shows five rows by default)
final_df.head()

Unnamed: 0,year,month,ind,rain,gdf,rd,wd,county,meant,maxtp,mintp,mnmax,mnmin,gmin,wdsp,maxgt,sun
0,1949,1,8.0,,,,,Carlow_csv,,,,,,,,,
1,1949,2,8.0,,,,,Carlow_csv,,,,,,,,,
2,1949,3,8.0,,,,,Carlow_csv,,,,,,,,,
3,1949,4,8.0,,,,,Carlow_csv,,,,,,,,,
4,1949,5,8.0,,,,,Carlow_csv,,,,,,,,,


In [9]:
# Displays the last rows of final_df (tail() shows five rows by default)
final_df.tail()

Unnamed: 0,year,month,ind,rain,gdf,rd,wd,county,meant,maxtp,mintp,mnmax,mnmin,gmin,wdsp,maxgt,sun
628,2024,9,0.0,104.1,36.1,18.0,13.0,Wicklow_csv,,,,,,,,,
629,2024,10,0.0,193.9,56.0,22.0,20.0,Wicklow_csv,,,,,,,,,
630,2024,11,0.0,203.2,53.8,25.0,17.0,Wicklow_csv,,,,,,,,,
631,2024,12,0.0,194.0,48.9,24.0,17.0,Wicklow_csv,,,,,,,,,
632,2025,1,0.0,223.8,46.3,20.0,15.0,Wicklow_csv,,,,,,,,,


In [10]:
# Keeps rows from 2020 onwards and only the columns used in the analysis,
# copying so later edits cannot touch final_df
keep_cols = ['county', 'year', 'rain', 'meant', 'wdsp']
from_2020 = final_df['year'] >= 2020
filtered_df = final_df.loc[from_2020, keep_cols].copy()
filtered_df.shape

(19395, 5)

In [11]:
# Displays the first rows of filtered_df (head() shows five rows by default)
filtered_df.head()

Unnamed: 0,county,year,rain,meant,wdsp
852,Carlow_csv,2020,68.5,,
853,Carlow_csv,2020,145.6,,
854,Carlow_csv,2020,48.9,,
855,Carlow_csv,2020,47.9,,
856,Carlow_csv,2020,13.1,,


In [12]:
# Generates summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the filtered weather data
filtered_df.describe()

Unnamed: 0,year,rain,meant,wdsp
count,19395.0,19395.0,1195.0,1070.0
mean,2021.989533,112.32792,10.167782,8.342243
std,1.450195,71.487308,3.685577,2.158961
min,2020.0,3.5,2.8,3.5
25%,2021.0,60.5,7.1,6.8
50%,2022.0,99.3,9.4,8.1
75%,2023.0,146.6,13.45,9.4
max,2025.0,681.8,17.8,17.9


In [13]:
# Mean of each weather metric for every (county, year) pair
county_year_groups = filtered_df.groupby(['county', 'year'])
aggregated_df = county_year_groups.mean()
aggregated_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rain,meant,wdsp
county,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Carlow_csv,2020,82.361667,10.258333,8.033333
Carlow_csv,2021,78.605,10.366667,6.916667
Carlow_csv,2022,82.992593,10.633333,7.275
Carlow_csv,2023,93.320833,11.141667,7.4
Carlow_csv,2024,75.085106,10.541667,7.216667


In [14]:
# Saves aggregated_df to the county_average_data CSV file, keeping the
# (county, year) index as columns in the file.
# The path is built with os.path.join so the separator matches the host OS.
output_path = os.path.join(path, 'Data', 'county_average_data.csv')
aggregated_df.to_csv(output_path, index=True)

In [15]:
# Read the county/year averages back from CSV; county and year come back as
# ordinary columns. The path is built with os.path.join so it is valid on any OS.
df = pd.read_csv(os.path.join(path, 'Data', 'county_average_data.csv'))
df.head()

Unnamed: 0,county,year,rain,meant,wdsp
0,Carlow_csv,2020,82.361667,10.258333,8.033333
1,Carlow_csv,2021,78.605,10.366667,6.916667
2,Carlow_csv,2022,82.992593,10.633333,7.275
3,Carlow_csv,2023,93.320833,11.141667,7.4
4,Carlow_csv,2024,75.085106,10.541667,7.216667


In [16]:
# Replace the five column labels, in their existing order, with presentation names
presentation_names = ['COUNTY', 'Year', 'Avg_rain(mm)', 'Avg_temp (C)', 'Avg_wind(knots)']
df = df.set_axis(presentation_names, axis=1)

In [17]:
# Process county names - remove '_csv' and convert to uppercase.
# regex=False treats '_csv' as a literal string, so the result does not depend
# on the pandas version's default regex mode for str.replace.
df['COUNTY'] = df['COUNTY'].str.replace('_csv', '', regex=False).str.upper()

In [18]:
# Displays the first rows of df to check the renamed columns and cleaned county names
df.head()

Unnamed: 0,COUNTY,Year,Avg_rain(mm),Avg_temp (C),Avg_wind(knots)
0,CARLOW,2020,82.361667,10.258333,8.033333
1,CARLOW,2021,78.605,10.366667,6.916667
2,CARLOW,2022,82.992593,10.633333,7.275
3,CARLOW,2023,93.320833,11.141667,7.4
4,CARLOW,2024,75.085106,10.541667,7.216667


In [23]:
# Save the updated per-county yearly data to a new CSV file (without the row index).
# The path is built with os.path.join so the separator matches the host OS.
output_path = os.path.join(path, 'Data', 'county_yearly_weather.csv')
df.to_csv(output_path, index=False)


In [20]:
# Calculate averages for 2020-2024 and keep one row per county.
# Missing yearly values are skipped by mean() instead of being zero-filled first,
# so a county with data for only some of the years is not dragged towards 0.
# A county with no data at all for a metric still ends up as 0 (the fillna runs
# after the mean), which is the "missing" marker the later cells test for.
df_avg = (
    df[df['Year'].between(2020, 2024)]  # Filter years 2020-2024 (inclusive)
    .groupby('COUNTY', as_index=False)   # Group by county
    .mean(numeric_only=True)             # Mean of numeric columns, ignoring NaN
    .fillna(0)                           # Metrics with no data at all become 0
    .round(2)
    .assign(Year='2020-2024')            # Replace the averaged year with a label
    [['COUNTY', 'Year', 'Avg_rain(mm)', 'Avg_temp (C)', 'Avg_wind(knots)']]  # Reorder columns
)

# Result will contain one row per county with averaged values
df_avg.head()

Unnamed: 0,COUNTY,Year,Avg_rain(mm),Avg_temp (C),Avg_wind(knots)
0,CARLOW,2020-2024,82.47,10.59,7.37
1,CAVAN,2020-2024,105.91,9.97,5.94
2,CLARE,2020-2024,115.94,11.08,8.51
3,CORK,2020-2024,121.08,0.0,0.0
4,DONEGAL,2020-2024,131.27,10.42,10.15


In [24]:
# Save the processed 2020-2024 county averages to a new CSV file, overwriting any
# existing file. The path is built with os.path.join so it is valid on any OS.
df_avg.to_csv(os.path.join(path, 'Data', 'county_weather.csv'), mode='w', index=False)


In [31]:
# Reads StationDetails.csv and prints the number of distinct values in its
# 'station name' column.
# on_bad_lines='skip' makes pandas drop malformed rows silently, so the count
# only reflects the rows that parsed cleanly.
# NOTE(review): in the rows displayed below, 'station name' holds numeric ids and
# 'name' holds the text names - confirm this is the column intended for the count.
file_path = os.path.join(path, 'Data', 'StationDetails.csv')
df_station = pd.read_csv(file_path, on_bad_lines='skip')
num_stations = df_station['station name'].nunique()
print(num_stations)

2080


In [36]:
# Displays the first rows of the station details table
df_station.head()

Unnamed: 0,county,station name,name,height(m),easting,northing,latitude,longitude,open year,close year
0,Antrim,5880,LH_RATHLIN_WEST,10,309200,451800,55.30083,-6.28028,2000,(null)
1,Carlow,4415,TULLOW (Waterworks),76,284700,173400,52.80528,-6.74306,1985,(null)
2,Carlow,2414,BORRIS G.S.,85,272400,150700,52.60278,-6.93056,1944,1991
3,Carlow,1214,CARLOW (SUGAR FACTORY),58,272200,178400,52.85139,-6.92778,1953,1960
4,Carlow,115,HACKETSTOWN RECTORY,182,297600,180500,52.86667,-6.55,1910,1944


In [37]:
# Reads county_weather.csv (one row per county, averaged over 2020-2024) and
# displays its first rows.
# on_bad_lines='skip' makes pandas drop malformed rows silently.
file_path = os.path.join(path, 'Data', 'county_weather.csv')
df_avg_weather=pd.read_csv(file_path, on_bad_lines='skip')

df_avg_weather.head()

Unnamed: 0,COUNTY,Year,Avg_rain(mm),Avg_temp (C),Avg_wind(knots)
0,CARLOW,2020-2024,82.47,10.59,7.37
1,CAVAN,2020-2024,105.91,9.97,5.94
2,CLARE,2020-2024,115.94,11.08,8.51
3,CORK,2020-2024,121.08,0.0,0.0
4,DONEGAL,2020-2024,131.27,10.42,10.15


In [43]:
#Finds the counties with missing wind data.
# Missing averages are stored as 0; NaN is treated as missing too, matching the
# filter used when ranking wind speeds.
wind_values = df_avg_weather['Avg_wind(knots)']
missing_wind = df_avg_weather[wind_values.isna() | (wind_values == 0)]['COUNTY'].unique()
print(f"{missing_wind}: {len(missing_wind)}")


['CORK' 'KILDARE' 'KILKENNY' 'LAOIS' 'LEITRIM' 'LIMERICK' 'LONGFORD'
 'LOUTH' 'MONAGHAN' 'OFFALY' 'SLIGO' 'WATERFORD' 'WICKLOW']: 13


In [47]:
#Finds the counties with missing temp data.
# Missing averages are stored as 0; NaN is treated as missing too, matching the
# filter used when ranking wind speeds.
temp_values = df_avg_weather['Avg_temp (C)']
missing_temp = df_avg_weather[temp_values.isna() | (temp_values == 0)]['COUNTY'].unique()
print(f"{missing_temp}: {len(missing_temp)}")

['CORK' 'KILDARE' 'KILKENNY' 'LAOIS' 'LEITRIM' 'LIMERICK' 'LONGFORD'
 'LOUTH' 'MONAGHAN' 'OFFALY' 'WATERFORD' 'WICKLOW']: 12


In [48]:
#Finds the counties with missing rain data.
# Missing averages are stored as 0; NaN is treated as missing too, matching the
# filter used when ranking wind speeds.
rain_values = df_avg_weather['Avg_rain(mm)']
missing_rain = df_avg_weather[rain_values.isna() | (rain_values == 0)]['COUNTY'].unique()
print(f"{missing_rain}: {len(missing_rain)}")

[]: 0


In [68]:
#Highest and lowest rainfall by counties
# Sort once from wettest to driest, then read both ends of the ranking
sorted_rain = df_avg_weather.sort_values(by='Avg_rain(mm)', ascending=False)
rain_ranking = sorted_rain[['COUNTY', 'Avg_rain(mm)']]
print("Top 3 Highest Rainfall Values:")
print(rain_ranking.head(3).to_string(index=False))
print("\nBottom 3 Lowest Rainfall Values:")
print(rain_ranking.tail(3).to_string(index=False))


Top 3 Highest Rainfall Values:
 COUNTY  Avg_rain(mm)
  KERRY        162.06
   MAYO        139.73
DONEGAL        131.27

Bottom 3 Lowest Rainfall Values:
 COUNTY  Avg_rain(mm)
  MEATH         74.99
KILDARE         70.93
 DUBLIN         69.41


In [74]:
#Highest and lowest temp by counties
# Filter out 0 and missing (NaN) temperature values first, the same way the wind
# ranking does, so counties without data can never appear in either list
df_filtered = df_avg_weather[df_avg_weather['Avg_temp (C)'].notna() &
              (df_avg_weather['Avg_temp (C)'] != 0)]

# Highest temperatures (descending order)
sorted_temp_high = df_filtered.sort_values(by='Avg_temp (C)', ascending=False)
print("Top 3 Highest Temperature Values:")
print(sorted_temp_high[['COUNTY', 'Avg_temp (C)']].head(3).to_string(index=False))

# Lowest temperatures (ascending order)
sorted_temp_low = df_filtered.sort_values(by='Avg_temp (C)', ascending=True)
print("\nBottom 3 Lowest Temperature Values:")
print(sorted_temp_low[['COUNTY', 'Avg_temp (C)']].head(3).to_string(index=False))

Top 3 Highest Temperature Values:
 COUNTY  Avg_temp (C)
  KERRY         11.36
  CLARE         11.08
WEXFORD         10.68

Bottom 3 Lowest Temperature Values:
   COUNTY  Avg_temp (C)
    SLIGO          9.85
WESTMEATH          9.92
    CAVAN          9.97


In [75]:
#Highest and lowest wind by counties
# Keep only counties whose wind average is present and non-zero
wind_values = df_avg_weather['Avg_wind(knots)']
has_wind = wind_values.notna() & wind_values.ne(0)
df_wind = df_avg_weather[has_wind]
wind_cols = ['COUNTY', 'Avg_wind(knots)']

# Windiest first
sorted_wind_high = df_wind.sort_values(by='Avg_wind(knots)', ascending=False)
print("Top 3 Highest Wind Speeds:")
print(sorted_wind_high[wind_cols].head(3).to_string(index=False))

# Calmest first
sorted_wind_low = df_wind.sort_values(by='Avg_wind(knots)', ascending=True)
print("\nBottom 3 Lowest Wind Speeds:")
print(sorted_wind_low[wind_cols].head(3).to_string(index=False))

Top 3 Highest Wind Speeds:
 COUNTY  Avg_wind(knots)
DONEGAL            10.15
 DUBLIN             9.45
  KERRY             9.41

Bottom 3 Lowest Wind Speeds:
   COUNTY  Avg_wind(knots)
    CAVAN             5.94
WESTMEATH             6.12
ROSCOMMON             6.28


In [52]:
# Reads county_yearly_weather.csv (one row per county per year) and displays
# its first rows.
# on_bad_lines='skip' makes pandas drop malformed rows silently.
file_path = os.path.join(path, 'Data', 'county_yearly_weather.csv')
df_yearly_weather=pd.read_csv(file_path, on_bad_lines='skip')
df_yearly_weather.head()

Unnamed: 0,COUNTY,Year,Avg_rain(mm),Avg_temp (C),Avg_wind(knots)
0,CARLOW,2020,82.361667,10.258333,8.033333
1,CARLOW,2021,78.605,10.366667,6.916667
2,CARLOW,2022,82.992593,10.633333,7.275
3,CARLOW,2023,93.320833,11.141667,7.4
4,CARLOW,2024,75.085106,10.541667,7.216667


In [89]:
# Counts how many rows exist for each year
df_yearly_weather['Year'].value_counts()

2020    26
2021    26
2022    26
2023    26
2024    26
2025    26
Name: Year, dtype: int64

In [93]:
# Drop the 2025 rows and preview the first ten remaining rows
# NOTE(review): presumably 2025 is excluded because it is an incomplete year - confirm
is_2025 = df_yearly_weather['Year'] == 2025
df_yearly_weather = df_yearly_weather[~is_2025]
df_yearly_weather.head(10)

Unnamed: 0,COUNTY,Year,Avg_rain(mm),Avg_temp (C),Avg_wind(knots)
0,CARLOW,2020,82.361667,10.258333,8.033333
1,CARLOW,2021,78.605,10.366667,6.916667
2,CARLOW,2022,82.992593,10.633333,7.275
3,CARLOW,2023,93.320833,11.141667,7.4
4,CARLOW,2024,75.085106,10.541667,7.216667
6,CAVAN,2020,113.22126,9.583333,6.5
7,CAVAN,2021,96.670455,9.925,5.575
8,CAVAN,2022,101.704545,9.966667,6.0
9,CAVAN,2023,126.21145,10.466667,5.8
10,CAVAN,2024,91.75,9.908333,5.816667


In [94]:
# Show the full row (county, year and all metrics) with the highest yearly average rainfall
wettest_label = df_yearly_weather['Avg_rain(mm)'].idxmax()
df_yearly_weather.loc[wettest_label]

COUNTY                  KERRY
Year                     2023
Avg_rain(mm)       181.258853
Avg_temp (C)           11.925
Avg_wind(knots)         9.525
Name: 45, dtype: object

In [103]:
# Show the full row (county, year and all metrics) with the highest yearly average temperature
warmest_label = df_yearly_weather['Avg_temp (C)'].idxmax()
df_yearly_weather.loc[warmest_label]

COUNTY                  KERRY
Year                     2023
Avg_rain(mm)       181.258853
Avg_temp (C)           11.925
Avg_wind(knots)         9.525
Name: 45, dtype: object

In [101]:
# Show the full row (county, year and all metrics) with the highest yearly average wind speed
windiest_label = df_yearly_weather['Avg_wind(knots)'].idxmax()
df_yearly_weather.loc[windiest_label]

COUNTY                DONEGAL
Year                     2020
Avg_rain(mm)       154.058482
Avg_temp (C)            10.05
Avg_wind(knots)     11.008333
Name: 24, dtype: object