In [4]:
pip install netCDF4 pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
import netCDF4 as nc
import pandas as pd
import numpy as np
from netCDF4 import num2date

# Paths to the NetCDF files
file_path_0 = 'weather_data/data_0.nc'
file_path_1 = 'weather_data/data_1.nc'

# Load the first NetCDF file (data_0.nc)
dataset_0 = nc.Dataset(file_path_0)

# Extract variables from data_0.nc
valid_time_0 = dataset_0.variables['valid_time'][:]
latitude_0 = dataset_0.variables['latitude'][:]
longitude_0 = dataset_0.variables['longitude'][:]
t2m_0 = dataset_0.variables['t2m'][:]  # 2-meter temperature
d2m_0 = dataset_0.variables['d2m'][:]  # 2-meter dewpoint temperature

# Load the second NetCDF file (data_1.nc)
dataset_1 = nc.Dataset(file_path_1)

# Extract variables from data_1.nc
valid_time_1 = dataset_1.variables['valid_time'][:]
latitude_1 = dataset_1.variables['latitude'][:]
longitude_1 = dataset_1.variables['longitude'][:]
tp_1 = dataset_1.variables['tp'][:]  # Total precipitation

# Convert valid_time to datetime objects and extract the year
reference_date_0 = dataset_0.variables['valid_time'].units  # e.g., "seconds since 1970-01-01"
dates_0 = num2date(valid_time_0, reference_date_0)
valid_time_0_year = np.array([date.year for date in dates_0])

reference_date_1 = dataset_1.variables['valid_time'].units
dates_1 = num2date(valid_time_1, reference_date_1)
valid_time_1_year = np.array([date.year for date in dates_1])

# Find overlapping years
common_years = np.intersect1d(valid_time_0_year, valid_time_1_year)

# Print the number of overlapping years
print(f"Number of overlapping years: {len(common_years)}")
if len(common_years) == 0:
    raise ValueError("No overlapping years found between the two files!")

# Filter data_0.nc to include only common years
mask_0 = np.isin(valid_time_0_year, common_years)
t2m_0_filtered = t2m_0[mask_0]
d2m_0_filtered = d2m_0[mask_0]

# Filter data_1.nc to include only common years
mask_1 = np.isin(valid_time_1_year, common_years)
tp_1_filtered = tp_1[mask_1]

# Flatten the multidimensional arrays
time_flat = np.repeat(common_years, len(latitude_0) * len(longitude_0))
lat_flat = np.tile(np.repeat(latitude_0, len(longitude_0)), len(common_years))
lon_flat = np.tile(longitude_0, len(common_years) * len(latitude_0))

# Flatten the variables
t2m_flat = t2m_0_filtered.flatten()
d2m_flat = d2m_0_filtered.flatten()
tp_flat = tp_1_filtered.flatten()

# Ensure all arrays have the same length
min_length = min(len(time_flat), len(lat_flat), len(lon_flat), len(t2m_flat), len(d2m_flat), len(tp_flat))
time_flat = time_flat[:min_length]
lat_flat = lat_flat[:min_length]
lon_flat = lon_flat[:min_length]
t2m_flat = t2m_flat[:min_length]
d2m_flat = d2m_flat[:min_length]
tp_flat = tp_flat[:min_length]

# Verify the lengths again
print("Aligned lengths:")
print("Length of time_flat:", len(time_flat))
print("Length of lat_flat:", len(lat_flat))
print("Length of lon_flat:", len(lon_flat))
print("Length of t2m_flat:", len(t2m_flat))
print("Length of d2m_flat:", len(d2m_flat))
print("Length of tp_flat:", len(tp_flat))

# Create a pandas DataFrame
df = pd.DataFrame({
    'Year': time_flat,
    'Latitude': lat_flat,
    'Longitude': lon_flat,
    'T2M': t2m_flat,  # 2-meter temperature
    'D2M': d2m_flat,  # 2-meter dewpoint temperature
    'TP': tp_flat     # Total precipitation
})

# Export to CSV
output_csv_path = 'weather_processing.csv'
df.to_csv(output_csv_path, index=False)

print(f"Combined CSV file saved to {output_csv_path}")

Number of overlapping years: 17
Aligned lengths:
Length of time_flat: 17650080
Length of lat_flat: 17650080
Length of lon_flat: 17650080
Length of t2m_flat: 17650080
Length of d2m_flat: 17650080
Length of tp_flat: 17650080
Combined CSV file saved to weather_processing.csv


In [6]:
pip install geopy reverse_geocode


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Define input and output paths
input_csv_path = 'weather_processing.csv'  # Your input file path
output_csv_path = 'weather_processing_w_countries.csv'  # Your output file path

# Define the valid countries and their RegionCodes with centroids
valid_countries = {
    "Austria": {"code": "AT", "centroid": (47.5162, 14.5501)},
    "Belgium": {"code": "BE", "centroid": (50.5039, 4.4699)},
    "Bulgaria": {"code": "BG", "centroid": (42.7339, 25.4858)},
    "Cyprus": {"code": "CY", "centroid": (35.1264, 33.4299)},
    "Czechia": {"code": "CZ", "centroid": (49.8175, 15.4730)},
    "Germany": {"code": "DE", "centroid": (51.1657, 10.4515)},
    "Denmark": {"code": "DK", "centroid": (56.2639, 9.5018)},
    "Estonia": {"code": "EE", "centroid": (58.5953, 25.0136)},
    "Greece": {"code": "EL", "centroid": (39.0742, 21.8243)},
    "Spain": {"code": "ES", "centroid": (40.4637, -3.7492)},
    "Finland": {"code": "FI", "centroid": (61.9241, 25.7482)},
    "Hungary": {"code": "HU", "centroid": (47.1625, 19.5033)},
    "Ireland": {"code": "IE", "centroid": (53.1424, -7.6921)},
    "Italy": {"code": "IT", "centroid": (41.8719, 12.5674)},
    "Lithuania": {"code": "LT", "centroid": (55.1694, 23.8813)},
    "Luxembourg": {"code": "LU", "centroid": (49.8153, 6.1296)},
    "Latvia": {"code": "LV", "centroid": (56.8796, 24.6032)},
    "Malta": {"code": "MT", "centroid": (35.9375, 14.3754)},
    "Netherlands": {"code": "NL", "centroid": (52.1326, 5.2913)},
    "Poland": {"code": "PL", "centroid": (51.9194, 19.1451)},
    "Portugal": {"code": "PT", "centroid": (39.3999, -8.2245)},
    "Romania": {"code": "RO", "centroid": (45.9432, 24.9668)},
    "Sweden": {"code": "SE", "centroid": (60.1282, 18.6435)},
    "Slovenia": {"code": "SI", "centroid": (46.1512, 14.9955)},
    "Slovakia": {"code": "SK", "centroid": (48.6690, 19.6990)},
    "United Kingdom": {"code": "UK", "centroid": (55.3781, -3.4360)},
    "France": {"code": "FR", "centroid": (46.2276, 2.2137)},
    "Croatia": {"code": "HR", "centroid": (45.1000, 15.2000)},
    "Iceland": {"code": "IS", "centroid": (64.9631, -19.0208)},
}


# Precompute centroids as NumPy arrays
centroids = np.radians(np.array([info["centroid"] for info in valid_countries.values()]))
country_codes = np.array([info["code"] for info in valid_countries.values()])
country_names = np.array(list(valid_countries.keys()))

# Function to calculate Haversine distance
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in kilometers
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)
    
    a = np.sin(dphi / 2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    
    return R * c

# Vectorized function to find the nearest country
def vectorized_find_nearest_country(latitudes, longitudes):
    points = np.radians(np.column_stack((latitudes, longitudes)))
    distances = haversine_distance(
        points[:, 0][:, None], points[:, 1][:, None],
        centroids[:, 0], centroids[:, 1]
    )
    nearest_indices = np.argmin(distances, axis=1)
    return country_codes[nearest_indices], country_names[nearest_indices]

# Define chunk size
chunk_size = 5000

# Initialize counters for debugging
total_rows_processed = 0
total_rows_filtered = 0
errors_encountered = 0

# Define European boundaries
europe_latitude_range = (34.0, 71.0)
europe_longitude_range = (-25.0, 40.0)

# Count total rows for progress bar
print("Counting total rows...")
total_rows = sum(1 for _ in open(input_csv_path)) - 1
print(f"Total rows to process: {total_rows}")

# Initialize progress bar
progress_bar = tqdm(total=total_rows, desc="Processing Rows", unit="row")

# Open the output file in write mode
output_file = open(output_csv_path, 'w')
header_written = False

try:
    for chunk_number, chunk in enumerate(pd.read_csv(input_csv_path, chunksize=chunk_size)):
        try:
            # Print debugging info for the chunk
            print(f"\nProcessing chunk {chunk_number + 1}")
            print(f"Chunk size before filtering: {len(chunk)}")
            
            # Check for NaN values before filtering
            nan_counts = chunk[['Latitude', 'Longitude']].isna().sum()
            print(f"NaN values in chunk: {nan_counts}")
            
            # Filter for Europe with additional error checking
            valid_coords = (
                chunk['Latitude'].notna() &
                chunk['Longitude'].notna() &
                (chunk['Latitude'] >= europe_latitude_range[0]) &
                (chunk['Latitude'] <= europe_latitude_range[1]) &
                (chunk['Longitude'] >= europe_longitude_range[0]) &
                (chunk['Longitude'] <= europe_longitude_range[1])
            )
            
            chunk = chunk[valid_coords]
            
            print(f"Chunk size after filtering: {len(chunk)}")
            total_rows_filtered += len(chunk)
            
            if chunk.empty:
                progress_bar.update(chunk_size)
                total_rows_processed += chunk_size
                continue
            
            # Apply the vectorized function to find the nearest country
            chunk['Country Code'], chunk['Country Name'] = vectorized_find_nearest_country(
                chunk['Latitude'].values, chunk['Longitude'].values
            )
            
            # Write the chunk to the output file
            if not header_written:
                chunk.to_csv(output_file, index=False)
                header_written = True
            else:
                chunk.to_csv(output_file, index=False, header=False)
            
            # Update counters
            total_rows_processed += len(chunk)
            progress_bar.update(len(chunk))
            
        except Exception as chunk_error:
            errors_encountered += 1
            print(f"\nError in chunk {chunk_number + 1}: {chunk_error}")
            print("Last 5 rows being processed:")
            print(chunk.tail())
            
            # Continue with next chunk instead of stopping
            continue

except Exception as e:
    print(f"\nCritical error encountered: {e}")
    raise

finally:
    # Print summary statistics
    print("\nProcessing Summary:")
    print(f"Total rows processed: {total_rows_processed}")
    print(f"Total rows filtered (in Europe): {total_rows_filtered}")
    print(f"Total errors encountered: {errors_encountered}")
    
    # Close the output file and progress bar
    output_file.close()
    progress_bar.close()

print("\nProcessing complete. Updated CSV file saved to output_with_countries.csv")

In [9]:
import pandas as pd

# Read the output file
df = pd.read_csv("weather_processing_w_countries.csv")

# Get country distribution
country_counts = df['Country Name'].value_counts()

# Calculate percentages
country_percentages = (country_counts / len(df) * 100).round(2)

# Combine counts and percentages
country_stats = pd.DataFrame({
    'Count': country_counts,
    'Percentage': country_percentages
})

# Sort by count in descending order
print("\nCountry Distribution:")
print(country_stats)

print(f"\nTotal rows in output: {len(df)}")


Country Distribution:
                Count  Percentage
Country Name                     
Finland         62713       15.38
Denmark         41769       10.24
Sweden          38131        9.35
Cyprus          32793        8.04
Romania         27387        6.72
Greece          17935        4.40
Italy           17816        4.37
Malta           16473        4.04
Bulgaria        16167        3.96
France          15232        3.74
Estonia         13702        3.36
Latvia          13447        3.30
Lithuania       12240        3.00
United Kingdom  11526        2.83
Poland           8483        2.08
Netherlands      8347        2.05
Spain            8194        2.01
Germany          7990        1.96
Luxembourg       6239        1.53
Czechia          5627        1.38
Hungary          5593        1.37
Croatia          5151        1.26
Belgium          4998        1.23
Slovakia         4114        1.01
Austria          3859        0.95
Slovenia         1887        0.46

Total rows in output: 40