In [1]:
import pandas as pd
import numpy as np

def merge_datasets_for_ml():
    # Read the weather data
    print("Reading weather data...")
    weather_df = pd.read_csv('output_with_countries.csv')
    
    # Read the leptospirosis data
    print("Reading leptospirosis data...")
    lepto_df = pd.read_csv('ECDC_surveillance_data_Leptospirosis.csv')
    
    # Clean leptospirosis data
    print("Preparing leptospirosis data...")
    lepto_df['Value'] = lepto_df['Value'].replace('-', np.nan)
    lepto_df['Value'] = pd.to_numeric(lepto_df['Value'], errors='coerce')
    lepto_df = lepto_df.rename(columns={
        'Time': 'Year',
        'RegionCode': 'Country Code',
        'Value': 'Leptospirosis_Rate'
    })
    
    # Create a mapping dictionary for each country-year combination
    print("Creating leptospirosis mapping...")
    lepto_mapping = {}
    for _, row in lepto_df.iterrows():
        key = (row['Year'], row['Country Code'])
        if pd.notna(row['Leptospirosis_Rate']):
            lepto_mapping[key] = row['Leptospirosis_Rate']
    
    # Function to get leptospirosis rate
    def get_lepto_rate(year, country_code):
        return lepto_mapping.get((year, country_code), np.nan)
    
    # Add leptospirosis rates to weather data
    print("Merging datasets...")
    weather_df['Year'] = weather_df['Year'].astype(int)
    weather_df['Leptospirosis_Rate'] = weather_df.apply(
        lambda row: get_lepto_rate(row['Year'], row['Country Code']), 
        axis=1
    )
    
    # Remove rows where we don't have leptospirosis data
    merged_df = weather_df.dropna(subset=['Leptospirosis_Rate'])
    
    # Add some potentially useful features
    merged_df['Temperature_Celsius'] = merged_df['T2M'] - 273.15  # Convert Kelvin to Celsius
    merged_df['Dew_Point_Celsius'] = merged_df['D2M'] - 273.15
    merged_df['Relative_Humidity'] = 100 * (np.exp((17.625 * merged_df['Dew_Point_Celsius']) / 
                                                  (243.04 + merged_df['Dew_Point_Celsius'])) / 
                                          np.exp((17.625 * merged_df['Temperature_Celsius']) / 
                                                (243.04 + merged_df['Temperature_Celsius'])))
    
    # Save to CSV
    print("Saving merged dataset...")
    merged_df.to_csv('ml_final_data.csv', index=False)
    
    # Print summary
    print("\nMerged Dataset Summary:")
    print(f"Total rows: {len(merged_df)}")
    print("\nFeatures available for ML:")
    for col in merged_df.columns:
        print(f"- {col}")
    
    # Print some statistics
    print("\nData points per country:")
    print(merged_df['Country Name'].value_counts())
    
    print("\nData points per year:")
    print(merged_df['Year'].value_counts().sort_index())
    
    return merged_df

if __name__ == "__main__":
    try:
        merged_data = merge_datasets_for_ml()
        print("\nData merge completed successfully!")
    except Exception as e:
        print(f"Error occurred: {e}")

Reading weather data...
Reading leptospirosis data...
Preparing leptospirosis data...
Creating leptospirosis mapping...
Merging datasets...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['Temperature_Celsius'] = merged_df['T2M'] - 273.15  # Convert Kelvin to Celsius
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['Dew_Point_Celsius'] = merged_df['D2M'] - 273.15
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['Relative_Humidity'] = 100 * (np.exp((17.625 *

Saving merged dataset...

Merged Dataset Summary:
Total rows: 400595

Features available for ML:
- Year
- Latitude
- Longitude
- T2M
- D2M
- TP
- Country Code
- Country Name
- Leptospirosis_Rate
- Temperature_Celsius
- Dew_Point_Celsius
- Relative_Humidity

Data points per country:
Country Name
Finland           62713
Denmark           41769
Sweden            38131
Cyprus            32793
Romania           27387
Greece            17935
Italy             17816
Malta             16473
Bulgaria          16167
Estonia           13702
Latvia            13447
France            12544
Lithuania         12240
United Kingdom     8814
Poland             8483
Netherlands        8347
Spain              8194
Germany            7990
Luxembourg         6239
Czechia            5627
Hungary            5593
Belgium            4998
Slovakia           4114
Austria            3859
Croatia            3333
Slovenia           1887
Name: count, dtype: int64

Data points per year:
Year
2007    22790
2008    2279