# Supervised Learning

In [None]:
from IPython import get_ipython
from IPython.display import display
import rasterio
import geopandas as gpd
from rasterio.mask import mask
import numpy as np
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.base import BaseEstimator, TransformerMixin
from lazypredict.Supervised import LazyRegressor
from sklearn.utils import shuffle
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
import xgboost as xgb
import shap

In [None]:
%pip install rasterio
%pip install lazypredict
%pip install catboost

Collecting rasterio
  Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio)
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: cligj, click-plugins, affine, rasterio
Successfully installed affine-2.4.0 click-plugins-1.1.1 cligj-0.7.2 rasterio-1.4.3
Collecting lazypredict
  Down

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the shapefile
shapefile = "/content/drive/MyDrive/FINAL PROJECT DATA SCI/World Shape file/ne_10m_admin_0_countries.shp"
gdf = gpd.read_file(shapefile)

# Display all the unique country names in the 'ADMIN' column
print(gdf['ADMIN'].unique())

['Indonesia' 'Malaysia' 'Chile' 'Bolivia' 'Peru' 'Argentina'
 'Dhekelia Sovereign Base Area' 'Cyprus' 'India' 'China' 'Israel'
 'Palestine' 'Lebanon' 'Ethiopia' 'South Sudan' 'Somalia' 'Kenya' 'Malawi'
 'United Republic of Tanzania' 'Syria' 'Somaliland' 'France' 'Suriname'
 'Guyana' 'South Korea' 'North Korea' 'Morocco' 'Western Sahara'
 'Costa Rica' 'Nicaragua' 'Republic of the Congo'
 'Democratic Republic of the Congo' 'Bhutan' 'Ukraine' 'Belarus' 'Namibia'
 'South Africa' 'Saint Martin' 'Sint Maarten' 'Oman' 'Uzbekistan'
 'Kazakhstan' 'Tajikistan' 'Lithuania' 'Brazil' 'Uruguay' 'Mongolia'
 'Russia' 'Czechia' 'Germany' 'Estonia' 'Latvia' 'Norway' 'Sweden'
 'Finland' 'Vietnam' 'Cambodia' 'Luxembourg' 'United Arab Emirates'
 'Belgium' 'Georgia' 'North Macedonia' 'Albania' 'Azerbaijan' 'Kosovo'
 'Turkey' 'Spain' 'Laos' 'Kyrgyzstan' 'Armenia' 'Denmark' 'Libya'
 'Tunisia' 'Romania' 'Hungary' 'Slovakia' 'Poland' 'Ireland'
 'United Kingdom' 'Greece' 'Zambia' 'Sierra Leone' 'Guinea' 'Liberia

In [None]:
# List of countries to keep (you can replace these with the ones you need)
countries_to_keep = [
    'Afghanistan', 'Algeria', 'Angola', 'Argentina', 'Armenia', 'Azerbaijan', 'Bangladesh',
    'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Botswana', 'Brazil', 'Burkina Faso', 'Burundi',
    'Cabo Verde', 'Cambodia', 'Cameroon', 'Central African Republic', 'Chad', 'China',
    'Colombia', 'Comoros', 'Republic of the Congo', 'Costa Rica', 'Ivory Coast',
    'North Korea', 'Democratic Republic of the Congo', 'Djibouti', 'Dominican Republic',
    'Ecuador', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'eSwatini', 'Ethiopia',
    'France', 'Gabon', 'Gambia', 'Georgia', 'Ghana', 'Guatemala', 'Guinea',
    'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'India', 'Indonesia', 'Iran', 'Iraq',
    'Kazakhstan', 'Kenya', 'Kyrgyzstan', 'Laos', 'Liberia', 'Madagascar', 'Malawi',
    'Malaysia', 'Mali', 'Mauritania', 'Mexico', 'Morocco', 'Mozambique',
    'Myanmar', 'Namibia', 'Nepal', 'Nicaragua', 'Niger', 'Nigeria', 'Oman', 'Pakistan',
    'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'South Korea',
    'Rwanda', 'São Tomé and Principe', 'Saudi Arabia', 'Senegal', 'Sierra Leone',
    'Solomon Islands', 'Somalia', 'South Africa', 'South Sudan', 'Sri Lanka', 'Sudan',
    'Suriname', 'Syria', 'Tajikistan', 'Thailand', 'East Timor', 'Togo', 'Turkey',
    'Turkmenistan', 'Uganda', 'United Arab Emirates', 'United Republic of Tanzania',
    'Uzbekistan', 'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'
]

# Filter the GeoDataFrame to only include these countries
filtered_gdf = gdf[gdf['ADMIN'].isin(countries_to_keep)]

# Display the filtered GeoDataFrame
print(filtered_gdf)


          featurecla  scalerank  LABELRANK             SOVEREIGNT SOV_A3  \
0    Admin-0 country          0          2              Indonesia    IDN   
1    Admin-0 country          0          3               Malaysia    MYS   
3    Admin-0 country          0          3                Bolivia    BOL   
4    Admin-0 country          0          2                   Peru    PER   
5    Admin-0 country          0          2              Argentina    ARG   
..               ...        ...        ...                    ...    ...   
219  Admin-0 country          3          6                Comoros    COM   
220  Admin-0 country          3          6  São Tomé and Principe    STP   
221  Admin-0 country          3          4             Cabo Verde    CPV   
236  Admin-0 country          1          3        Solomon Islands    SLB   
243  Admin-0 country          1          4                Vanuatu    VUT   

     ADM0_DIF  LEVEL               TYPE TLC                  ADMIN  ...  \
0           

In [None]:
# Define the path where you want to save the filtered shapefile
output_shp_path = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/World Shape file/filtered_countries.shp'  # Update the path as needed

# Save the filtered GeoDataFrame to a new Shapefile
filtered_gdf.to_file(output_shp_path)

print(f"Filtered shapefile saved to {output_shp_path}")

Filtered shapefile saved to /content/drive/MyDrive/FINAL PROJECT DATA SCI/World Shape file/filtered_countries.shp


In [None]:
# Path to the filtered shapefile
shapefile_path = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/World Shape file/filtered_countries.shp'

# Load the shapefile using geopandas
shapefile = gpd.read_file(shapefile_path)

# Ensure the shapefile's CRS matches the TIF image CRS (you'll verify this in the next step)
print(shapefile.crs)  # Output the CRS of the shapefile

EPSG:4326


In [None]:
# Drop the 'POP_EST' field from the GeoDataFrame
filtered_gdf = filtered_gdf.drop(columns=['POP_EST'])

# Now save the shapefile without the population field
filtered_gdf.to_file(output_shp_path)

KeyError: "['POP_EST'] not found in axis"

In [None]:
shapefile_path = "/content/drive/MyDrive/FINAL PROJECT DATA SCI/World Shape file/filtered_countries.shp"
min_temp_folder = "/content/drive/MyDrive/FINAL PROJECT DATA SCI/RawTIF file/Min Temp"
max_temp_folder = "/content/drive/MyDrive/FINAL PROJECT DATA SCI/RawTIF file/Max Temp"
precip_folder = "/content/drive/MyDrive/FINAL PROJECT DATA SCI/RawTIF file/Precipitation"
output_base_folder = "/content/drive/MyDrive/FINAL PROJECT DATA SCI/ClippedTIF/"

# Load shapefile
shapefile = gpd.read_file(shapefile_path)

# Ensure CRS is consistent
shapefile = shapefile.to_crs("EPSG:4326")  # Assuming the TIF files are in EPSG:4326

# Function to clip and save TIF files
def clip_tif_files(input_folder, output_folder, shapefile):
    os.makedirs(output_folder, exist_ok=True)
    tif_files = glob.glob(os.path.join(input_folder, "*.tif"))

    for tif_file in tif_files:
        with rasterio.open(tif_file) as src:
            # Clip the TIF using the shapefile's geometry
            clipped, clipped_transform = mask(src, shapefile.geometry, crop=True)
            meta = src.meta.copy()
            meta.update({
                "driver": "GTiff",
                "height": clipped.shape[1],
                "width": clipped.shape[2],
                "transform": clipped_transform
            })

        # Save clipped TIF
        output_file = os.path.join(output_folder, os.path.basename(tif_file))
        with rasterio.open(output_file, "w", **meta) as dest:
            dest.write(clipped)
        print(f"Saved clipped file: {output_file}")

# Define separate output folders for MinTemp, MaxTemp, and Precipitation
min_temp_output_folder = os.path.join(output_base_folder, "MinTemp")
max_temp_output_folder = os.path.join(output_base_folder, "MaxTemp")
precip_output_folder = os.path.join(output_base_folder, "Precip")

# Clip files from all folders
print("Clipping Min Temp files...")
clip_tif_files(min_temp_folder, min_temp_output_folder, shapefile)

print("Clipping Max Temp files...")
clip_tif_files(max_temp_folder, max_temp_output_folder, shapefile)

print("Clipping Precipitation files...")
clip_tif_files(precip_folder, precip_output_folder, shapefile)

print("All files clipped successfully!")

Clipping Min Temp files...
Saved clipped file: /content/drive/MyDrive/FINAL PROJECT DATA SCI/ClippedTIF/MinTemp/wc2.1_10m_tmin_2002-01.tif
Saved clipped file: /content/drive/MyDrive/FINAL PROJECT DATA SCI/ClippedTIF/MinTemp/wc2.1_10m_tmin_2001-01.tif
Saved clipped file: /content/drive/MyDrive/FINAL PROJECT DATA SCI/ClippedTIF/MinTemp/wc2.1_10m_tmin_2001-02.tif
Saved clipped file: /content/drive/MyDrive/FINAL PROJECT DATA SCI/ClippedTIF/MinTemp/wc2.1_10m_tmin_2006-01.tif
Saved clipped file: /content/drive/MyDrive/FINAL PROJECT DATA SCI/ClippedTIF/MinTemp/wc2.1_10m_tmin_2000-01.tif
Saved clipped file: /content/drive/MyDrive/FINAL PROJECT DATA SCI/ClippedTIF/MinTemp/wc2.1_10m_tmin_2002-03.tif
Saved clipped file: /content/drive/MyDrive/FINAL PROJECT DATA SCI/ClippedTIF/MinTemp/wc2.1_10m_tmin_2005-01.tif
Saved clipped file: /content/drive/MyDrive/FINAL PROJECT DATA SCI/ClippedTIF/MinTemp/wc2.1_10m_tmin_2004-01.tif
Saved clipped file: /content/drive/MyDrive/FINAL PROJECT DATA SCI/ClippedTIF/

In [None]:
# Paths to the clipped TIF folders and shapefile
clipped_min_temp_folder = "/content/drive/MyDrive/FINAL PROJECT DATA SCI/ClippedTIF/MinTemp"
clipped_max_temp_folder = "/content/drive/MyDrive/FINAL PROJECT DATA SCI/ClippedTIF/MaxTemp"
clipped_precip_folder = "/content/drive/MyDrive/FINAL PROJECT DATA SCI/ClippedTIF/Precip"
shapefile_path = "/content/drive/MyDrive/FINAL PROJECT DATA SCI/World Shape file/filtered_countries.shp"

# Load the malaria-affected countries shapefile
countries_gdf = gpd.read_file(shapefile_path)

# Function to extract pixel values for each country
def extract_per_country(folder, variable_name, countries_gdf):
    tif_files = glob.glob(os.path.join(folder, "*.tif"))
    data_list = []

    for tif_file in tif_files:
        with rasterio.open(tif_file) as src:
            for _, country in countries_gdf.iterrows():
                country_name = country["ADMIN"]  # Replace "ADMIN" with the column that stores country names
                geometry = [country.geometry]

                # Mask the TIF file using the country's geometry
                try:
                    out_image, out_transform = mask(src, geometry, crop=True)
                    array = out_image[0]  # First band

                    # Exclude no-data values
                    array = array[array != src.nodata]

                    if array.size > 0:  # If there are valid pixel values
                        mean_value = np.nanmean(array)
                        median_value = np.nanmedian(array)
                        sum_value = np.nansum(array)

                        # Add to data list
                        data_list.append({
                            "File": os.path.basename(tif_file),
                            "Country": country_name,
                            "Variable": variable_name,
                            "Mean": mean_value,
                            "Median": median_value,
                            "Sum": sum_value
                        })
                except ValueError:
                    # Skip countries where geometry doesn't intersect with the TIF extent
                    print(f"No data for {country_name} in {os.path.basename(tif_file)}")

    # Convert to DataFrame
    return pd.DataFrame(data_list)

# Extract values for Min Temp, Max Temp, and Precipitation
print("Processing Min Temp TIF files...")
min_temp_data = extract_per_country(clipped_min_temp_folder, "MinTemp", countries_gdf)

print("Processing Max Temp TIF files...")
max_temp_data = extract_per_country(clipped_max_temp_folder, "MaxTemp", countries_gdf)

print("Processing Precipitation TIF files...")
precip_data = extract_per_country(clipped_precip_folder, "Precipitation", countries_gdf)

# Combine all data into a single DataFrame
all_country_data = pd.concat([min_temp_data, max_temp_data, precip_data], ignore_index=True)

# Save to a CSV file
output_csv = "/content/drive/MyDrive/FINAL PROJECT DATA SCI/ClippedTIF/tif_summary_per_country.csv"
all_country_data.to_csv(output_csv, index=False)

print(f"Summary values per country saved to: {output_csv}")

In [None]:
# Load the climate data
climate_data = pd.read_csv('/content/drive/MyDrive/FINAL PROJECT DATA SCI/ClippedTIF/tif_summary_per_country.csv')

# Check the data structure
print(climate_data.head())

In [None]:
# Extract year from the file name
climate_data['Year'] = climate_data['File'].str.extract(r'(\d{4})').astype(int)

print(climate_data[['File', 'Year']].head())  # Verify the Year column

In [None]:
# Aggregate data by Country and Year
climate_yearly = climate_data.groupby(['Country', 'Year', 'Variable']).agg({
    'Mean': 'mean',   # Average monthly mean value for the year
    'Median': 'mean', # Average monthly median value for the year
    'Sum': 'sum'      # Total sum across months for the year
}).reset_index()

print(climate_yearly.head())

In [None]:
# Pivot data for each variable
climate_pivot = climate_yearly.pivot_table(
    index=['Country', 'Year'],
    columns='Variable',
    values=['Mean', 'Median', 'Sum']
).reset_index()

# Flatten multi-level columns
climate_pivot.columns = ['_'.join(col).strip() if col[1] else col[0] for col in climate_pivot.columns]

print(climate_pivot.head())  # Check the pivoted structure

output_csv = "/content/drive/MyDrive/FINAL PROJECT DATA SCI/Min&Max Temp and Precipitation Dataset/Mean_Median_Sum_Summmary.csv"

# Save the climate data to the CSV file
climate_pivot.to_csv(output_csv, index=False)

In [None]:
# Melt the pivoted climate data to long format
climate_long = climate_pivot.melt(
    id_vars=['Country', 'Year'],  # Columns containing the country and year
    var_name='Indicator',  # Name for the climate variable
    value_name='FactValueNumeric'  # Name for the corresponding values (e.g., mean, sum, median)
)

# Map indicators to meaningful names
indicator_mapping = {
    'Mean_MinTemp': 'CLIMATE_MIN_TEMP',
    'Mean_MaxTemp': 'CLIMATE_MAX_TEMP',
    'Mean_Precipitation': 'CLIMATE_PRECIPITATION'
}
climate_long['IndicatorCode'] = climate_long['Indicator'].map(indicator_mapping)

# Add the full indicator name
climate_long['Indicator'] = climate_long['IndicatorCode'].map({
    'CLIMATE_MIN_TEMP': 'Mean Minimum Temperature',
    'CLIMATE_MAX_TEMP': 'Mean Maximum Temperature',
    'CLIMATE_PRECIPITATION': 'Mean Precipitation'
})

# Add required columns for consistency
climate_long['ValueType'] = 'Numeric'
climate_long['Period'] = climate_long['Year']  # Use 'Year' column here
climate_long['Location'] = climate_long['Country']  # Use 'Country' column here
climate_long['FactValueTranslationID'] = None  # Placeholder for additional metadata
climate_long['Language'] = 'EN'  # Assuming English for values

# Reorder columns to match malaria dataset
climate_long = climate_long[[
    'IndicatorCode', 'Indicator', 'ValueType', 'Location', 'Period',
    'FactValueNumeric', 'FactValueTranslationID', 'Language'
]]

# Print the first few rows to check
print(climate_long.head())

# Define the output file path
output_csv = "/content/drive/MyDrive/FINAL PROJECT DATA SCI/Min&Max Temp and Precipitation Dataset/long_tif_summary_per_country.csv"

# Save the climate data to the CSV file
climate_long.to_csv(output_csv, index=False)

In [None]:
# Define the list of countries to retain
target_countries = [
    'Afghanistan', 'Algeria', 'Angola', 'Argentina', 'Armenia', 'Azerbaijan', 'Bangladesh',
    'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Botswana', 'Brazil', 'Burkina Faso', 'Burundi',
    'Cabo Verde', 'Cambodia', 'Cameroon', 'Central African Republic', 'Chad', 'China',
    'Colombia', 'Comoros', 'Republic of the Congo', 'Costa Rica', 'Ivory Coast',
    'North Korea', 'Democratic Republic of the Congo', 'Djibouti', 'Dominican Republic',
    'Ecuador', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia',
    'France', 'Gabon', 'Gambia', 'Georgia', 'Ghana', 'Guatemala', 'Guinea',
    'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'India', 'Indonesia', 'Iran', 'Iraq',
    'Kazakhstan', 'Kenya', 'Kyrgyzstan', 'Laos', 'Liberia', 'Madagascar', 'Malawi',
    'Malaysia', 'Mali', 'Mauritania', 'Mexico', 'Morocco', 'Mozambique',
    'Myanmar', 'Namibia', 'Nepal', 'Nicaragua', 'Niger', 'Nigeria', 'Oman', 'Pakistan',
    'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'South Korea',
    'Rwanda', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Sierra Leone',
    'Solomon Islands', 'Somalia', 'South Africa', 'South Sudan', 'Sri Lanka', 'Sudan',
    'Suriname', 'Syria', 'Tajikistan', 'Thailand', 'East Timor', 'Togo', 'Turkey',
    'Turkmenistan', 'Uganda', 'United Arab Emirates', 'United Republic of Tanzania',
    'Uzbekistan', 'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'
]

# Load the dataset
input_file = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/World Population Dataset/World Population.csv'  # Replace with your file path
output_file = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/World Population Dataset/filtered_population.csv'  # Desired output file path

# Load the dataset
population_data = pd.read_csv(input_file)

# Filter for the target countries
filtered_countries = population_data[population_data['Country Name'].isin(target_countries)]

# Select columns for years 2000–2021 and retain necessary metadata
year_columns = [str(year) for year in range(2000, 2022)]
columns_to_keep = ['Country Name', 'Country Code'] + year_columns
filtered_data = filtered_countries[columns_to_keep]

# Save the filtered data to a new CSV file
filtered_data.to_csv(output_file, index=False)

print(f"Filtered data saved to {output_file}")

In [None]:
# Define the path to your file
input_file = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/World Population Dataset/filtered_population.csv'  # Adjust the file path as necessary

# Read the CSV, skipping metadata rows
population_data = pd.read_csv(input_file)

# Read the CSV (ensure the headers are set correctly)
population_data = pd.read_csv(input_file)

# Melt the data to make 'Period' (year) as a column
population_data_melted = population_data.melt(id_vars=["Country Name", "Country Code"],
                                              var_name="Period", value_name="Population")

# Convert 'Period' to numeric values (if needed)
population_data_melted["Period"] = pd.to_numeric(population_data_melted["Period"], errors='coerce')

# Ensure the 'Period' is sorted, if needed
population_data_melted = population_data_melted.sort_values(by=["Country Name", "Period"])

# View the transformed data
print(population_data_melted.head())

output_file = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/World Population Dataset/Main_filtered_population.csv'
population_data_melted.to_csv(output_file, index=False)

In [None]:
# Load the dataset
input_file = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/Main Malaria Cases.csv'
population_data = pd.read_csv(input_file)

# Clean the population and malaria cases columns (removing commas and converting to float)
population_data['Population'] = population_data['Population'].replace({',': ''}, regex=True).astype(float)
population_data['Estimated number of malaria cases'] = population_data['Estimated number of malaria cases'].replace({',': ''}, regex=True).astype(float)

# Standardize column names
population_data.rename(columns={
    "Mean Precipitation": "Precipitation"
}, inplace=True)

# Create new features
population_data['Malaria Case Rate'] = population_data['Estimated number of malaria cases'] / population_data['Population'] * 100
population_data['Temperature Range'] = population_data['Max temp'] - population_data['Min temp']

# Convert Period to integer
population_data['Period'] = population_data['Period'].astype(int)

# View the final cleaned datq
population_data

output_csv = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/V2 Main Malaria Cases.csv'

population_data.to_csv(output_csv, index=False)

In [None]:
# Define the headers explicitly
column_names = [
    "Location", "Period", "Population", "Estimated number of malaria cases",
    "Mean Max temp", "Mean Min temp", "Mean Precipitation","Median Max temp","Median Min temp","Median Precipitation","Sum Max temp","Sum Min temp","Sum Precipitation", "Malaria Case Rate", "Temperature Range"
]

# Read the CSV while assigning the headers
data = pd.read_csv('/content/drive/MyDrive/FINAL PROJECT DATA SCI/V2 Main Malaria Cases.csv', names=column_names, header=0)

# Filter for Afghanistan
afghanistan_data = data[data["Location"] == "Afghanistan"]

In [None]:
train_data = afghanistan_data[afghanistan_data["Period"].between(2000, 2020)]

test_data = afghanistan_data[afghanistan_data["Period"] == (2021)]

In [None]:
# Define features and target
features = ["Population", "Estimated number of malaria cases", "Mean Max temp", "Mean Min temp", "Mean Precipitation", "Median Max temp",
            "Median Min temp","Median Precipitation","Sum Max temp","Sum Min temp","Sum Precipitation", "Malaria Case Rate", "Temperature Range"]

target = "Estimated number of malaria cases"

# Training set
X_train = train_data[features]
y_train = train_data[target]

# Testing set
X_test = test_data[features]
y_test = test_data[target]  # This is for comparison after prediction

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Initialize the model
model = GradientBoostingRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions for 2006
y_pred = model.predict(X_test)

# Evaluation (excluding R-squared due to single sample)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print("Performance Metrics:")
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")

# Compare actual vs. predicted
print("Actual:", y_test.values)
print("Predicted:", y_pred)

In [None]:
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyRegressor  # Import LazyRegressor
from sklearn.utils import shuffle

# Load your malaria dataset
data = pd.read_csv('/content/drive/MyDrive/FINAL PROJECT DATA SCI/V2 Main Malaria Cases.csv')

# Replace zeros with ones globally to avoid issues during log transformation
data = data.replace(0, 1)

# Ensure data is sorted for time series split
data = data.sort_values(by=['Location', 'Period'])

# Define features and target based on your dataset
# Assuming you want to predict 'Estimated number of malaria cases' and use various features for X
X = data[["Population", "Mean Max temp", "Mean Min temp", "Mean Precipitation",
"Median Max temp", "Median Min temp", "Median Precipitation",
"Sum Max temp", "Sum Min temp", "Sum Precipitation",
"Malaria Case Rate", "Temperature Average"]]
y = data['Estimated number of malaria cases']


# Shift negative values in temperature columns (before applying log transformation)
for column in temperature_columns:
    if data[column].min() < 0:
        shift_value = abs(data[column].min()) + 1
        data[column] += shift_value
        print(f"Shifting {column} by {shift_value} to handle negative values.")


# Shuffle the data to ensure randomness (like Boston dataset)
X, y = shuffle(X, y, random_state=13)

# Split data into training and testing sets (90% train, 10% test)
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# Initialize LazyRegressor model
reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)

# Fit the model and get results
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

# Display results
print(models)


In [None]:
class ShiftNegativeTemperatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, temperature_columns):
        self.temperature_columns = temperature_columns

    def fit(self, X, y=None):
        return self  # No fitting needed, just transforming

    def transform(self, X):
        # Shift negative values in temperature columns
        for column in self.temperature_columns:
            if X[column].min() < 0:
                shift_value = abs(X[column].min()) + 1
                X[column] += shift_value
        return X

class MalariaCasePredictor:
    def __init__(self, data_path, save_dir='/content/drive/MyDrive/FINAL PROJECT DATA SCI/Model and Train data', n_splits=5, random_state=42, log_transform=True):
        # Initialize variables like data loading, feature selection, etc.
        self.n_splits = n_splits
        self.random_state = random_state
        self.log_transform = log_transform
        self.save_dir = save_dir
        self.columns = ['Location', 'Period','Population', 'Mean Max temp', 'Mean Min temp', 'Mean Precipitation',
            'Median Max temp', 'Median Min temp', 'Median Precipitation',
            'Sum Max temp', 'Sum Min temp', 'Sum Precipitation', 'Malaria Case Rate',
            'Temperature Average'
        ]
        self.feature_columns = ['Population', 'Mean Max temp', 'Mean Min temp', 'Mean Precipitation',
            'Median Max temp', 'Median Min temp', 'Median Precipitation',
            'Sum Max temp', 'Sum Min temp', 'Sum Precipitation', 'Malaria Case Rate',
            'Temperature Average'
        ]
        self.categorical_columns = ['Location']  # Add any categorical columns
        self.numeric_columns = [
            'Population', 'Mean Max temp', 'Mean Min temp', 'Mean Precipitation',
            'Median Max temp', 'Median Min temp', 'Median Precipitation',
            'Sum Max temp', 'Sum Min temp', 'Sum Precipitation', 'Malaria Case Rate',
            'Temperature Average'
        ]

        # Load your dataset
        self.data = pd.read_csv('/content/drive/MyDrive/FINAL PROJECT DATA SCI/V2 Main Malaria Cases.csv')

        # Check available years for each country
        years_per_country = self.data.groupby('Location')['Period'].unique()
        print(years_per_country)



    def _create_preprocessing_pipeline(self):
        """Create preprocessing pipeline to process features (numerical and categorical)"""

        # List of temperature columns you want to handle
        temperature_columns = [
            'Mean Max temp', 'Mean Min temp', 'Median Max temp', 'Median Min temp',
            'Sum Max temp', 'Sum Min temp', 'Temperature Average'
        ]

        # Numeric preprocessing (scale)
        numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])  # Standardize numeric data

        # Categorical preprocessing (encode)
        categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])  # One-hot encode categorical variables

        # Create the custom transformer to shift negative temperature values
        shift_temp_transformer = ShiftNegativeTemperatureTransformer(temperature_columns)

        # Combine all transformers using ColumnTransformer
        preprocessor = ColumnTransformer(
            transformers=[
                ('shift_temp', shift_temp_transformer, self.numeric_columns),  # Apply to numeric columns
                ('num', numeric_transformer, self.numeric_columns),
                ('cat', categorical_transformer, self.categorical_columns)
            ]
        )

        return preprocessor

    def _log_transform(self, data):
      """Apply log transformation to the target variable or features, avoiding log(0)"""
      epsilon = 1  # Small constant to avoid log(0)
      return np.log(data + epsilon)

    def _reverse_log_transform(self, data):
      """Reverse log transformation to get back to the original scale"""
      epsilon = 1  # Small constant to avoid log(0)
      return np.exp(data) - epsilon


    def _create_hybrid_ensemble(self, X_train, y_train):
      """
      Trains a hybrid ensemble model using Gradient Boosting Regressor, XGBoost Regressor,
      and Random Forest Regressor.
      """
      # One-Hot encode the 'Location' column
      encoder = OneHotEncoder(drop='first', sparse_output=False)  # Avoid multicollinearity, sparse=False for SHAP
      location_encoded = encoder.fit_transform(X_train[['Location']])

      # Convert encoded location data into a DataFrame and concatenate with X_train
      location_encoded_df = pd.DataFrame(location_encoded, columns=encoder.get_feature_names_out(['Location']))
      X_train = pd.concat([X_train.reset_index(drop=True), location_encoded_df.reset_index(drop=True)], axis=1).drop(columns=['Location'], errors='ignore')

      # Define individual models
      self.gbr_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
      self.xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
      self.rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

      # Create the ensemble with the models
      self.models = [self.gbr_model, self.xgb_model, self.rf_model]

      # Example of Voting Regressor (if this is a regression task)
      self.ensemble_model = VotingRegressor(estimators=[('gbr', self.gbr_model),
                                                        ('xgb', self.xgb_model),
                                                        ('rf', self.rf_model)],
                                          weights=[0.4, 0.4, 0.2])  # Adjust weights if needed

      # Train individual models
      self.gbr_model.fit(X_train, y_train)
      self.xgb_model.fit(X_train, y_train)
      self.rf_model.fit(X_train, y_train)

      # Train the ensemble model
      self.ensemble_model.fit(X_train, y_train)

      # Return the trained ensemble model
      return self.ensemble_model




    def compute_metrics(self, y_true, y_pred):
      absolute_errors = []
      relative_accuracies = []
      smapes = []

      epsilon = 1e-8  # Small constant to avoid division by zero or near-zero values
      tolerance = 1e-2  # Define how close the predicted value must be to actual zero to consider it 100% accurate

      for true, pred in zip(y_true, y_pred):
          # Compute Absolute Error
          absolute_errors.append(np.abs(pred - true))

          # Compute sMAPE
          if np.abs(true) < epsilon and np.abs(pred) < epsilon:
              smapes.append(0)  # Both true and predicted are 0, no error
          elif np.abs(true) < epsilon:  # Actual is 0, but predicted is non-zero
              smape = 100 * np.abs(pred)  # Max possible error when actual is 0
              smapes.append(smape)
          else:  # Normal case
              smape = 100 * (np.abs(pred - true) / ((np.abs(true) + np.abs(pred)) / 2))
              smapes.append(smape)

          # Compute Relative Accuracy (RA)
          if np.abs(true) < epsilon:  # Actual value is 0
              if np.abs(pred) < tolerance:  # Predicted value is close to 0
                  relative_accuracies.append(100)  # Perfect prediction
              else:  # Predicted value is far from 0
                  ra = max(0, 100 - (np.abs(pred) / tolerance) * 100)
                  relative_accuracies.append(ra)
          else:  # Normal case where actual is non-zero
              ra = max(0, 100 - (np.abs(pred - true) / np.abs(true)) * 100)
              relative_accuracies.append(ra)

      # Compute mean values
      mean_absolute_error = np.mean(absolute_errors)
      mean_smape = np.mean(smapes)
      mean_relative_accuracy = np.mean(relative_accuracies)

      return mean_relative_accuracy, mean_absolute_error, mean_smape



    def predict_malaria_cases(self, countries=None):
      """
      Predict malaria cases for specified countries using data from 2000-2021 to predict 2022.

      Args:
          countries (list): List of countries to predict for.

      Returns:
          list: Detailed prediction results.
      """
      # Use all countries if none specified
      all_countries = self.data['Location'].unique()
      countries = countries if countries is not None else all_countries

      detailed_results = []
      self.all_actual = []
      self.all_predicted = []

      # TimeSeriesSplit for cross-validation
      tscv = TimeSeriesSplit(n_splits=5)

      for country in countries:
          print(f"=====================================================================\nProcessing country: {country}\n-----------------------------------")

          # Filter country data
          country_data = self.data[self.data['Location'] == country].copy()

          if len(country_data) < 2:
              print(f"Insufficient data for {country}")
              continue

          # Split into training and test sets (train: 2000-2020, test: 2021)
          train_data = country_data[country_data['Period'] <= 2020]
          test_data = country_data[country_data['Period'] == 2021]

          if len(train_data) < 2:
              print(f"Not enough data for training for {country}. Skipping this country.")
              continue

          if len(test_data) < 1:
              print(f"No data for 2021 for {country}. Skipping this country.")
              continue

          # Prepare features and target for training
          self.X_train = train_data[self.numeric_columns + self.categorical_columns]  # Include categorical features
          self.y_train = train_data['Estimated number of malaria cases']

          # Log transform target if needed
          if self.log_transform:
              self.y_train = self._log_transform(self.y_train)

          try:
              # Preprocessing and model pipeline
              preprocessor = self._create_preprocessing_pipeline()
              ensemble_model = self._create_hybrid_ensemble(self.X_train, self.y_train)  # Get the ensemble model, now with RandomForest

              # Create pipeline for the model
              pipeline = Pipeline([('preprocessor', preprocessor), ('ensemble_model', ensemble_model)])

              # TimeSeriesSplit Cross-validation
              for train_idx, val_idx in tscv.split(self.X_train, self.y_train):
                  # Split the data into training and validation sets
                  X_train_cv, X_val_cv = self.X_train.iloc[train_idx], self.X_train.iloc[val_idx]
                  y_train_cv, y_val_cv = self.y_train.iloc[train_idx], self.y_train.iloc[val_idx]

                  # Train the model and make predictions
                  pipeline.fit(X_train_cv, y_train_cv)
                  y_pred_cv = pipeline.predict(X_val_cv)

                  # Calculate metrics for cross-validation fold
                  mean_relative_accuracy, mean_absolute_error, mean_smape = self.compute_metrics(y_val_cv, y_pred_cv)
                  print(f"CV Fold - sMAPE: {mean_smape:.2f}, MAE: {mean_absolute_error:.2f}, Relative Accuracy: {mean_relative_accuracy:.2f}")

              # Fit the model on the full training data
              pipeline.fit(self.X_train, self.y_train)

              # Make predictions for 2021 (test set)
              X_test = test_data[self.numeric_columns + self.categorical_columns].iloc[-1:]
              y_test = test_data['Estimated number of malaria cases']

              # Prediction for 2021
              y_pred_2021 = pipeline.predict(X_test)

              # Reverse log transformation for 2021 predictions if needed
              if self.log_transform:
                  y_pred_2021 = self._reverse_log_transform(y_pred_2021)

              # Debugging output for predicted and actual values
              print(f"-----------------------------------\nActual for {country}:\n2021 = {y_test.iloc[0]}")
              print(f"-----------------------------------\nPredicted for {country}: \n2021 = {int(round(y_pred_2021[0]))}")

              # Forecast for 2022 (using the model trained on 2000-2021)
              last_known_features = train_data[self.numeric_columns + self.categorical_columns].iloc[-1:]
              y_pred_2022 = pipeline.predict(last_known_features)

              # Reverse log transformation for 2022 predictions if needed
              if self.log_transform:
                  y_pred_2022 = self._reverse_log_transform(y_pred_2022)

              # Debugging output for predicted 2022
              print(f"2022 = {int(round(y_pred_2022[0]))}")

              joblib.dump(self.gbr_model, f'{self.save_dir}/gbr_model.pkl')
              joblib.dump(self.xgb_model, f'{self.save_dir}/xgb_model.pkl')
              joblib.dump(self.rf_model, f'{self.save_dir}/rf_model.pkl')
              self.X_train.to_csv(f'{self.save_dir}/X_train.csv', index=False)
              self.y_train.to_csv(f'{self.save_dir}/y_train.csv', index=False)

          except ValueError as e:
              # If there's a ValueError, likely due to empty data for a country/year
              print(f"Error processing data for {country}: {e}")
              continue

          # Calculate metrics for 2021
          smape_2021, mae_2021, relative_accuracy_2021 = self.compute_metrics(y_test, y_pred_2021)

          # # Calculate metrics for 2022
          smape_2022, mae_2022, relative_accuracy_2022 = self.compute_metrics(y_test, y_pred_2022)

          # Collect all actual and predicted values for average accuracy
          if len(y_test) == 1 and len(y_pred_2021) == 1:
              actual_value = y_test.iloc[0]  # Extract the actual value for 2021
              predicted_value = int(round(y_pred_2021[0]))  # Extract the predicted value for 2021

              # Append the extracted values to their respective arrays
              self.all_actual.append(actual_value)

              # Round the predicted value before appending
              self.all_predicted.append(int(round(predicted_value)))

              print(f"-----------------------------------\nAppended Actual: {actual_value} \nAppended Predicted: {predicted_value}")
          else:
              print(f"Warning: Unexpected output sizes for {country} - y_test: {len(y_test)}, y_pred_2021: {len(y_pred_2021)}")

      # Convert lists to numpy arrays or DataFrame
      self.all_actual_array = np.array(self.all_actual)
      self.all_predicted_array = np.array(self.all_predicted)

      # Convert arrays to DataFrame for saving
      self.all_actual_df = pd.DataFrame(self.all_actual_array, columns=['Actual Values'])
      self.all_predicted_df = pd.DataFrame(self.all_predicted_array, columns=['Predicted Values'])

      # Save the DataFrames to CSV
      self.all_actual_df.to_csv(f'{self.save_dir}/all_actual.csv', index=False)
      self.all_predicted_df.to_csv(f'{self.save_dir}/all_predicted.csv', index=False)

      # Debug output for accuracy calculation
      print(f"All Actual: {self.all_actual}")
      print(f"All Predicted: {self.all_predicted}")

      return detailed_results



# Example usage
if __name__ == "__main__":
    predictor = MalariaCasePredictor(
        data_path='/content/drive/MyDrive/FINAL PROJECT DATA SCI/V2 Main Malaria Cases.csv', save_dir='/content/drive/MyDrive/FINAL PROJECT DATA SCI/Model and Train data',
        n_splits=5
    )
    results = predictor.predict_malaria_cases(['Afghanistan', 'Algeria', 'Angola', 'Argentina', 'Armenia', 'Azerbaijan',
                                              'Bangladesh', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Botswana', 'Brazil',
                                              'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon',
                                              'Central African Republic', 'Chad', 'China', 'Colombia', 'Comoros',
                                              'Costa Rica', 'Democratic Republic of the Congo', 'Djibouti',
                                              'Dominican Republic', 'East Timor', 'Ecuador', 'El Salvador',
                                              'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia', 'France', 'Gabon',
                                              'Gambia', 'Georgia', 'Ghana', 'Guatemala','Guinea', 'Guinea-Bissau', 'Guyana',
                                              'Haiti', 'Honduras', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ivory Coast',
                                              'Kenya', 'Kyrgyzstan', 'Laos', 'Liberia', 'Madagascar', 'Malawi','Malaysia',
                                              'Mali', 'Mauritania', 'Mexico', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia',
                                              'Nepal', 'Nicaragua', 'Niger', 'Nigeria', 'North Korea', 'Oman', 'Pakistan',
                                              'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines',
                                              'Republic of the Congo', 'Rwanda', 'Sao Tome and Principe', 'Saudi Arabia',
                                              'Senegal', 'Sierra Leone', 'Solomon Islands', 'Somalia', 'South Africa',
                                              'South Korea', 'South Sudan', 'Sri Lanka', 'Sudan', 'Suriname', 'Syria',
                                              'Tajikistan', 'Thailand', 'Togo', 'Turkey', 'Turkmenistan', 'Uganda','United Arab Emirates',
                                              'United Republic of Tanzania', 'Uzbekistan', 'Vanuatu', 'Venezuela',
                                              'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'])

In [None]:
# File paths for reading X_train, y_train, and models
X_train_path = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/Model and Train data/X_train.csv'
y_train_path = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/Model and Train data/y_train.csv'

gbr_model_path = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/Model and Train data/gbr_model.pkl'
xgb_model_path = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/Model and Train data/xgb_model.pkl'
rf_model_path = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/Model and Train data/rf_model.pkl'

# Reading X_train and y_train data
X_train = pd.read_csv(X_train_path)
y_train = train_data = pd.read_csv(y_train_path)

gbr_model = joblib.load(gbr_model_path)
xgb_model = joblib.load(xgb_model_path)
rf_model = joblib.load(rf_model_path)

def analyze_feature_importance(X_train, y_train, gbr_model, xgb_model, rf_model):
    """
    Analyzes the importance of features for the trained ensemble model.
    """
    y_train = train_data['Estimated number of malaria cases']

    if X_train is None or y_train is None:
        raise ValueError("X_train and y_train are required.")

    # Prepare data for SHAP analysis
    X_train_for_shap = X_train.copy()
    location_backup = X_train['Location'].copy()

    # One-Hot encode the 'Location' column
    encoder = OneHotEncoder(drop='first')  # Avoid multicollinearity by dropping the first category
    location_encoded = encoder.fit_transform(X_train[['Location']]).toarray()

    # Convert the encoded location data into a DataFrame and concatenate with X_train
    location_encoded_df = pd.DataFrame(location_encoded, columns=encoder.get_feature_names_out(['Location']))
    X_train_for_shap = pd.concat([X_train_for_shap, location_encoded_df], axis=1).drop(columns=['Location'], errors='ignore')

    # Compute SHAP values for each model
    explainer_gbr = shap.Explainer(gbr_model, X_train_for_shap)
    shap_values_gbr = explainer_gbr(X_train_for_shap)

    explainer_xgb = shap.Explainer(xgb_model, X_train_for_shap)
    shap_values_xgb = explainer_xgb(X_train_for_shap)

    explainer_rf = shap.Explainer(rf_model, X_train_for_shap)
    shap_values_rf = explainer_rf(X_train_for_shap)

    # Visualize SHAP values (optional)
    shap.summary_plot(shap_values_gbr, X_train_for_shap)
    shap.summary_plot(shap_values_xgb, X_train_for_shap)
    shap.summary_plot(shap_values_rf, X_train_for_shap)

    # Combine SHAP values for each model (optional)
    combined_shap_values = np.mean([shap_values_gbr.values, shap_values_xgb.values, shap_values_rf.values], axis=0)

    # Create a combined summary plot (optional)
    shap.summary_plot(combined_shap_values, X_train_for_shap)

    # Reattach the 'Location' column from the backup
    X_train_for_shap['Location'] = location_backup

    # No return value, as the function primarily performs analysis and

# Call the function to execute the analysis
analyze_feature_importance(X_train, y_train, gbr_model, xgb_model, rf_model)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# File paths for actual and predicted values
all_actual_path = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/Model and Train data/all_actual.csv'
all_predicted_path = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/Model and Train data/all_predicted.csv'

def prediction_vs_actual(y_true, y_pred, model_name='Model'):
    """
    Compares actual vs predicted values and plots the results.
    """
    # Create the plot
    plt.figure(figsize=(8, 6))
    plt.scatter(y_true, y_pred, color='blue', alpha=0.5)
    plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], color='red', lw=2, label="Ideal Line")
    plt.xlabel('Actual Malaria Cases')
    plt.ylabel('Predicted Malaria Cases')
    plt.title(f'{model_name} - Prediction vs Actual')
    plt.legend()
    plt.show()

# Use X_train_without_location (X_train that was used during model training, without 'Location' column)
X_train_without_location = X_train.drop(columns=['Location'], errors='ignore')

# Example usage for Gradient Boosting Regressor (GBR), XGBoost (XGB), and Random Forest (RF)

# GBR predictions
gbr_pred = gbr_model.predict(X_train_without_location)
prediction_vs_actual(y_train, gbr_pred, model_name='Gradient Boosting Regressor')

# XGB predictions
xgb_pred = xgb_model.predict(X_train_without_location)
prediction_vs_actual(y_train, xgb_pred, model_name='XGBoost')

# RF predictions
rf_pred = rf_model.predict(X_train_without_location)
prediction_vs_actual(y_train, rf_pred, model_name='Random Forest')

# Unsupervised Learning

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

In [None]:
class MalariaClustering:
    def __init__(self, df):
        self.df = df
        self.country_data, self.features_scaled = self.preprocess_data_by_country(df)

    def preprocess_data_by_country(self, df):
        # Aggregate data by 'Location' (i.e., country) for features like 'Estimated number of malaria cases', temperature, etc.
        country_data = df.groupby('Location').agg({
            'Estimated number of malaria cases': 'mean',
            'Mean Max temp': 'mean',
            'Mean Min temp': 'mean',
            'Mean Precipitation': 'mean',
            'Temperature Average': 'mean',
            'Median Max temp': 'mean',  # Add more columns as needed
            'Median Min temp': 'mean',
            'Median Precipitation': 'mean',
            'Sum Max temp': 'mean',
            'Sum Min temp': 'mean',
            'Sum Precipitation': 'mean',
            'Malaria Case Rate': 'mean',  # Include 'Malaria Case Rate'
        }).reset_index()

        # Select only numeric columns for clustering (excluding 'Location')
        features = country_data.drop(columns=['Location'])  # Drop 'Location' since it’s non-numeric

        # Ensure that there are no non-numeric values in the features
        features = features.apply(pd.to_numeric, errors='coerce')  # Force conversion to numeric, replacing errors with NaN

        # Drop rows with NaN values (if any)
        features = features.dropna()

        # Scale the numeric features
        scaler = MinMaxScaler()
        features_scaled = scaler.fit_transform(features)  # Scale only numeric features

        return country_data, features_scaled

    def tune_dbscan(self, eps_values=[0.2, 0.5, 0.7], min_samples_values=[3, 5, 7]):
        best_score = -1
        best_eps = None
        best_min_samples = None
        best_clusters = None

        for eps in eps_values:
            for min_samples in min_samples_values:
                # Apply DBSCAN with the given parameters
                dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                dbscan_clusters = dbscan.fit_predict(self.features_scaled)

                # Exclude noise points (-1)
                if len(set(dbscan_clusters)) > 1:  # Ensure there are at least 2 clusters
                    score = silhouette_score(self.features_scaled, dbscan_clusters)
                    print(f"DBSCAN (eps={eps}, min_samples={min_samples}) - Silhouette Score: {score:.4f}")

                    if score > best_score:
                        best_score = score
                        best_eps = eps
                        best_min_samples = min_samples
                        best_clusters = dbscan_clusters

        print(f"Best DBSCAN parameters - eps: {best_eps}, min_samples: {best_min_samples}, Silhouette Score: {best_score:.4f}")

        return best_clusters

    def tune_kmeans(self, k_values=[2, 3, 4, 5, 6]):
        best_score = -1
        best_k = None
        best_clusters = None

        for k in k_values:
            # Apply KMeans with the given k
            kmeans = KMeans(n_clusters=k, random_state=42)
            kmeans_clusters = kmeans.fit_predict(self.features_scaled)

            # Compute silhouette score
            score = silhouette_score(self.features_scaled, kmeans_clusters)
            print(f"KMeans (k={k}) - Silhouette Score: {score:.4f}")

            if score > best_score:
                best_score = score
                best_k = k
                best_clusters = kmeans_clusters

        print(f"Best KMeans parameters - k: {best_k}, Silhouette Score: {best_score:.4f}")

        return best_clusters

    def tune_agglomerative(self, linkage_values=['ward', 'complete', 'average']):
        best_score = -1
        best_linkage = None
        best_clusters = None

        for linkage in linkage_values:
            # Apply AgglomerativeClustering with the given linkage
            agglomerative = AgglomerativeClustering(linkage=linkage)
            agglomerative_clusters = agglomerative.fit_predict(self.features_scaled)

            # Compute silhouette score
            score = silhouette_score(self.features_scaled, agglomerative_clusters)
            print(f"Agglomerative (linkage={linkage}) - Silhouette Score: {score:.4f}")

            if score > best_score:
                best_score = score
                best_linkage = linkage
                best_clusters = agglomerative_clusters

        print(f"Best Agglomerative parameters - linkage: {best_linkage}, Silhouette Score: {best_score:.4f}")

        return best_clusters

    def plot_clusters_by_country(self, clustering_results, filtered_features_scaled):
        # PCA for dimensionality reduction (reduce to 2D)
        pca = PCA(n_components=2)
        pca_result = pca.fit_transform(filtered_features_scaled)

        # t-SNE for further dimensionality reduction
        tsne = TSNE(n_components=2, random_state=42)
        tsne_result = tsne.fit_transform(filtered_features_scaled)

        # Plot for each clustering result
        for name, clusters in clustering_results.items():
            plt.figure(figsize=(12, 6))

            # PCA plot
            plt.subplot(1, 2, 1)
            plt.scatter(pca_result[:, 0], pca_result[:, 1], c=clusters, cmap='viridis')
            plt.title(f'{name} - PCA Visualization')
            plt.colorbar()

            # t-SNE plot
            plt.subplot(1, 2, 2)
            plt.scatter(tsne_result[:, 0], tsne_result[:, 1], c=clusters, cmap='viridis')
            plt.title(f'{name} - t-SNE Visualization')
            plt.colorbar()

            plt.show()

    def evaluate_clustering(self, clustering_results):
        for name, clusters in clustering_results.items():
            score = silhouette_score(self.features_scaled, clusters)
            print(f'Silhouette Score for {name}: {score:.4f}')

    def main(self):
        # Apply DBSCAN to the country-level data
        dbscan_clusters = self.tune_dbscan()

        # Apply KMeans to the country-level data
        kmeans_clusters = self.tune_kmeans()

        # Apply Agglomerative Clustering to the country-level data
        agglomerative_clusters = self.tune_agglomerative()

        # Visualize all clustering results
        clustering_results = {
            'DBSCAN': dbscan_clusters,
            'KMeans': kmeans_clusters,
            'Agglomerative': agglomerative_clusters
        }

        self.plot_clusters_by_country(clustering_results, self.features_scaled)

        # Evaluate clusters with silhouette scores
        self.evaluate_clustering(clustering_results)

# Example usage (with your dataset):
df = pd.read_csv("/content/drive/MyDrive/FINAL PROJECT DATA SCI/V2 Main Malaria Cases.csv")
malaria_clustering = MalariaClustering(df)
malaria_clustering.main()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import geopandas as gpd
import difflib

In [None]:
class MalariaClustering:
    def __init__(self, df, shapefile_path=None):
        self.df = df
        self.features_scaled = None
        self.country_clusters = {}
        self.shapefile_path = shapefile_path
        self.world_map = None

    def diagnose_data(self):
        print("DataFrame Information:")
        print(f"Total rows: {len(self.df)}")
        print(f"Columns: {list(self.df.columns)}")

        print("\nLocation Column Analysis:")
        print(self.df['Location'].value_counts())

    def preprocess_data_by_country(self, country_data):
        try:
            numeric_columns = [
                'Estimated number of malaria cases',
                'Mean Max temp',
                'Mean Min temp',
                'Mean Precipitation',
                'Temperature Average',
                'Median Max temp',
                'Median Min temp',
                'Median Precipitation',
                'Sum Max temp',
                'Sum Min temp',
                'Sum Precipitation',
                'Malaria Case Rate'
            ]

            available_columns = [col for col in numeric_columns if col in country_data.columns]

            if not available_columns:
                print(f"No valid numeric columns found for processing")
                return None, None

            features = country_data[available_columns]
            features = features.apply(pd.to_numeric, errors='coerce')
            features = features.dropna()

            if features.empty:
                print(f"No valid data after cleaning")
                return None, None

            scaler = StandardScaler()
            features_scaled = scaler.fit_transform(features)

            return features, features_scaled

        except Exception as e:
            print(f"Error in preprocessing: {e}")
            return None, None

    def apply_clustering(self):
        # Get unique countries
        unique_countries = self.df['Location'].unique()
        print(f"Total unique countries: {len(unique_countries)}")

        for country in unique_countries:
            try:
                # Filter data for the specific country
                country_data = self.df[self.df['Location'] == country]

                # Preprocess data for the country
                _, features_scaled = self.preprocess_data_by_country(country_data)

                # Check if we have enough data for clustering
                if features_scaled is not None and len(features_scaled) > 1:
                    best_clusters = self.tune_agglo(features_scaled)
                    if best_clusters is not None:
                        self.country_clusters[country] = best_clusters
                        print(f"Clustered {country} successfully")
                else:
                    print(f"Skipping {country} - Not enough data for clustering")

            except Exception as e:
                print(f"Error clustering {country}: {e}")

        print(f"Clustering results: {list(self.country_clusters.keys())}")

    def tune_agglo(self, features_scaled, n_clusters_values=[2, 3, 4, 5]):
        best_score = -1
        best_clusters = None

        for n_clusters in n_clusters_values:
            try:
                agglo = AgglomerativeClustering(n_clusters=n_clusters)
                agglo_clusters = agglo.fit_predict(features_scaled)

                # Only compute silhouette score if we have more than one cluster
                if len(np.unique(agglo_clusters)) > 1:
                    score = silhouette_score(features_scaled, agglo_clusters)
                    print(f"Clusters: {n_clusters}, Silhouette Score: {score:.4f}")

                    if score > best_score:
                        best_score = score
                        best_clusters = agglo_clusters
            except Exception as e:
                print(f"Clustering error with {n_clusters} clusters: {e}")

        return best_clusters

    def load_shapefile(self):
        """
        Load the world shapefile for mapping.
        """
        if not self.shapefile_path:
            raise ValueError("No shapefile path provided. Set shapefile_path during initialization.")

        # Load the world map
        self.world_map = gpd.read_file(self.shapefile_path)

    def _find_best_match(self, name, name_list, cutoff=0.6):
        """
        Find the best matching name in name_list using difflib.
        """
        matches = difflib.get_close_matches(name.lower().strip(),
                                            [n.lower().strip() for n in name_list],
                                            n=1,
                                            cutoff=cutoff)

        return matches[0] if matches else None

    def plot_country_clusters(self, output_path=None):
      if self.world_map is None:
          self.load_shapefile()

      print("Total countries with clusters:", len(self.country_clusters))

      # Track matched and unmatched countries
      matched_countries = []
      unmatched_countries = []

      cluster_data = []
      for country, cluster in self.country_clusters.items():
          # Direct match
          world_map_row = self.world_map[self.world_map['ADMIN'] == country]

          if not world_map_row.empty:
              cluster_data.append({
                  'country': country,
                  'cluster': cluster[0] if isinstance(cluster, np.ndarray) else cluster,
                  'geometry': world_map_row['geometry'].values[0]
              })
              matched_countries.append(country)
          else:
              unmatched_countries.append(country)

      print("Matched countries:", len(matched_countries))
      print("Unmatched countries:", len(unmatched_countries))
      print("Unmatched countries list:", unmatched_countries)

      # Create a GeoDataFrame from the cluster data
      cluster_gdf = gpd.GeoDataFrame(cluster_data)

      # Plot the map with INCREASED size and resolution
      plt.figure(figsize=(20, 10), dpi=1638)
      ax = cluster_gdf.plot(column='cluster', cmap='viridis',
                            legend=True,
                            legend_kwds={'label': 'Clusters', 'orientation': 'horizontal'},
                            edgecolor='0.1',
                            linewidth=0.1)
      plt.title('Malaria Case Clusters by Country', fontsize=15)
      plt.axis('off')

      # Adjust layout to prevent cutting off the legend
      plt.tight_layout(pad=5)

      # Save the plot if output path is provided
      if output_path:
          plt.savefig(output_path, dpi=1638, bbox_inches='tight')

      plt.close()

      return cluster_gdf

    def run(self, shapefile_path=None):
        """
        Run the entire clustering and mapping process.
        """
        if shapefile_path:
            self.shapefile_path = shapefile_path

        # First diagnose the data
        self.diagnose_data()

        # Then attempt clustering
        self.apply_clustering()

        # Optional: plot clusters if shapefile is available
        if self.shapefile_path:
            cluster_gdf = self.plot_country_clusters(output_path="/content/drive/MyDrive/FINAL PROJECT DATA SCI/malaria_clusters_map.png")

            # Print out details about the clustering results
            print("\nClustering Details:")
            print(cluster_gdf)

# Example usage
df = pd.read_csv("/content/drive/MyDrive/FINAL PROJECT DATA SCI/V2 Main Malaria Cases.csv")
malaria_clustering = MalariaClustering(df, shapefile_path="/content/drive/MyDrive/FINAL PROJECT DATA SCI/World Shape file/modified_countries.shp")
malaria_clustering.run()

In [None]:
import pandas as pd
import geopandas as gpd

# Define the mapping
country_name_mapping = {
    'São Tomé and Principe': 'Sao Tome and Principe',
    'eSwatini': 'Eswatini'
}

# Additional mapping to handle potential variations
additional_mapping = {
    'United States of America': 'United States',
    'Congo': 'Republic of Congo',
    'Dem. Rep. Congo': 'Democratic Republic of the Congo',
    # Add more mappings as needed
}

# Update the mapping dictionary
country_name_mapping.update(additional_mapping)

# Load the shapefile
shapefile_path = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/World Shape file/ne_10m_admin_0_countries.shp'
df = pd.read_csv("/content/drive/MyDrive/FINAL PROJECT DATA SCI/V2 Main Malaria Cases.csv")

# Apply mapping to the dataset
df['Location'] = df['Location'].replace(country_name_mapping)

# Load and modify the shapefile
gdf = gpd.read_file(shapefile_path)

# Apply mapping to the shapefile's ADMIN column
gdf['ADMIN'] = gdf['ADMIN'].replace(country_name_mapping)

# Get unique values
unique_locations = df['Location'].unique()
unique_admin_values = gdf['ADMIN'].unique()

# Print unique values
print("Unique Locations in Dataset:")
print(sorted(unique_locations))
print("\nNumber of unique locations in dataset:", len(unique_locations))

print("\nUnique Countries in Shapefile:")
print(sorted(unique_admin_values))
print("\nNumber of unique countries in shapefile:", len(unique_admin_values))

# Find countries in the dataset that are not in the shapefile
dataset_countries = set(df['Location'].unique())
shapefile_countries = set(gdf['ADMIN'].unique())

print("\nCountries in dataset but not in shapefile:")
missing_from_shapefile = sorted(dataset_countries - shapefile_countries)
print(missing_from_shapefile)
print("Number of missing countries:", len(missing_from_shapefile))

print("\nCountries in shapefile but not in dataset:")
extra_in_shapefile = sorted(shapefile_countries - dataset_countries)
print(extra_in_shapefile)
print("Number of extra countries:", len(extra_in_shapefile))

# Filter the shapefile to include only countries in the dataset
filtered_gdf = gdf[gdf['ADMIN'].isin(dataset_countries)]

# Save the filtered shapefile
output_shapefile_path = '/content/drive/MyDrive/FINAL PROJECT DATA SCI/World Shape file/modified_countries.shp'
filtered_gdf.to_file(output_shapefile_path)

print(f"\nFiltered shapefile saved to: {output_shapefile_path}")
print("Number of countries in filtered shapefile:", len(filtered_gdf))

# Optional: Print filtered countries for verification
print("\nCountries in filtered shapefile:")
print(sorted(filtered_gdf['ADMIN'].unique()))

In [None]:
%cd "/content/drive/MyDrive/Colab Notebooks/SupervisedLearningMalaria"

In [2]:
!git init

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/


In [None]:
!ls

In [None]:
!git config --global user.name "Odysseus-droid"
!git config --global user.email "odylim123@gmail.com"

In [1]:
!git add .
!git commit -m "Add all codes Supervised and Unsupervised"

fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git


In [None]:
!git push https://github.com/Odysseus-droid/SupervisedLearningMalaria.git main