In [707]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import rasterio

In [None]:
# Load the sustainable-energy dataset from the project's asset directory.
df = pd.read_csv('Assets/Data/global-data-on-sustainable-energy.csv')

# Per-column count of missing cells.
missing_values = df.isna().sum()
print("Missing Values Summary:\n", missing_values)

# The same counts expressed as a percentage of the number of rows.
total_rows = len(df.index)
missing_percentages = missing_values.div(total_rows).mul(100)
print("\nMissing Values Percentage:\n", missing_percentages)



# Discard every row that lacks any of the location-related fields below.
# The trailing .copy() makes the result an explicit, independent frame
# before the column assignments that follow.
location_columns = ['Longitude', 'Latitude', 'Density(P/Km2)', 'Land Area(Km2)']
df_cleaned_data = df.dropna(subset=location_columns).copy()
print("\nShape after dropping location-based missing values:", df_cleaned_data.shape)

# Columns in which a missing entry is replaced by 0.
zero_fill_columns = [
    'Access to electricity (% of population)',
    'Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)',
    'Electricity from renewables (TWh)',
    'Low-carbon electricity (% electricity)',
    'Access to clean fuels for cooking',
    'Financial flows to developing countries (US $)'
]

# Fill a selection of just those columns, then write it back through .loc
# so the assignment targets df_cleaned_data itself.
zero_filled = df_cleaned_data[zero_fill_columns].fillna(0)
df_cleaned_data.loc[:, zero_fill_columns] = zero_filled

# Columns whose gaps are filled with the mean of the same 'Entity'.
mean_fill_columns = [
    'gdp_per_capita',
    'gdp_growth',
    'Renewable energy share in the total final energy consumption (%)',
    'Energy intensity level of primary energy (MJ/$2017 PPP GDP)',
    'Value_co2_emissions_kt_by_country',
    'Renewable-electricity-generating-capacity-per-capita'
]

# transform('mean') broadcasts each Entity's mean (missing values skipped) to
# every row of that Entity; fillna then uses it only where the value is
# missing. An Entity with no observed value for a column has a NaN mean, so
# its gaps in that column remain NaN.
entity_keys = df_cleaned_data['Entity']
for col in mean_fill_columns:
    entity_means = df_cleaned_data[col].groupby(entity_keys).transform('mean')
    df_cleaned_data[col] = df_cleaned_data[col].fillna(entity_means)

# 'Financial flows to developing countries (US $)' is one of the
# zero_fill_columns and is therefore already zero-filled; it needs no
# further handling here.

# Missing renewables-share values are replaced by 0.
df_cleaned_data['Renewables (% equivalent primary energy)'] = (
    df_cleaned_data['Renewables (% equivalent primary energy)'].fillna(0)
)

# Report how many values are still missing in each column.
remaining_missing = df_cleaned_data.isnull().sum()
print("\nMissing Values After Cleaning:\n", remaining_missing)

# Count rows that repeat an earlier row in every column.
duplicate_mask = df_cleaned_data.duplicated()
duplicates = duplicate_mask.sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Last expression of the cell: preview the first rows of the cleaned frame.
df_cleaned_data.head()
