In [724]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import rasterio

In [725]:
# Data-cleaning cell: load the sustainable-energy dataset, report missing
# values, drop rows without location data, impute the remaining gaps, and
# show the first rows of the cleaned frame.

# 1. Load the raw dataset (path is relative to the notebook's working dir)
df = pd.read_csv('Assets/Data/global-data-on-sustainable-energy.csv')

# 2. Display missing values summary (NaN count per column)
missing_values = df.isnull().sum()
print("Missing Values Summary:\n", missing_values)

# 3. Calculate percentage of missing values relative to the row count
missing_percentages = (missing_values / df.shape[0]) * 100
print("\nMissing Values Percentage:\n", missing_percentages)

# 4. Drop rows where critical location-based information is missing.
# The explicit .copy() makes the result an independent frame so the
# assignments below never trigger SettingWithCopyWarning.
df_cleaned_data = df.dropna(
    subset=['Longitude', 'Latitude', 'Density(P/Km2)', 'Land Area(Km2)']
).copy()
print("\nShape after dropping location-based missing values:", df_cleaned_data.shape)

# 5. Columns whose missing values are replaced with 0.
# NOTE(review): this treats "not reported" as "zero" (e.g. 0 % access to
# electricity) -- confirm that this is the intended interpretation.
zero_fill_columns = [
    'Access to electricity (% of population)',
    'Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)',
    'Electricity from renewables (TWh)',
    'Low-carbon electricity (% electricity)',
    'Access to clean fuels for cooking',
    'Financial flows to developing countries (US $)'
]

# A column -> value mapping restricts fillna to exactly these columns.
# 'Financial flows to developing countries (US $)' is part of the list, so
# it is handled here once; no separate fill is needed afterwards.
df_cleaned_data = df_cleaned_data.fillna(value=dict.fromkeys(zero_fill_columns, 0))

# 6. Columns whose missing values are replaced with the mean of the same
# column within the same 'Entity' (country).
mean_fill_columns = [
    'gdp_per_capita',
    'gdp_growth',
    'Renewable energy share in the total final energy consumption (%)',
    'Energy intensity level of primary energy (MJ/$2017 PPP GDP)',
    'Value_co2_emissions_kt_by_country',
    'Renewable-electricity-generating-capacity-per-capita'
]

# If an entity has no observed value at all for a column, its group mean is
# NaN and those rows stay missing; the summary printed in step 8 shows them.
for col in mean_fill_columns:
    df_cleaned_data[col] = (
        df_cleaned_data
        .groupby('Entity')[col]
        .transform(lambda x: x.fillna(x.mean()))
    )

# 7. Zero-fill the share of renewables in primary energy as well
df_cleaned_data['Renewables (% equivalent primary energy)'] = (
    df_cleaned_data['Renewables (% equivalent primary energy)'].fillna(0)
)

# 8. Verify if missing values are handled
print("\nMissing Values After Cleaning:\n", df_cleaned_data.isnull().sum())

# 9. Check for duplicates (fully identical rows)
duplicates = df_cleaned_data.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# 10. Display final cleaned dataset (last expression -> rich cell output)
df_cleaned_data.head()


Missing Values Summary:
 Entity                                                                 0
Year                                                                   0
Access to electricity (% of population)                               10
Access to clean fuels for cooking                                    169
Renewable-electricity-generating-capacity-per-capita                 931
Financial flows to developing countries (US $)                      2089
Renewable energy share in the total final energy consumption (%)     194
Electricity from fossil fuels (TWh)                                   21
Electricity from nuclear (TWh)                                       126
Electricity from renewables (TWh)                                     21
Low-carbon electricity (% electricity)                                42
Primary energy consumption per capita (kWh/person)                     0
Energy intensity level of primary energy (MJ/$2017 PPP GDP)          207
Value_co2_emissions_kt_by_

Unnamed: 0,Entity,Year,Access to electricity (% of population),Access to clean fuels for cooking,Renewable-electricity-generating-capacity-per-capita,Financial flows to developing countries (US $),Renewable energy share in the total final energy consumption (%),Electricity from fossil fuels (TWh),Electricity from nuclear (TWh),Electricity from renewables (TWh),...,Primary energy consumption per capita (kWh/person),Energy intensity level of primary energy (MJ/$2017 PPP GDP),Value_co2_emissions_kt_by_country,Renewables (% equivalent primary energy),gdp_growth,gdp_per_capita,Density(P/Km2),Land Area(Km2),Latitude,Longitude
0,Afghanistan,2000,1.613591,6.2,9.22,20000.0,44.99,0.16,0.0,0.31,...,302.59482,1.64,760.0,0.0,6.163893,439.055765,60,652230.0,33.93911,67.709953
1,Afghanistan,2001,4.074574,7.2,8.86,130000.0,45.6,0.09,0.0,0.5,...,236.89185,1.74,730.0,0.0,6.163893,439.055765,60,652230.0,33.93911,67.709953
2,Afghanistan,2002,9.409158,8.2,8.47,3950000.0,37.83,0.13,0.0,0.56,...,210.86215,1.4,1029.999971,0.0,6.163893,179.426579,60,652230.0,33.93911,67.709953
3,Afghanistan,2003,14.738506,9.5,8.09,25970000.0,36.66,0.31,0.0,0.63,...,229.96822,1.4,1220.000029,0.0,8.832278,190.683814,60,652230.0,33.93911,67.709953
4,Afghanistan,2004,20.064968,10.9,7.75,0.0,44.24,0.33,0.0,0.56,...,204.23125,1.2,1029.999971,0.0,1.414118,211.382074,60,652230.0,33.93911,67.709953
