In [1]:
from pathlib import Path

import pandas as pd

# Load the raw renewables-share CSV into a DataFrame.
df = pd.read_csv("../data/raw/infrastructure/share-electricity-renewables.csv")

# Preview structure.
# DataFrame.info() prints its summary and returns None, so it is called as a
# plain statement; putting it in a tuple with df.head() would display a
# trailing `None` and a plain-text repr of the frame.
df.info()

# Last expression of the cell -> displayed with the notebook's rich repr.
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7943 entries, 0 to 7942
Data columns (total 4 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Entity                      7943 non-null   object 
 1   Code                        6395 non-null   object 
 2   Year                        7943 non-null   int64  
 3   Renewables - % electricity  7943 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 248.3+ KB


(          Entity Code  Year  Renewables - % electricity
 0  ASEAN (Ember)  NaN  2000                   19.347086
 1  ASEAN (Ember)  NaN  2001                   19.066320
 2  ASEAN (Ember)  NaN  2002                   17.664303
 3  ASEAN (Ember)  NaN  2003                   16.672487
 4  ASEAN (Ember)  NaN  2004                   15.700016,
 None)

In [2]:
# Map the raw headers onto short snake_case names.
# The fourth column is addressed by position (df.columns[3]) rather than by
# its header text, so it is renamed whatever that header happens to be.
column_map = {
    "Entity": "country",
    "Code": "iso_code",
    "Year": "year",
    df.columns[3]: "renewables_share",
}
df = df.rename(columns=column_map)

df.head()


Unnamed: 0,country,iso_code,year,renewables_share
0,ASEAN (Ember),,2000,19.347086
1,ASEAN (Ember),,2001,19.06632
2,ASEAN (Ember),,2002,17.664303
3,ASEAN (Ember),,2003,16.672487
4,ASEAN (Ember),,2004,15.700016


In [3]:
# Normalise dtypes in a single step:
#   - renewables_share -> numeric; unparseable entries become NaN (errors="coerce")
#   - year             -> int
df = df.assign(
    renewables_share=pd.to_numeric(df["renewables_share"], errors="coerce"),
    year=df["year"].astype(int),
)


In [4]:
# Keep only rows that carry an iso_code (the "ASEAN (Ember)" rows in the
# preview, for instance, have none) and renumber the index from 0.
# NOTE(review): any entity with a non-null but non-ISO code would still pass
# this filter -- confirm whether the raw file contains such codes.
rows_before = len(df)

df = df.dropna(subset=["iso_code"]).reset_index(drop=True)

# Report how many rows the filter discarded.
rows_removed = rows_before - len(df)
print("Rows before:", rows_before)
print("Rows after :", len(df))
print("Rows removed:", rows_removed)


Rows before: 7943
Rows after : 6395
Rows removed: 1548


In [5]:
# Share of missing values per column, expressed as a percentage.
missing_pct = df.isna().mean() * 100
print("Missing values (%):")
print(missing_pct)

# Descriptive statistics for the numeric columns.
summary_stats = df.describe()
print("\nSummary statistics:")
print(summary_stats)


Missing values (%):
country             0.0
iso_code            0.0
year                0.0
renewables_share    0.0
dtype: float64

Summary statistics:
              year  renewables_share
count  6395.000000       6395.000000
mean   2007.988898         29.554796
std      10.105714         31.926281
min    1985.000000          0.000000
25%    2001.000000          1.705077
50%    2009.000000         16.756989
75%    2016.000000         51.960640
max    2024.000000        100.000000


In [6]:
# Select rows whose renewables_share falls outside the 0-100 interval.
# Plain comparisons are used so that NaN entries (which compare False on both
# sides) are not selected.
share = df["renewables_share"]
out_of_range = (share < 0) | (share > 100)
unusual = df[out_of_range]

# Show the offending rows if there are any, otherwise a short message.
"No unusual values found" if len(unusual) == 0 else unusual


'No unusual values found'

In [7]:
# Write the cleaned frame to the interim data folder (row index omitted).
output_path = "../data/interim/renewables_share_clean.csv"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists first; this is a no-op when it is already there.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

print(f"Saved cleaned dataset to: {output_path}")


Saved cleaned dataset to: ../data/interim/renewables_share_clean.csv
