In [1]:
import pandas as pd
import sqlite3


In [2]:
db_path = "global_energy.db"
conn = sqlite3.connect(db_path)  # creates or opens the database file

In [20]:
df_gp = pd.read_csv("Data/global_power_plant_database.csv")

# Example: Keep only essential columns
keep_cols_gp = [
    "country",
    "country_long",
    "name",
    "latitude",
    "longitude",
    "primary_fuel",
    "capacity_mw",
    "commissioning_year",
    # Keep generation columns if desired:
    "generation_gwh_2013", "generation_gwh_2014", "generation_gwh_2015",
    "generation_gwh_2016", "generation_gwh_2017"
]

# Create a filtered DataFrame
# Global Power Plant (df_gp)


df_gp_filtered = df_gp[keep_cols_gp].copy()
df_gp_filtered = df_gp_filtered.dropna(axis='columns', how='all')
df_gp_filtered.rename(columns={"country": "country_short"}, inplace=True)

df_gp_filtered.rename(columns={"country_long": "country"}, inplace=True)
print(df_gp_filtered.head())
print(df_gp_filtered.columns)

  country_short      country  \
0           AFG  Afghanistan   
1           AFG  Afghanistan   
2           AFG  Afghanistan   
3           AFG  Afghanistan   
4           AFG  Afghanistan   

                                               name  latitude  longitude  \
0      Kajaki Hydroelectric Power Plant Afghanistan    32.322    65.1190   
1                                      Kandahar DOG    31.670    65.7950   
2                                      Kandahar JOL    31.623    65.7920   
3     Mahipar Hydroelectric Power Plant Afghanistan    34.556    69.4787   
4  Naghlu Dam Hydroelectric Power Plant Afghanistan    34.641    69.7170   

  primary_fuel  capacity_mw  commissioning_year  generation_gwh_2013  \
0        Hydro         33.0                 NaN                  NaN   
1        Solar         10.0                 NaN                  NaN   
2        Solar         10.0                 NaN                  NaN   
3        Hydro         66.0                 NaN               

  df_gp = pd.read_csv("Data/global_power_plant_database.csv")


In [4]:
df_1 = pd.read_csv("Data/worldbank.csv")

df_1 = df_1.dropna(axis='columns', how='all')


# Drop columns we definitely don't need
df_1.drop(columns=["Series Code"], inplace=True, errors="ignore")
# World Bank (df_1)
df_1.rename(columns={"Country Name": "country"}, inplace=True)
print(df_1.head())

print("Column names:\n", df_1.columns)

       country Country Code  \
0  Afghanistan          AFG   
1  Afghanistan          AFG   
2  Afghanistan          AFG   
3  Afghanistan          AFG   
4  Afghanistan          AFG   

                                         Series Name 1960 [YR1960]  \
0  Carbon dioxide (CO2) emissions from Building (...            ..   
1                              Coal rents (% of GDP)            ..   
2      Adjusted savings: energy depletion (% of GNI)            ..   
3   Adjusted savings: energy depletion (current US$)            ..   
4  Alternative and nuclear energy (% of total ene...            ..   

  1961 [YR1961] 1962 [YR1962] 1963 [YR1963] 1964 [YR1964] 1965 [YR1965]  \
0            ..            ..            ..            ..            ..   
1            ..            ..            ..            ..            ..   
2            ..            ..            ..            ..            ..   
3            ..            ..            ..            ..            ..   
4            ..  

In [5]:
# Load the Excel file
df_2 = pd.read_excel("Data/World Energy Balances Highlights 2024.xlsx",
                     sheet_name="TimeSeries_1971-2023", header=None)

# Drop completely empty columns
df_2 = df_2.dropna(axis='columns', how='all')

# Set the second row as the header
new_header = df_2.iloc[1]  # Get the second row
df_2 = df_2[2:]  # Drop the first two rows, keeping the first row as data
df_2.columns = new_header  # Set the new header
# IEA Balances (df_2)
df_2.rename(columns={"Country": "country"}, inplace=True)
# Reset the index for clarity
df_2.reset_index(drop=True, inplace=True)

# Display the DataFrame and column names
print(df_2.head())
print("Column names:\n", df_2.columns)

1    country                   Product                                   Flow  \
0  Australia  Coal, peat and oil shale                        Production (PJ)   
1  Australia  Coal, peat and oil shale                           Imports (PJ)   
2  Australia  Coal, peat and oil shale                           Exports (PJ)   
3  Australia  Coal, peat and oil shale               Total energy supply (PJ)   
4  Australia  Coal, peat and oil shale  Electricity, CHP and heat plants (PJ)   

1      NoCountry                     NoProduct  \
0  01. Australia  01. Coal, peat and oil shale   
1  01. Australia  01. Coal, peat and oil shale   
2  01. Australia  01. Coal, peat and oil shale   
3  01. Australia  01. Coal, peat and oil shale   
4  01. Australia  01. Coal, peat and oil shale   

1                                     NoFlow         1971         1972  \
0                        01. Production (PJ)  1368.251838  1648.414248   
1                           02. Imports (PJ)            0       

In [6]:
df_3 = pd.read_csv("Data/GreenGrowth_reduced.csv")
df_3 = df_3.dropna(axis='columns', how='all')
df_3.rename(columns={"Measure": "Measure_desc"}, inplace=True)
# OECD Green Growth (df_3)
df_3.rename(columns={"Reference area": "country"}, inplace=True)

print(df_3.head())
print("Column names:\n", df_3.columns)

  REF_AREA country      MEASURE                             Measure_desc  \
0      LVA  Latvia  WATER_FWCAP  Total freshwater abstraction per capita   
1      LVA  Latvia  WATER_FWCAP  Total freshwater abstraction per capita   
2      LVA  Latvia  WATER_FWCAP  Total freshwater abstraction per capita   
3      LVA  Latvia  WATER_FWCAP  Total freshwater abstraction per capita   
4      LVA  Latvia  WATER_FWCAP  Total freshwater abstraction per capita   

   TIME_PERIOD  OBS_VALUE  
0         1990        NaN  
1         1991     247.00  
2         1992     237.96  
3         1993     193.19  
4         1994     180.58  
Column names:
 Index(['REF_AREA', 'country', 'MEASURE', 'Measure_desc', 'TIME_PERIOD',
       'OBS_VALUE'],
      dtype='object')


In [21]:
df_gp_filtered.to_sql(
    name="power_plants",
    con=conn,
    if_exists="replace",   # overwrite the table if it exists
    index=False            # don't write the DataFrame index as a column
)
print("Power Plant data loaded into 'power_plants' table.")

Power Plant data loaded into 'power_plants' table.


In [8]:
df_1.to_sql(
    name="world_bank",
    con=conn,
    if_exists="replace",
    index=False
)
print("World Bank data loaded into 'world_bank' table.")

World Bank data loaded into 'world_bank' table.


In [9]:
df_2.to_sql(
    name="iea_balances",
    con=conn,
    if_exists="replace",
    index=False
)
print("IEA Balances loaded into 'iea_balances' table.")

IEA Balances loaded into 'iea_balances' table.


In [10]:
df_3.to_sql(
    name="oecd_greengrowth",
    con=conn,
    if_exists="replace",
    index=False
)
print("Green Growth data loaded into 'oecd_greengrowth' table.")

Green Growth data loaded into 'oecd_greengrowth' table.


In [13]:
print(df_3.head())  # or df_3.sample(10)

  REF_AREA country      MEASURE                             Measure_desc  \
0      LVA  Latvia  WATER_FWCAP  Total freshwater abstraction per capita   
1      LVA  Latvia  WATER_FWCAP  Total freshwater abstraction per capita   
2      LVA  Latvia  WATER_FWCAP  Total freshwater abstraction per capita   
3      LVA  Latvia  WATER_FWCAP  Total freshwater abstraction per capita   
4      LVA  Latvia  WATER_FWCAP  Total freshwater abstraction per capita   

   TIME_PERIOD  OBS_VALUE  
0         1990        NaN  
1         1991     247.00  
2         1992     237.96  
3         1993     193.19  
4         1994     180.58  


In [11]:
print(df_3.columns)

Index(['REF_AREA', 'country', 'MEASURE', 'Measure_desc', 'TIME_PERIOD',
       'OBS_VALUE'],
      dtype='object')


In [12]:
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
print("Tables in database:", tables)

Tables in database: [('country_capacity',), ('power_plants',), ('world_bank',), ('iea_balances',), ('oecd_greengrowth',)]
