In [31]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import silhouette_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split as split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [43]:
# Read both tables and normalise their country-name columns (trimmed and
# lower-cased) as they are loaded, so the join keys line up.
df1 = pd.read_csv(r"/content/Energy Data.csv").assign(
    country=lambda frame: frame['country'].str.strip().str.lower()
)
df2 = pd.read_csv(r"/content/cross-country-literacy-rates.csv").assign(
    Entity=lambda frame: frame['Entity'].str.strip().str.lower()
)

# Inner-join on (country, year); afterwards the right-hand key columns and the
# 'Code' column are removed from the combined frame.
C_df = df1.merge(
    df2,
    how="inner",
    left_on=["country", "year"],
    right_on=["Entity", "Year"],
).drop(columns=['Entity', 'Year', 'Code'])

# Preview the combined dataframe
C_df.head()

Unnamed: 0,country,year,iso_code,population,gdp,biofuel_cons_change_pct,biofuel_cons_change_twh,biofuel_cons_per_capita,biofuel_consumption,biofuel_elec_per_capita,...,solar_share_elec,solar_share_energy,wind_cons_change_pct,wind_cons_change_twh,wind_consumption,wind_elec_per_capita,wind_electricity,wind_energy_per_capita,wind_share_elec,wind_share_energy
0,ASEAN (Ember),2000,,,,,,,,,...,0.0,,,,,,0.0,,0.0,
1,ASEAN (Ember),2001,,,,,,,,,...,0.0,,,,,,0.0,,0.0,
2,ASEAN (Ember),2002,,,,,,,,,...,0.0,,,,,,0.0,,0.0,
3,ASEAN (Ember),2003,,,,,,,,,...,0.0,,,,,,0.0,,0.0,
4,ASEAN (Ember),2004,,,,,,,,,...,0.0,,,,,,0.0,,0.0,


In [44]:
# Mean-impute the numeric columns: collect each numeric column's mean into a
# {column: mean} mapping and let a single fillna apply it column by column.
# Columns that are not numeric are not in the mapping and stay untouched.
numeric_means = {
    name: C_df[name].mean()
    for name in C_df.columns
    if pd.api.types.is_numeric_dtype(C_df[name])
}
C_df = C_df.fillna(value=numeric_means)
C_df.head()

In [45]:
# Per-column tally of the values that are still missing in C_df
C_df.isnull().sum()

In [46]:
# Build `data`, a trimmed copy of C_df that keeps only the headline columns.
# NOTE(review): the cells that follow keep reading C_df, not `data` -- confirm
# which frame the analysis is meant to use.

# Derived series (percentage changes, shares, per-capita figures, ...) that are
# removed from the working copy.
columns_to_drop = [
    'biofuel_cons_change_pct','biofuel_cons_change_twh','biofuel_share_elec','biofuel_share_energy','biofuel_elec_per_capita','biofuel_cons_per_capita',
    'coal_cons_change_pct','coal_cons_change_twh','coal_cons_per_capita','coal_prod_change_pct','coal_prod_change_twh','coal_prod_per_capita','coal_elec_per_capita','coal_share_elec','coal_share_energy',
    'electricity_demand','electricity_generation','electricity_share_energy',
    'gas_cons_change_pct','gas_cons_change_twh','gas_prod_change_pct','gas_prod_change_twh','gas_share_elec','gas_share_energy','gas_elec_per_capita','gas_energy_per_capita','gas_prod_per_capita',
    'hydro_cons_change_pct','hydro_cons_change_twh','hydro_elec_per_capita','hydro_energy_per_capita','hydro_share_elec','hydro_share_energy',
    'nuclear_cons_change_pct','nuclear_cons_change_twh','nuclear_share_elec','nuclear_share_energy','nuclear_elec_per_capita','nuclear_energy_per_capita',
    'oil_cons_change_pct','oil_cons_change_twh','oil_prod_change_pct','oil_prod_change_twh','oil_share_elec','oil_share_energy','oil_elec_per_capita','oil_energy_per_capita','oil_prod_per_capita',
    'other_renewables_cons_change_pct','other_renewables_cons_change_twh','other_renewables_share_elec','other_renewables_share_energy','other_renewables_elec_per_capita_exc_biofuel','other_renewables_share_elec_exc_biofuel','other_renewables_elec_per_capita','other_renewables_energy_per_capita','other_renewable_exc_biofuel_electricity',
    'renewables_cons_change_pct','renewables_cons_change_twh','renewables_share_elec','renewables_share_energy','renewables_elec_per_capita','renewables_energy_per_capita',
    'solar_cons_change_pct','solar_cons_change_twh','solar_elec_per_capita','solar_share_elec','solar_share_energy','solar_energy_per_capita',
    'wind_cons_change_pct','wind_cons_change_twh','wind_elec_per_capita','wind_share_elec','wind_energy_per_capita','wind_share_energy',
    'fossil_share_energy','fossil_share_elec','fossil_cons_change_pct','fossil_cons_change_twh','fossil_elec_per_capita','fossil_energy_per_capita',
    'energy_cons_change_pct','energy_cons_change_twh',
    'low_carbon_cons_change_pct','low_carbon_cons_change_twh','low_carbon_share_elec','low_carbon_elec_per_capita','low_carbon_energy_per_capita','low_carbon_share_energy',
    'net_elec_imports','net_elec_imports_share_demand',
    'energy_per_capita','energy_per_gdp',
    'per_capita_electricity', 'carbon_intensity_elec', 'greenhouse_gas_emissions'
]

data = C_df.copy()
data = data.drop_duplicates()
data = data.dropna(how='all')  # Remove rows with all empty data

# Drop the listed columns BEFORE pruning all-empty columns. In the opposite
# order a listed column that happens to be entirely empty is already gone and
# drop() raises KeyError. A misspelled name in the list still fails loudly.
data = data.drop(columns=columns_to_drop)
data = data.dropna(axis=1, how='all')  # Remove columns with all empty data

# Remove the ASEAN aggregate rows. Country names were lower-cased when loaded
# and the aggregate is labelled 'asean (ember)', so an exact comparison with
# 'asean' never matched anything; match on the prefix instead. na=False treats
# a missing country name as "not ASEAN" so the boolean mask stays valid.
data = data[~data['country'].str.startswith('asean', na=False)]

In [None]:
# Missing-value count for every column.
# NOTE(review): this inspects C_df, not the trimmed `data` frame built in the
# previous cell -- confirm that is intended.
C_df.isnull().sum()

In [52]:
# Consumption series that are aggregated and plotted
consumption_columns = [
    'fossil_fuel_consumption',
    'other_renewable_consumption',
    'renewables_consumption',
    'solar_consumption',
    'wind_consumption',
]

# One row per year, holding the total of each consumption series
yearly_consumption = C_df.groupby('year')[consumption_columns].agg('sum')

# Draw one line per series on a single explicit Axes
fig, ax = plt.subplots(figsize=(15, 9))
yearly_consumption.plot(ax=ax)

# Title, axis labels and legend
fig.suptitle('Total Consumption by Year for All Types')
ax.set(xlabel='Year', ylabel='Total Consumption')
ax.legend(title="Consumption Type")

plt.show()

In [53]:
# Renewable consumption series that feed the 'clean_consumption' feature.
# NOTE(review): if 'renewables_consumption' is already an aggregate that
# contains solar/wind/other renewables, summing it with them double-counts --
# confirm against the dataset's column definitions.
columns_to_group = ['other_renewable_consumption', 'renewables_consumption', 'solar_consumption', 'wind_consumption']

# Keep only the series that are actually present in the DataFrame
available_columns = [col for col in columns_to_group if col in C_df.columns]

# Per-year mean of every available series
grouped_means = C_df.groupby('year')[available_columns].mean().reset_index()

# 'clean_consumption' is the sum of those per-year means, so every row of a
# given year receives the same value.
grouped_means['clean_consumption'] = grouped_means[available_columns].sum(axis=1)

# Attach the feature by mapping on 'year' instead of merging. A merge is not
# idempotent: running the cell a second time would produce
# 'clean_consumption_x' / 'clean_consumption_y' and break the lookup below,
# whereas this assignment simply overwrites the column. Years with no entry in
# grouped_means map to NaN, matching what the left merge produced.
C_df['clean_consumption'] = C_df['year'].map(
    grouped_means.set_index('year')['clean_consumption']
)

# Verify the new feature
C_df[['year', 'clean_consumption']].head()

In [54]:
# Predictor matrix (a one-column DataFrame, so it stays 2-D) and target series
X = C_df.loc[:, ['fossil_fuel_consumption']]
y = C_df.loc[:, 'clean_consumption']

In [55]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# identical from run to run.
X_train, X_test, y_train, y_test = split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [57]:
# 'fossil_fuel_consumption' is a numeric predictor (it is summed and scaled
# elsewhere in this notebook), so it must not be one-hot encoded:
#   * pd.get_dummies would create one indicator column per distinct value;
#   * the encoded X was never used -- X_train / X_test had already been split
#     from the original X;
#   * re-running the cell raised KeyError because the encoded column no longer
#     existed in X.
# Show the predictor's dtype instead, as a check before it is scaled.
X.dtypes

In [58]:
# Standardize the predictor: the scaler learns its mean and variance from the
# training rows only and is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X.shape

In [59]:
# Test-set R2 of k-NN regression on the standardized predictor, for k = 1..10
print("\nKNN Regression with StandardScaler:")
for k in range(1, 11):
    knn1 = KNeighborsRegressor(n_neighbors=k)
    knn1.fit(X_train_scaled, y_train)
    knn_r2 = knn1.score(X_test_scaled, y_test)
    print(f"KNN k={k}:R2={knn_r2:.3f}")

# Test-set R2 of ridge regression on the same data, for alpha = 0, 10, ..., 90
print("\nRidge Regression with StandardScaler:")
for x in range(0, 100, 10):
    ridge = Ridge(alpha=x)
    ridge.fit(X_train_scaled, y_train)
    ridge_r2 = ridge.score(X_test_scaled, y_test)
    print(f"Ridge alpha={x}: R2={ridge_r2:.3f}")

In [None]:
# Rescale the predictor with MinMaxScaler; the scaler is fitted on the training
# rows only and then applied to both partitions.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled1 = scaler.transform(X_train)
X_test_scaled1 = scaler.transform(X_test)

# Test-set R2 of k-NN regression on the min-max scaled predictor, for k = 1..10
print("\nKNN Regression with MinMaxScaler:")
for k in range(1, 11):
    knn1 = KNeighborsRegressor(n_neighbors=k)
    knn1.fit(X_train_scaled1, y_train)
    knn_r2 = knn1.score(X_test_scaled1, y_test)
    print(f"KNN k={k}:R2={knn_r2:.3f}")

# Test-set R2 of ridge regression on the same data, for alpha = 0, 10, ..., 90
print("\nRidge Regression with MinMaxScaler:")
for x in range(0, 100, 10):
    ridge = Ridge(alpha=x)
    ridge.fit(X_train_scaled1, y_train)
    ridge_r2 = ridge.score(X_test_scaled1, y_test)
    print(f"Ridge alpha={x}: R2={ridge_r2:.3f}")

In [None]:
# Rescale the predictor with RobustScaler; the scaler is fitted on the training
# rows only and then applied to both partitions.
scaler = RobustScaler().fit(X_train)
X_train_Rscaled = scaler.transform(X_train)
X_test_Rscaled = scaler.transform(X_test)

# Test-set R2 of k-NN regression on the robust-scaled predictor, for k = 1..10
print("\nKNN Regression with RobustScaler:")
for k in range(1, 11):
    knn1 = KNeighborsRegressor(n_neighbors=k)
    knn1.fit(X_train_Rscaled, y_train)
    r2_knn = knn1.score(X_test_Rscaled, y_test)
    print(f"KNN k={k}: R2={r2_knn:.3f}")

# Test-set R2 of ridge regression on the same data. The sweep covers
# alpha = 0, 10, ..., 90 -- it starts at 0, not at 1.
print("\nRidge Regression with RobustScaler:")
for x in range(0, 100, 10):
    ridge1 = Ridge(alpha=x)
    ridge1.fit(X_train_Rscaled, y_train)
    r2_ridge = ridge1.score(X_test_Rscaled, y_test)
    print(f"Ridge alpha={x}: R2={r2_ridge:.3f}")

In [None]:
# Feature preparation for predicting fossil fuel consumption from country,
# year, population, GDP and literacy rate.
numeric_features = ["year", "population", "gdp", "Literacy rate"]
X = C_df[["country"] + numeric_features]
y = C_df["fossil_fuel_consumption"]

X_train, X_test, y_train, y_test = split(X, y, test_size=0.1, random_state=42)

# One-hot encode the country. The encoder is fitted on the training rows only;
# with handle_unknown='ignore' a country seen only in the test rows is encoded
# as all zeros instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_enc = pd.DataFrame(
    encoder.fit_transform(X_train[['country']]).toarray(),
    columns=encoder.get_feature_names_out(['country']),
    index=X_train.index,
)
X_test_enc = pd.DataFrame(
    encoder.transform(X_test[['country']]).toarray(),
    columns=encoder.get_feature_names_out(['country']),
    index=X_test.index,
)

# Standardize the numeric columns BEFORE PCA. PCA centres its input but does
# not rescale it, so when it is fitted on raw values the columns with the
# largest numeric range (presumably population and gdp) decide the components
# and the 0/1 country indicators contribute almost nothing. Scaling only after
# PCA, as this cell did previously, cannot undo that.
scaler = StandardScaler()
X_train_num = pd.DataFrame(
    scaler.fit_transform(X_train[numeric_features]),
    columns=numeric_features,
    index=X_train.index,
)
X_test_num = pd.DataFrame(
    scaler.transform(X_test[numeric_features]),
    columns=numeric_features,
    index=X_test.index,
)

X_train = pd.concat([X_train_num, X_train_enc], axis=1)
X_test = pd.concat([X_test_num, X_test_enc], axis=1)

# Project onto five principal components. The projection is learned from the
# training rows only; random_state pins the result when scikit-learn selects a
# randomized SVD solver.
pca = PCA(n_components=5, random_state=42)
X_train = pd.DataFrame(pca.fit_transform(X_train), index=X_train.index)
X_test = pd.DataFrame(pca.transform(X_test), index=X_test.index)


In [None]:
# Candidate regressors keyed by a short label. Estimators with internal
# randomness get a fixed seed so repeated runs report the same scores.
models = {
    'lr': LinearRegression(),
    'knn': KNeighborsRegressor(),
    'ridge': Ridge(),
    'svr': SVR(),
    'dt': DecisionTreeRegressor(random_state=42),
    'rf': RandomForestRegressor(random_state=42),
    'gb': GradientBoostingRegressor(random_state=42),
    'mlp': MLPRegressor(max_iter=100000, random_state=42),
}

# Fit every model and report R2 on both partitions. The training score alone
# says nothing about generalisation (an unpruned decision tree can reach 1.0 on
# the rows it was fitted on), so the held-out score is printed next to it.
for name, estimator in models.items():
    estimator.fit(X_train, y_train)

    r2_train = estimator.score(X_train, y_train)
    r2_test = estimator.score(X_test, y_test)
    print(f"{name}: train R2={r2_train:.3f}, test R2={r2_test:.3f}")

In [None]:
# Feature preparation for predicting the average of solar and wind consumption
# from country, year, population, GDP and literacy rate.
numeric_features = ["year", "population", "gdp", "Literacy rate"]
X = C_df[["country"] + numeric_features]
y = C_df[["solar_consumption", "wind_consumption"]].sum(axis = 1) / 2

X_train, X_test, y_train, y_test = split(X, y, test_size=0.1, random_state=42)

# One-hot encode the country. The encoder is fitted on the training rows only;
# with handle_unknown='ignore' a country seen only in the test rows is encoded
# as all zeros instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_enc = pd.DataFrame(
    encoder.fit_transform(X_train[['country']]).toarray(),
    columns=encoder.get_feature_names_out(['country']),
    index=X_train.index,
)
X_test_enc = pd.DataFrame(
    encoder.transform(X_test[['country']]).toarray(),
    columns=encoder.get_feature_names_out(['country']),
    index=X_test.index,
)

# Standardize the numeric columns BEFORE PCA. PCA centres its input but does
# not rescale it, so when it is fitted on raw values the columns with the
# largest numeric range (presumably population and gdp) decide the components
# and the 0/1 country indicators contribute almost nothing. Scaling only after
# PCA, as this cell did previously, cannot undo that.
scaler = StandardScaler()
X_train_num = pd.DataFrame(
    scaler.fit_transform(X_train[numeric_features]),
    columns=numeric_features,
    index=X_train.index,
)
X_test_num = pd.DataFrame(
    scaler.transform(X_test[numeric_features]),
    columns=numeric_features,
    index=X_test.index,
)

X_train = pd.concat([X_train_num, X_train_enc], axis=1)
X_test = pd.concat([X_test_num, X_test_enc], axis=1)

# Project onto five principal components. The projection is learned from the
# training rows only; random_state pins the result when scikit-learn selects a
# randomized SVD solver.
pca = PCA(n_components=5, random_state=42)
X_train = pd.DataFrame(pca.fit_transform(X_train), index=X_train.index)
X_test = pd.DataFrame(pca.transform(X_test), index=X_test.index)

In [None]:
# Fit a decision tree on the prepared features and evaluate it on the held-out
# rows. The seed makes the fitted tree, and therefore every metric below,
# reproducible.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict once and reuse the result for every error metric
y_pred = model.predict(X_test)

# R2 is taken on the test rows so that it describes the same data as
# MAE / MSE / RMSE; the training-set R2 used before is not comparable to them.
r2 = model.score(X_test, y_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** (1 / 2)

print(f"R2: {r2}")
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")

In [None]:
# Predict the literacy rate from year, population and GDP (country is left out
# for simplicity).
X = C_df[["year", "population", "gdp"]]
y = C_df["Literacy rate"]

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=42)

# Standardize the numeric features; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a decision tree. The split above is seeded, so the tree is seeded too;
# otherwise the reported metrics can differ from one run to the next.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train_scaled, y_train)

# Predictions for the held-out rows
y_pred = model.predict(X_test_scaled)

# Evaluate on the held-out rows
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
r2 = model.score(X_test_scaled, y_test)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")
