#1. ***Introduction***



With the growing demand for energy worldwide, it is crucial to analyze global energy consumption trends, identify regional disparities, and forecast future energy needs. This project aims to use exploratory data analysis (EDA), data visualization, and machine learning models to predict future energy consumption trends and support sustainable energy policies.

# 2.**Goal of the Project**

World Energy Consumption Treand



#3.***Features/Columns***

Instance rows:22012 rows
Feature columns:129 columns


#4.**Data Preprocessing**




**1.Importing Libraries**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ***Loading Dataset***


In [4]:
df = pd.read_csv("/content/World Energy Consumption.csv")
df


Unnamed: 0,country,year,iso_code,population,gdp,biofuel_cons_change_pct,biofuel_cons_change_twh,biofuel_cons_per_capita,biofuel_consumption,biofuel_elec_per_capita,...,solar_share_elec,solar_share_energy,wind_cons_change_pct,wind_cons_change_twh,wind_consumption,wind_elec_per_capita,wind_electricity,wind_energy_per_capita,wind_share_elec,wind_share_energy
0,ASEAN (Ember),2000,,,,,,,,,...,0.000,,,,,,0.0,,0.0,
1,ASEAN (Ember),2001,,,,,,,,,...,0.000,,,,,,0.0,,0.0,
2,ASEAN (Ember),2002,,,,,,,,,...,0.000,,,,,,0.0,,0.0,
3,ASEAN (Ember),2003,,,,,,,,,...,0.000,,,,,,0.0,,0.0,
4,ASEAN (Ember),2004,,,,,,,,,...,0.000,,,,,,0.0,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22007,Zimbabwe,2018,ZWE,15052191.0,2.271535e+10,,,,,25.910,...,0.218,,,,,0.0,0.0,,0.0,
22008,Zimbabwe,2019,ZWE,15354606.0,,,,,,24.748,...,0.364,,,,,0.0,0.0,,0.0,
22009,Zimbabwe,2020,ZWE,15669663.0,,,,,,22.336,...,0.395,,,,,0.0,0.0,,0.0,
22010,Zimbabwe,2021,ZWE,15993525.0,,,,,,23.760,...,0.498,,,,,0.0,0.0,,0.0,


#1. **Understanding the data**




In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22012 entries, 0 to 22011
Columns: 129 entries, country to wind_share_energy
dtypes: float64(126), int64(1), object(2)
memory usage: 21.7+ MB
None


In [6]:
print(df.head())

         country  year iso_code  population  gdp  biofuel_cons_change_pct  \
0  ASEAN (Ember)  2000      NaN         NaN  NaN                      NaN   
1  ASEAN (Ember)  2001      NaN         NaN  NaN                      NaN   
2  ASEAN (Ember)  2002      NaN         NaN  NaN                      NaN   
3  ASEAN (Ember)  2003      NaN         NaN  NaN                      NaN   
4  ASEAN (Ember)  2004      NaN         NaN  NaN                      NaN   

   biofuel_cons_change_twh  biofuel_cons_per_capita  biofuel_consumption  \
0                      NaN                      NaN                  NaN   
1                      NaN                      NaN                  NaN   
2                      NaN                      NaN                  NaN   
3                      NaN                      NaN                  NaN   
4                      NaN                      NaN                  NaN   

   biofuel_elec_per_capita  ...  solar_share_elec  solar_share_energy  \
0      

In [7]:
print(df.isnull().sum())

country                       0
year                          0
iso_code                   5500
population                 3889
gdp                       10899
                          ...  
wind_elec_per_capita      14947
wind_electricity          14016
wind_energy_per_capita    17947
wind_share_elec           15126
wind_share_energy         17911
Length: 129, dtype: int64


In [8]:
df.shape

(22012, 129)

In [9]:
df.columns

Index(['country', 'year', 'iso_code', 'population', 'gdp',
       'biofuel_cons_change_pct', 'biofuel_cons_change_twh',
       'biofuel_cons_per_capita', 'biofuel_consumption',
       'biofuel_elec_per_capita',
       ...
       'solar_share_elec', 'solar_share_energy', 'wind_cons_change_pct',
       'wind_cons_change_twh', 'wind_consumption', 'wind_elec_per_capita',
       'wind_electricity', 'wind_energy_per_capita', 'wind_share_elec',
       'wind_share_energy'],
      dtype='object', length=129)

In [10]:
df.duplicated().sum()

np.int64(0)

#2. **Data cleaning**

In [11]:
# Check missing values
df.isnull().sum()

Unnamed: 0,0
country,0
year,0
iso_code,5500
population,3889
gdp,10899
...,...
wind_elec_per_capita,14947
wind_electricity,14016
wind_energy_per_capita,17947
wind_share_elec,15126


In [12]:
# Fill missing values
numeric_cols = df.select_dtypes(include=['number']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
df[categorical_cols] = df[categorical_cols].apply(lambda x: x.fillna(x.mode()[0]))


In [13]:
# Convert year to datetime format
if 'year' in df.columns:
    df['year'] = pd.to_datetime(df['year'], format='%Y')



In [14]:
# Remove duplicates
df.drop_duplicates(inplace=True)
df


Unnamed: 0,country,year,iso_code,population,gdp,biofuel_cons_change_pct,biofuel_cons_change_twh,biofuel_cons_per_capita,biofuel_consumption,biofuel_elec_per_capita,...,solar_share_elec,solar_share_energy,wind_cons_change_pct,wind_cons_change_twh,wind_consumption,wind_elec_per_capita,wind_electricity,wind_energy_per_capita,wind_share_elec,wind_share_energy
0,ASEAN (Ember),2000-01-01,AGO,6991543.0,4.167411e+10,8.179,0.0,15.482,0.673,0.146,...,0.000,0.0,20.0,0.0,0.005,0.0,0.0,0.051,0.0,0.003
1,ASEAN (Ember),2001-01-01,AGO,6991543.0,4.167411e+10,8.179,0.0,15.482,0.673,0.146,...,0.000,0.0,20.0,0.0,0.005,0.0,0.0,0.051,0.0,0.003
2,ASEAN (Ember),2002-01-01,AGO,6991543.0,4.167411e+10,8.179,0.0,15.482,0.673,0.146,...,0.000,0.0,20.0,0.0,0.005,0.0,0.0,0.051,0.0,0.003
3,ASEAN (Ember),2003-01-01,AGO,6991543.0,4.167411e+10,8.179,0.0,15.482,0.673,0.146,...,0.000,0.0,20.0,0.0,0.005,0.0,0.0,0.051,0.0,0.003
4,ASEAN (Ember),2004-01-01,AGO,6991543.0,4.167411e+10,8.179,0.0,15.482,0.673,0.146,...,0.000,0.0,20.0,0.0,0.005,0.0,0.0,0.051,0.0,0.003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22007,Zimbabwe,2018-01-01,ZWE,15052191.0,2.271535e+10,8.179,0.0,15.482,0.673,25.910,...,0.218,0.0,20.0,0.0,0.005,0.0,0.0,0.051,0.0,0.003
22008,Zimbabwe,2019-01-01,ZWE,15354606.0,4.167411e+10,8.179,0.0,15.482,0.673,24.748,...,0.364,0.0,20.0,0.0,0.005,0.0,0.0,0.051,0.0,0.003
22009,Zimbabwe,2020-01-01,ZWE,15669663.0,4.167411e+10,8.179,0.0,15.482,0.673,22.336,...,0.395,0.0,20.0,0.0,0.005,0.0,0.0,0.051,0.0,0.003
22010,Zimbabwe,2021-01-01,ZWE,15993525.0,4.167411e+10,8.179,0.0,15.482,0.673,23.760,...,0.498,0.0,20.0,0.0,0.005,0.0,0.0,0.051,0.0,0.003


# 3️.**Exploratory Data Analysis**

In [15]:
# Summary statistics
print(df.describe())


                                year    population           gdp  \
count                          22012  2.201200e+04  2.201200e+04   
mean   1974-03-19 18:28:58.833363616  8.793914e+07  2.016329e+11   
min              1900-01-01 00:00:00  1.833000e+03  1.642060e+08   
25%              1946-01-01 00:00:00  2.749202e+06  4.100318e+10   
50%              1984-01-01 00:00:00  6.991543e+06  4.167411e+10   
75%              2003-01-01 00:00:00  1.800519e+07  4.258133e+10   
max              2022-01-01 00:00:00  7.975105e+09  1.136300e+14   
std                              NaN  4.227267e+08  1.720499e+12   

       biofuel_cons_change_pct  biofuel_cons_change_twh  \
count             22012.000000             22012.000000   
mean                 11.187971                 0.331917   
min                -100.000000               -50.843000   
25%                   8.179000                 0.000000   
50%                   8.179000                 0.000000   
75%                   8.179000   

In [16]:

# Unique energy sources
if 'energy_source' in df.columns:
    print(df['energy_source'].unique())



In [17]:
 # Total energy consumption by year
if 'year' in df.columns and 'total_energy_consumption' in df.columns:
    energy_consumption_by_year = df.groupby('year')['total_energy_consumption'].sum()
    print(energy_consumption_by_year)

#  4: Data Visualization

In [18]:
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
print(df.columns)


Index(['country', 'year', 'iso_code', 'population', 'gdp',
       'biofuel_cons_change_pct', 'biofuel_cons_change_twh',
       'biofuel_cons_per_capita', 'biofuel_consumption',
       'biofuel_elec_per_capita',
       ...
       'solar_share_elec', 'solar_share_energy', 'wind_cons_change_pct',
       'wind_cons_change_twh', 'wind_consumption', 'wind_elec_per_capita',
       'wind_electricity', 'wind_energy_per_capita', 'wind_share_elec',
       'wind_share_energy'],
      dtype='object', length=129)


In [None]:
# Line chart: Global Energy Consumption over time
plt.figure(figsize=(10, 5))
sns.lineplot(data=df, x='year', y='biofuel_consumption')

plt.title("Global Energy Consumption Over Time")
plt.xlabel("Year")
plt.ylabel("Energy Consumption (TWh)")
plt.show()



In [None]:
if 'region' in df.columns:
    df_region = df.groupby('region')['energy_consumption'].sum().reset_index()

plt.figure(figsize=(12,10))
sns.scatterplot(data=df, x='country', y='biofuel_elec_per_capita')
plt.title("Energy Consumption by Region")
plt.xlabel("country")
plt.ylabel("Energy Consumption (TWh)")
plt.show()


In [None]:
# Visualizing missing values
plt.figure(figsize=(12, 5))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
plt.title("Missing Values Heatmap")
plt.show()

In [None]:
# Distribution of Primary Energy Consumption
plt.figure(figsize=(10, 5))
sns.histplot(df['primary_energy_consumption'], bins=30, kde=True)
plt.xlabel('Primary Energy Consumption (TWh)')
plt.title('Distribution of Primary Energy Consumption')
plt.show()


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Define X and y
X = df.drop(columns=['energy_source'], errors='ignore')  # Replace with your target column
# Assuming 'primary_energy_consumption' is a valid column in your DataFrame:
y = df['primary_energy_consumption']  # Replace with actual target column

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train and Test data successfully created!")


In [None]:
from sklearn.preprocessing import LabelEncoder

# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Apply Label Encoding
le = LabelEncoder()
for col in categorical_cols:
    X_train[col] = le.fit_transform(X_train[col].astype(str))  # Convert to string before encoding
    X_test[col] = le.transform(X_test[col].astype(str))  # Apply same encoding to test data|

In [None]:
import numpy as np
y = np.log1p(y)  # Log transformation to reduce large numbers


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 1️⃣ Handle Categorical Columns
categorical_cols = X.select_dtypes(include=['object']).columns
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# 👉 Convert 'year' to numeric (e.g., ordinal) before scaling
if 'year' in X.columns:
    X['year'] = X['year'].dt.year  # Extract year as a number

# 2️⃣ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3️⃣ Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4️⃣ Train Model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# 5️⃣ Make Predictions
y_pred = model.predict(X_test_scaled)

# 6️⃣ Evaluate Model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Model Performance:\nMAE: {mae}\nMSE: {mse}\nR² Score: {r2}")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 1️⃣ Handle Categorical Columns
categorical_cols = X.select_dtypes(include=['object']).columns
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# 👉 Convert 'year' to numeric
if 'year' in X.columns and X['year'].dtype == 'datetime64[ns]':
    X['year'] = X['year'].dt.year

# 2️⃣ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3️⃣ Feature Scaling (optional for trees, but okay to include)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4️⃣ Train Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# 5️⃣ Make Predictions
y_pred = rf_model.predict(X_test_scaled)

# 6️⃣ Evaluate Model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Random Forest Performance:\nMAE: {mae:.4f}\nMSE: {mse:.4f}\nR² Score: {r2:.4f}")


In [None]:
df.dtypes

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
sns.boxplot(data=X)
plt.xticks(rotation=90)
plt.show()


In [None]:
from scipy.stats import zscore
import numpy as np

# Compute Z-scores for all numerical columns
z_scores = np.abs(zscore(X))

# Keep only rows where Z-score < 3 for all columns
X_clean = X[(z_scores < 3).all(axis=1)]
y_clean = y.loc[X_clean.index]  # Ensure target matches cleaned data

print(f"Before cleaning: {X.shape[0]} rows")
print(f"After cleaning: {X_clean.shape[0]} rows")
