In [591]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import numpy as np

## CLEANING DATA

In [592]:
# Load the data into a pandas DataFrame.
# The path is relative, so the notebook has to be run from the directory that
# contains the CO2/ folder.
energy = pd.read_csv('CO2/energy.csv')

In [593]:
energy.head()

Unnamed: 0.1,Unnamed: 0,Country,Energy_type,Year,Energy_consumption,Energy_production,GDP,Population,Energy_intensity_per_capita,Energy_intensity_by_GDP,CO2_emission
0,0,World,all_energy_types,1980,292.89979,296.337228,27770.910281,4298127.0,68.145921,10.547,4946.62713
1,1,World,coal,1980,78.656134,80.114194,27770.910281,4298127.0,68.145921,10.547,1409.790188
2,2,World,natural_gas,1980,53.865223,54.761046,27770.910281,4298127.0,68.145921,10.547,1081.593377
3,3,World,petroleum_n_other_liquids,1980,132.064019,133.111109,27770.910281,4298127.0,68.145921,10.547,2455.243565
4,4,World,nuclear,1980,7.5757,7.5757,27770.910281,4298127.0,68.145921,10.547,0.0


In [594]:
# Remove the leftover 'Unnamed: 0' column; in the preview it appears to simply
# repeat the row index, so it carries no information.
energy = energy.drop(columns='Unnamed: 0')

In [595]:
energy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55440 entries, 0 to 55439
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Country                      55440 non-null  object 
 1   Energy_type                  55440 non-null  object 
 2   Year                         55440 non-null  int64  
 3   Energy_consumption           44287 non-null  float64
 4   Energy_production            44289 non-null  float64
 5   GDP                          40026 non-null  float64
 6   Population                   46014 non-null  float64
 7   Energy_intensity_per_capita  50358 non-null  float64
 8   Energy_intensity_by_GDP      50358 non-null  float64
 9   CO2_emission                 51614 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 4.2+ MB


In [596]:
energy.describe()

Unnamed: 0,Year,Energy_consumption,Energy_production,GDP,Population,Energy_intensity_per_capita,Energy_intensity_by_GDP,CO2_emission
count,55440.0,44287.0,44289.0,40026.0,46014.0,50358.0,50358.0,51614.0
mean,1999.5,1.537811,1.5327,827.144126,62630.2,71.898914,3.695104,78.800082
std,11.5435,15.456596,15.30356,5981.703144,456208.8,113.728738,4.590735,902.221463
min,1980.0,-0.163438,-1.0000000000000001e-39,0.124958,11.471,0.0,0.0,-0.00513
25%,1989.75,0.0,0.0,9.73778,1141.95,3.799939,0.899446,0.0
50%,1999.5,0.018381,0.0005121971,47.7571,6157.68,29.77926,2.987593,0.0
75%,2009.25,0.209422,0.112541,263.6871,20042.9,95.523627,4.969454,4.318822
max,2019.0,601.04049,611.509,127690.247059,7714631.0,1139.320598,166.913605,35584.933498


In [597]:
#check duplicates
energy.duplicated().sum()


0

In [598]:
#explore countries
energy['Country'].unique()

array(['World', 'Afghanistan', 'Albania', 'Algeria', 'American Samoa',
       'Angola', 'Antarctica', 'Antigua and Barbuda', 'Argentina',
       'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan',
       'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
       'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burma', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada',
       'Cayman Islands', 'Central African Republic', 'Chad', 'Chile',
       'China', 'Colombia', 'Comoros', 'Congo-Brazzaville',
       'Congo-Kinshasa', 'Cook Islands', 'Costa Rica', 'Croatia', 'Cuba',
       'Cyprus', 'Czech Republic', 'Côte d’Ivoire', 'Denmark', 'Djibouti',
       'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Falkland Islands', 'Faroe Islands',

In [599]:
# Spot-check individual entities before deciding which ones to remove.
# NOTE: a notebook cell renders only its last expression, so the first three
# lookups are evaluated and discarded; only the Hungary frame is displayed.
energy[energy['Country'] == 'Former Yugoslavia']
energy[energy['Country'] == 'Former U.S.S.R.']
energy[energy['Country'] == 'Former Serbia and Montenegro']
energy[energy['Country'] == 'Hungary']
# energy[energy['Country'] == 'Poland']



Unnamed: 0,Country,Energy_type,Year,Energy_consumption,Energy_production,GDP,Population,Energy_intensity_per_capita,Energy_intensity_by_GDP,CO2_emission
582,Hungary,all_energy_types,1980,1.168933,0.521931,,10711.540,109.128359,0.000000,
583,Hungary,coal,1980,0.227510,0.177160,,10711.540,109.128359,0.000000,
584,Hungary,natural_gas,1980,0.350880,0.224595,,10711.540,109.128359,0.000000,
585,Hungary,petroleum_n_other_liquids,1980,0.521554,0.119023,,10711.540,109.128359,0.000000,
586,Hungary,nuclear,1980,0.000000,0.000000,,10711.540,109.128359,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
54637,Hungary,coal,2019,0.087017,0.045387,309.9521,9770.737,108.211830,3.411202,9.423677
54638,Hungary,natural_gas,2019,0.367912,0.058079,309.9521,9770.737,108.211830,3.411202,19.169263
54639,Hungary,petroleum_n_other_liquids,2019,0.366582,0.048293,309.9521,9770.737,108.211830,3.411202,20.880865
54640,Hungary,nuclear,2019,0.155150,0.155150,309.9521,9770.737,108.211830,3.411202,0.000000


In [600]:
# Entities to exclude from the analysis: dissolved states and the
# pre-reunification German split.
# NOTE(review): 'Hungary' and 'Poland' are present-day countries, unlike the
# other entries - confirm that excluding them here is intentional.
dropCountry = ['Former Czechoslovakia', 'Former Serbia and Montenegro', 'Former U.S.S.R.',
               'Former Yugoslavia', 'Germany, East', 'Germany, West', 'Hungary', 'Poland']

# A single vectorised membership test replaces the per-country drop loop and
# avoids mutating the frame in place; the resulting rows are the same.
energy = energy[~energy['Country'].isin(dropCountry)]

In [601]:
energy[energy['Country'] == 'South Sudan']


Unnamed: 0,Country,Energy_type,Year,Energy_consumption,Energy_production,GDP,Population,Energy_intensity_per_capita,Energy_intensity_by_GDP,CO2_emission
1146,South Sudan,all_energy_types,1980,,,,,,,
1147,South Sudan,coal,1980,,,,,,,
1148,South Sudan,natural_gas,1980,,,,,,,
1149,South Sudan,petroleum_n_other_liquids,1980,,,,,,,
1150,South Sudan,nuclear,1980,,,,,,,0.000000
...,...,...,...,...,...,...,...,...,...,...
55201,South Sudan,coal,2019,0.000000,0.000000,17.3349,11062.1,2.404033,1.534111,0.000000
55202,South Sudan,natural_gas,2019,0.000000,0.000000,17.3349,11062.1,2.404033,1.534111,0.000000
55203,South Sudan,petroleum_n_other_liquids,2019,0.026558,0.291543,17.3349,11062.1,2.404033,1.534111,1.675889
55204,South Sudan,nuclear,2019,,,17.3349,11062.1,2.404033,1.534111,0.000000


In [602]:
# Keep only rows with at least 7 non-null fields (thresh=7), i.e. drop rows that
# have 6 or fewer. This removes rows that are almost entirely empty, such as the
# 1980 South Sudan rows shown above, where only the identifying columns are set.
energy = energy.dropna(thresh=7)

In [603]:
# Report the number of missing values in each column
for column, n_missing in energy.isnull().sum().items():
    print(f'{column}: {n_missing}')

Country: 0
Energy_type: 0
Year: 0
Energy_consumption: 6292
Energy_production: 6291
GDP: 10202
Population: 4310
Energy_intensity_per_capita: 984
Energy_intensity_by_GDP: 984
CO2_emission: 1217


In [604]:
energy.groupby('Year')['CO2_emission'].count()

Year
1980     365
1981    1113
1982    1114
1983    1114
1984    1114
1985    1114
1986    1116
1987    1115
1988    1119
1989    1119
1990    1121
1991    1127
1992    1169
1993    1251
1994    1261
1995    1265
1996    1265
1997    1267
1998    1271
1999    1271
2000    1271
2001    1271
2002    1272
2003    1274
2004    1278
2005    1278
2006    1282
2007    1290
2008    1292
2009    1296
2010    1296
2011    1295
2012    1298
2013    1302
2014    1302
2015    1302
2016    1302
2017    1303
2018    1304
2019    1304
Name: CO2_emission, dtype: int64

In [605]:
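# Considered dropping year 1980 because it has far fewer CO2_emission values than later years (365 vs. 1100+).
# The drop is left disabled below, so 1980 rows are still present after this cell.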
# indexUS = energy[(energy['Year'] == 1980)].index
# energy.drop(indexUS , inplace=True)

In [606]:
# CO2_emission is the quantity being modelled, so rows without it are discarded.
energy = energy.dropna(subset=['CO2_emission'])

In [607]:
# Split the nuclear rows off from the rest of the dataset.
# .copy() makes `nuclear` an independent frame, so the assignment below writes to
# it directly and no longer triggers SettingWithCopyWarning.
nuclear = energy[energy['Energy_type'] == 'nuclear'].copy()

# Replace missing nuclear consumption/production with 0
# (presumably a missing value means no nuclear activity - TODO confirm).
fill_cols = ['Energy_consumption', 'Energy_production']
nuclear[fill_cols] = nuclear[fill_cols].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [608]:
nuclear.head()

Unnamed: 0,Country,Energy_type,Year,Energy_consumption,Energy_production,GDP,Population,Energy_intensity_per_capita,Energy_intensity_by_GDP,CO2_emission
4,World,nuclear,1980,7.5757,7.5757,27770.910281,4298127.0,68.145921,10.547,0.0
10,Afghanistan,nuclear,1980,0.0,0.0,,13356.5,1.990283,0.0,0.0
16,Albania,nuclear,1980,0.0,0.0,,2682.7,60.752906,0.0,0.0
22,Algeria,nuclear,1980,0.0,0.0,,19221.7,40.615303,0.0,0.0
28,American Samoa,nuclear,1980,0.0,0.0,,32.646,180.515604,0.0,0.0


In [609]:

# Everything that is not a nuclear row
energy2 = energy.loc[energy['Energy_type'].ne('nuclear')]
# Stitch the zero-filled nuclear rows back in and restore the original row order
energy = pd.concat([nuclear, energy2]).sort_index()


In [610]:
energy.head(20)

Unnamed: 0,Country,Energy_type,Year,Energy_consumption,Energy_production,GDP,Population,Energy_intensity_per_capita,Energy_intensity_by_GDP,CO2_emission
0,World,all_energy_types,1980,292.89979,296.337228,27770.910281,4298127.0,68.145921,10.547,4946.62713
1,World,coal,1980,78.656134,80.114194,27770.910281,4298127.0,68.145921,10.547,1409.790188
2,World,natural_gas,1980,53.865223,54.761046,27770.910281,4298127.0,68.145921,10.547,1081.593377
3,World,petroleum_n_other_liquids,1980,132.064019,133.111109,27770.910281,4298127.0,68.145921,10.547,2455.243565
4,World,nuclear,1980,7.5757,7.5757,27770.910281,4298127.0,68.145921,10.547,0.0
5,World,renewables_n_other,1980,20.702344,20.775178,27770.910281,4298127.0,68.145921,10.547,0.0
10,Afghanistan,nuclear,1980,0.0,0.0,,13356.5,1.990283,0.0,0.0
11,Afghanistan,renewables_n_other,1980,0.007386,0.007386,,13356.5,1.990283,0.0,0.0
16,Albania,nuclear,1980,0.0,0.0,,2682.7,60.752906,0.0,0.0
17,Albania,renewables_n_other,1980,0.028897,0.030323,,2682.7,60.752906,0.0,0.0


In [611]:
energy['Country'].value_counts()

United States          240
World                  240
Burma                  236
Ethiopia               236
Sweden                 236
                      ... 
Serbia                  80
Montenegro              80
Kosovo                  68
South Sudan             44
Hawaiian Trade Zone     31
Name: Country, Length: 223, dtype: int64

In [612]:
# Count the missing GDP values per country. Countries with 90 or more missing
# entries are removed further down; with at most 240 rows per country that is
# close to a 40% cutoff (90 / 240 = 37.5%).
gdpnull = energy['GDP'].isna().groupby(energy['Country']).sum()
gdp_nulls = gdpnull.reset_index()

In [613]:
# Countries with 90 or more missing GDP entries
GDP90 = gdp_nulls.loc[gdp_nulls['GDP'].ge(90)]
GDP90

Unnamed: 0,Country,GDP
3,American Samoa,134.0
5,Antarctica,196.0
45,Cook Islands,196.0
64,Falkland Islands,196.0
65,Faroe Islands,183.0
69,French Guiana,196.0
70,French Polynesia,196.0
76,Gibraltar,196.0
81,Guam,134.0
126,Micronesia,161.0


In [614]:
# Names of the countries flagged for too many missing GDP values
gdplist = GDP90['Country'].tolist()

In [615]:
gdplist

['American Samoa',
 'Antarctica',
 'Cook Islands',
 'Falkland Islands',
 'Faroe Islands',
 'French Guiana',
 'French Polynesia',
 'Gibraltar',
 'Guam',
 'Micronesia',
 'Montserrat',
 'Nauru',
 'New Caledonia',
 'Niue',
 'Northern Mariana Islands',
 'Saint Helena',
 'Saint Pierre and Miquelon',
 'Turks and Caicos Islands',
 'Tuvalu',
 'U.S. Pacific Islands',
 'U.S. Territories',
 'Wake Island',
 'Western Sahara']

In [616]:
# Drop the countries with too many missing GDP values.
# A single isin() mask replaces the per-country in-place drop loop; the rows that
# remain are the same.
energy = energy[~energy['Country'].isin(gdplist)]

In [617]:
energy.columns

Index(['Country', 'Energy_type', 'Year', 'Energy_consumption',
       'Energy_production', 'GDP', 'Population', 'Energy_intensity_per_capita',
       'Energy_intensity_by_GDP', 'CO2_emission'],
      dtype='object')

In [618]:
# Non-null count per year for every numeric column, to see how data coverage
# changes over time.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# names is deprecated (the source of the warning this cell used to emit) and is
# an error in pandas >= 2.0.
value_cols = ['Energy_consumption', 'Energy_production', 'GDP', 'Population',
              'Energy_intensity_per_capita', 'Energy_intensity_by_GDP', 'CO2_emission']
energy.groupby('Year')[value_cols].count()

  energy.groupby('Year')['Energy_consumption', 'Energy_production', 'GDP', 'Population', 'Energy_intensity_per_capita', 'Energy_intensity_by_GDP', 'CO2_emission'].count()


Unnamed: 0_level_0,Energy_consumption,Energy_production,GDP,Population,Energy_intensity_per_capita,Energy_intensity_by_GDP,CO2_emission
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1980,336,336,109,331,331,331,336
1981,1003,1003,310,978,979,979,1003
1982,1003,1003,310,978,979,979,1003
1983,1003,1003,310,978,979,979,1003
1984,1003,1003,310,978,979,979,1003
1985,1003,1003,310,978,979,979,1003
1986,1005,1005,310,980,981,981,1005
1987,1004,1004,310,984,980,980,1004
1988,1008,1008,972,1008,984,984,1008
1989,1008,1008,972,1008,984,984,1008


In [619]:
# GDP coverage is sparse before 1988 (roughly 310 non-null GDP values per year,
# against 972 from 1988 on), so only 1988 and later years are kept.
energy = energy.loc[energy['Year'].ge(1988)]

In [620]:
energy.head()

Unnamed: 0,Country,Energy_type,Year,Energy_consumption,Energy_production,GDP,Population,Energy_intensity_per_capita,Energy_intensity_by_GDP,CO2_emission
11088,World,all_energy_types,1988,345.560876,347.412863,42106.595403,4927545.0,70.128405,8.206811,21163.840556
11089,World,coal,1988,96.873178,98.484482,42106.595403,4927545.0,70.128405,8.206811,8930.924825
11090,World,natural_gas,1988,71.010048,71.852938,42106.595403,4927545.0,70.128405,8.206811,3571.676242
11091,World,petroleum_n_other_liquids,1988,133.445814,132.485303,42106.595403,4927545.0,70.128405,8.206811,8661.23949
11092,World,nuclear,1988,19.226897,19.226897,42106.595403,4927545.0,70.128405,8.206811,0.0


In [621]:
energy['Energy_type'].value_counts()

renewables_n_other           6167
natural_gas                  6137
all_energy_types             6137
coal                         6137
petroleum_n_other_liquids    6132
nuclear                      6007
Name: Energy_type, dtype: int64

In [622]:
# Missing Energy_intensity_by_GDP values per country ...
eip_missing = energy['Energy_intensity_by_GDP'].isna().groupby(energy['Country']).sum()
eip = eip_missing.reset_index()

# ... keeping only the countries that have at least one missing value
eip = eip.loc[eip['Energy_intensity_by_GDP'].gt(0)]

In [623]:
eip

Unnamed: 0,Country,Energy_intensity_by_GDP
70,Greenland,155.0
80,Iceland,160.0
124,Netherlands Antilles,160.0
180,Trinidad and Tobago,160.0
184,U.S. Virgin Islands,160.0


In [624]:
# Names of the countries with missing Energy_intensity_by_GDP values
eiplist = eip['Country'].tolist()

In [625]:
# Drop the countries listed in eiplist (those with missing
# Energy_intensity_by_GDP values).
# A single isin() mask replaces the per-country in-place drop loop; the rows that
# remain are the same.
energy = energy[~energy['Country'].isin(eiplist)]

In [626]:
# Re-check how many missing values are left in each column after the filtering
for column, n_missing in energy.isnull().sum().items():
    print(f'{column}: {n_missing}')

Country: 0
Energy_type: 0
Year: 0
Energy_consumption: 0
Energy_production: 0
GDP: 128
Population: 0
Energy_intensity_per_capita: 0
Energy_intensity_by_GDP: 0
CO2_emission: 0


In [627]:
energy.shape

(35922, 10)

In [628]:
# GDP is the only column that still has gaps; drop those remaining rows
energy = energy.dropna(subset=['GDP'])

## ML - Random Forest

In [629]:
# One-hot encode the categorical columns (Year is treated as a category as well)
categorical_cols = ['Country', 'Energy_type', 'Year']
df_getdummy = pd.get_dummies(energy, columns=categorical_cols)
df_getdummy

Unnamed: 0,Energy_consumption,Energy_production,GDP,Population,Energy_intensity_per_capita,Energy_intensity_by_GDP,CO2_emission,Country_Afghanistan,Country_Albania,Country_Algeria,...,Year_2010,Year_2011,Year_2012,Year_2013,Year_2014,Year_2015,Year_2016,Year_2017,Year_2018,Year_2019
11088,345.560876,347.412863,42106.595403,4.927545e+06,70.128405,8.206811,21163.840556,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11089,96.873178,98.484482,42106.595403,4.927545e+06,70.128405,8.206811,8930.924825,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11090,71.010048,71.852938,42106.595403,4.927545e+06,70.128405,8.206811,3571.676242,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11091,133.445814,132.485303,42106.595403,4.927545e+06,70.128405,8.206811,8661.239490,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11092,19.226897,19.226897,42106.595403,4.927545e+06,70.128405,8.206811,0.000000,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55435,0.045064,0.075963,37.620400,1.465420e+04,11.508701,4.482962,4.586869,0,0,0,...,0,0,0,0,0,0,0,0,0,1
55436,0.000000,0.000000,37.620400,1.465420e+04,11.508701,4.482962,0.000000,0,0,0,...,0,0,0,0,0,0,0,0,0,1
55437,0.055498,0.000000,37.620400,1.465420e+04,11.508701,4.482962,4.377890,0,0,0,...,0,0,0,0,0,0,0,0,0,1
55438,0.000000,0.000000,37.620400,1.465420e+04,11.508701,4.482962,0.000000,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [630]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


# Target: CO2_emission. Features: every other column of the encoded frame.
y = df_getdummy['CO2_emission']
X = df_getdummy.drop(columns=['CO2_emission'])


Mean Absolute Error:  3.1505957642378304


In [None]:

# split the data into training and test sets
# (20% of the rows are held out; random_state pins the shuffle so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:

# train the model
# (a forest of 100 trees; random_state is fixed so repeated runs on the same data are reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)


In [None]:

# make predictions on the test set
# (these held-out rows were not used to fit the forest)
y_pred = rf.predict(X_test)


In [None]:

# evaluate the model
# (mean absolute error between held-out targets and predictions, in the units of CO2_emission)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error: ", mae)

In [632]:
rf.score(X_train,y_train)

0.9997839245809751

In [633]:
rf.score(X_test,y_test)

0.9985430892388596

## Analysis