# The Effect of Population and Economic Characteristics on CO2 Emissions W13

In [0]:
# Import block

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [0]:
# This cell is used for getting the files that are stored locally

# path = 'PATH'

In [2]:
# This cell is used for importing the files from google drive.

from google.colab import drive
drive.mount('/content/drive')

path = 'drive/My Drive/Colab Notebooks/IDSProject/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Function for reading and cleaning data
def readNClean(name): 
  my_cols = [str(i) for i in range(65)] # create some row names
  data = pd.read_csv(path+name,names=my_cols,header=None, engine="python")

  df = data.copy()

  df = df.iloc[2:]
  df.reset_index(drop=True, inplace=True)



  #dropime ebavajalikud indikaatorid 
  df.drop(df.iloc[:, 1:4], inplace = True, axis = 1)

  df = df.T.set_index(0).T

  #Transpose DateFrame - Columns to rows
  df = df.T


  #1st row to header
  df.index.names = [None]
  new_header = df.iloc[0]
  df = df[1:]
  df.columns = new_header
  
  #Remvoing Empty row 
  df.drop(df.tail(1).index,inplace=True)
  #Rounding the values
  df = df.astype(float).round(4)

  return(df)



In [0]:
#Reading and cleaning the data.

AccessToElectricity = readNClean("AccessToElectricityTotal.csv") 
AgriculturalLand = readNClean("AgriculturalLand.csv") 
ForestArea = readNClean("ForeastArea.csv")
AirPolutionMicrogramsPerCubicMeter = readNClean("AirPolutionMicrogramsPerCubicMeter.csv") 
AirTransportFreight = readNClean("AirTransportFreight.csv") 
RailwayFreight = readNClean("RailwayFreight.csv")
CO2 = readNClean("CO2Emission.csv") 
EmploymentInAgriculture = readNClean("EmpAgroculturePercOfTotalILO.csv")   # spelling mistake in file name
EmploymentInIndustry = readNClean("EmpIndustryPercOfTotalILO.csv")
EmploymentInServices = readNClean("EmpServicesPercOfTotalILO.csv") 
Unemployment = readNClean("TotalUnemploymentPerc.csv")
EnergyFromCoal = readNClean("EnergyCoal.csv") 
ElectricPowerTransprotLoss = readNClean("ElectricPowerTransprotLossPerc.csv")
EnergyConsumption = readNClean("EnergyConsuption.csv")
EnergyImports = readNClean("EnergyImportsPercOfTotalUsed.csv")
EnergyFromNaturalGas = readNClean("EnergyNaturalGas.csv")
EnergyFromOil = readNClean("EnergyOil.csv")
EnergyFromRenewable = readNClean("EnergyRenewable.csv")
EnergyFromHydro = readNClean("HydroEnergy.csv")
EnergyFromNuclear = readNClean("NuclearEnergy.csv")
ExportPercentageOfGDP = readNClean("ExportsPercOfGDP.csv")
LogisticsPerformanceIndex = readNClean("LogisticsPerformanceIndex.csv")
TotalPopulation = readNClean("TotalPopulation.csv") 
PopulationDensity = readNClean("PopDensityPerSqrkm.csv")
PopulationLivingInSlums = readNClean("PopLivingInSLums.csv")
PriceOfFuel = readNClean("PriceOfFuelUSDPerLiter.csv")
NumberOfTourists = readNClean("TourismNumberOfArrivals.csv")
UrbanPopulation = readNClean("UrbanPopulation.csv")

In [0]:
# In this cell we deffine functions for finding correlation strengths

# All Info about one country
def byCountry(name):

  country = pd.DataFrame(columns=['Year','TotalPopulation','PopulationDensity','UrbanPopulation','PopulationLivingInSlums',
                                  'EmploymentInAgriculture','EmploymentInIndustry','EmploymentInServices','Unemployment',
                                  'NumberOfTourists','PriceOfFuel','EnergyConsumption','EnergyImports','EnergyFromCoal',
                                  'EnergyFromNaturalGas','EnergyFromOil','EnergyFromRenewable','EnergyFromHydro','EnergyFromNuclear',
                                  'ElectricPowerTransprotLoss','AccessToElectricity','AirTransportFreight','RailwayFreight','LogisticsPerformanceIndex',
                                  'ExportPercentageOfGDP','AgriculturalLand','ForestArea','AirPolutionMicrogramsPerCubicMeter','CO2'])

  country.Year = TotalPopulation.index
  country.TotalPopulation = list(TotalPopulation[name])
  country.PopulationDensity = list(PopulationDensity[name])
  country.UrbanPopulation = list(UrbanPopulation[name])
  country.PopulationLivingInSlums = list(PopulationLivingInSlums[name])
  country.EmploymentInAgriculture = list(EmploymentInAgriculture[name])
  country.EmploymentInIndustry = list(EmploymentInIndustry[name])
  country.EmploymentInServices = list(EmploymentInServices[name])
  country.Unemployment = list(Unemployment[name])
  country.NumberOfTourists = list(NumberOfTourists[name])
  country.PriceOfFuel = list(PriceOfFuel[name])
  country.EnergyConsumption = list(EnergyConsumption[name])
  country.EnergyImports = list(EnergyImports[name])
  country.EnergyFromCoal = list(EnergyFromCoal[name])
  country.EnergyFromNaturalGas = list(EnergyFromNaturalGas[name])
  country.EnergyFromOil = list(EnergyFromOil[name])
  country.EnergyFromRenewable = list(EnergyFromRenewable[name])
  country.EnergyFromHydro = list(EnergyFromHydro[name])
  country.EnergyFromNuclear = list(EnergyFromNuclear[name])
  country.ElectricPowerTransprotLoss = list(ElectricPowerTransprotLoss[name])
  country.AccessToElectricity = list(AccessToElectricity[name])
  country.AirTransportFreight = list(AirTransportFreight[name])
  country.RailwayFreight = list(RailwayFreight[name])
  country.LogisticsPerformanceIndex = list(LogisticsPerformanceIndex[name])
  country.ExportPercentageOfGDP = list(ExportPercentageOfGDP[name])
  country.AgriculturalLand = list(AgriculturalLand[name])
  country.ForestArea = list(ForestArea[name])
  country.AirPolutionMicrogramsPerCubicMeter = list(AirPolutionMicrogramsPerCubicMeter[name])
  country.CO2 = list(CO2[name])

  return country

# Returns a list of the countrynames
def getCountrys():
  return list(TotalPopulation.columns)

#Returns correlation absolute values organized
def getCorr(df):
  correl = df.corr(method='pearson')
  correl = correl.drop(correl.loc[:, 'TotalPopulation':'AirPolutionMicrogramsPerCubicMeter'].columns, axis = 1)
  correl = correl.abs().sort_values(by='CO2')

  return correl

# Add score based on the correlation value, Strongest gets most weakest gets least
def addCorrScore(df):
  scores = []
  points = 0
  for i in list(df.CO2):
    if np.isnan(i):
      scores.append(0)
    else:
      scores.append(points)
    points += 1    

  df.insert(0,'score',scores)
  df = df.drop('CO2',1)
  
  return df
 
# Adds together correlation scores for two dataframes
def addScores(df1,df2):
  df1.score = df1.score + df2.score 

  return df1

# In order to add weights to the classes, we will need to find out what the 
# strongest  correlations are globally so our model is as broad as possible.
def getGlobalScores():
  df = pd.DataFrame()

  for i in getCountrys():
    if df.empty:
      df = addCorrScore(getCorr(byCountry(i)))
    else:
      df = addScores(df,addCorrScore(getCorr(byCountry(i))))

  df = df.sort_values(by='score')
  df.drop(df.tail(1).index,inplace=True)
  return df


In [221]:
# This Cell is used to dispaly the correlations globally

getGlobalScores()

Unnamed: 0,score
EnergyFromNuclear,618
EnergyFromCoal,1046
RailwayFreight,1360
EnergyFromRenewable,1381
EnergyFromOil,1448
EnergyFromNaturalGas,1466
ElectricPowerTransprotLoss,1504
EnergyFromHydro,1528
Unemployment,1618
EnergyImports,1660


In [0]:
#In this cell we deffine the function to organize the learning and testing data

# Finding change in one value over years
def getChange(value, year1, year2):
  df = pd.DataFrame(columns = ["dif"])
  df.dif = value.loc[year2] - value.loc[year1]

  return df


# Combining all the changed values into one dataframe
def changeOverTime(year1,year2):
  changeOverTime = pd.DataFrame(columns=['Country','TotalPopulation','PopulationDensity','UrbanPopulation','PopulationLivingInSlums',
                                 'EmploymentInAgriculture','EmploymentInIndustry','EmploymentInServices','Unemployment',
                                 'NumberOfTourists','PriceOfFuel','EnergyConsumption','EnergyImports','EnergyFromCoal',
                                 'EnergyFromNaturalGas','EnergyFromOil','EnergyFromRenewable','EnergyFromHydro','EnergyFromNuclear',
                                 'ElectricPowerTransprotLoss','AccessToElectricity','AirTransportFreight','RailwayFreight','LogisticsPerformanceIndex',
                                 'ExportPercentageOfGDP','AgriculturalLand','ForestArea','AirPolutionMicrogramsPerCubicMeter','CO2'])

  #Since the number and order of years and countrys is same in all the dataframes, this simple but dirty method works.
  #If the order was mixed values should be connected by row index or column name

  changeOverTime.Country = list(TotalPopulation.columns)
  changeOverTime.TotalPopulation = list(getChange(TotalPopulation,year1,year2).dif)
  changeOverTime.PopulationDensity = list(getChange(PopulationDensity,year1,year2).dif)
  changeOverTime.UrbanPopulation = list(getChange(UrbanPopulation,year1,year2).dif)
  changeOverTime.PopulationLivingInSlums = list(getChange(PopulationLivingInSlums,year1,year2).dif)
  changeOverTime.EmploymentInAgriculture = list(getChange(EmploymentInAgriculture,year1,year2).dif)
  changeOverTime.EmploymentInIndustry = list(getChange(EmploymentInIndustry,year1,year2).dif)
  changeOverTime.EmploymentInServices = list(getChange(EmploymentInServices,year1,year2).dif)
  changeOverTime.Unemployment = list(getChange(Unemployment,year1,year2).dif)
  changeOverTime.NumberOfTourists = list(getChange(NumberOfTourists,year1,year2).dif)
  changeOverTime.PriceOfFuel = list(getChange(PriceOfFuel,year1,year2).dif)
  changeOverTime.EnergyConsumption = list(getChange(EnergyConsumption,year1,year2).dif)
  changeOverTime.EnergyImports = list(getChange(EnergyImports,year1,year2).dif)
  changeOverTime.EnergyFromCoal = list(getChange(EnergyFromCoal,year1,year2).dif)
  changeOverTime.EnergyFromNaturalGas = list(getChange(EnergyFromNaturalGas,year1,year2).dif)
  changeOverTime.EnergyFromOil = list(getChange(EnergyFromOil,year1,year2).dif)
  changeOverTime.EnergyFromRenewable = list(getChange(EnergyFromRenewable,year1,year2).dif)
  changeOverTime.EnergyFromHydro = list(getChange(EnergyFromHydro,year1,year2).dif)
  changeOverTime.EnergyFromNuclear = list(getChange(EnergyFromNuclear,year1,year2).dif)
  changeOverTime.ElectricPowerTransprotLoss = list(getChange(ElectricPowerTransprotLoss,year1,year2).dif)
  changeOverTime.AccessToElectricity = list(getChange(AccessToElectricity,year1,year2).dif)
  changeOverTime.AirTransportFreight = list(getChange(AirTransportFreight,year1,year2).dif)
  changeOverTime.RailwayFreight = list(getChange(RailwayFreight,year1,year2).dif)
  changeOverTime.LogisticsPerformanceIndex = list(getChange(LogisticsPerformanceIndex,year1,year2).dif)
  changeOverTime.ExportPercentageOfGDP = list(getChange(ExportPercentageOfGDP,year1,year2).dif)
  changeOverTime.AgriculturalLand = list(getChange(AgriculturalLand,year1,year2).dif)
  changeOverTime.ForestArea = list(getChange(ForestArea,year1,year2).dif)
  changeOverTime.AirPolutionMicrogramsPerCubicMeter = list(getChange(AirPolutionMicrogramsPerCubicMeter,year1,year2).dif)
  changeOverTime.CO2 = list(getChange(CO2,year1,year2).dif)

  increase = []
  decrease = []

  for i in changeOverTime.CO2:
    if abs(i) < 100:
      increase.append(0)
      decrease.append(0)
    elif i > 0:
      increase.append(1)
      decrease.append(0)
    else:
      increase.append(0)
      decrease.append(1)  

  changeOverTime.insert(0,"CO2_Increase", increase)
  changeOverTime.insert(1,"CO2_Decrease", decrease)
  changeOverTime = changeOverTime.drop('CO2',1)
  return changeOverTime
  

In [206]:
#In this ccell we calcualte the weights of the classes from the Scores.

# This Cell did not get directly implemented in the machine learning

globalScores = getGlobalScores()

# We divide by the score of the element with the smalelst score to get the relative weights
def getWeights(df):
  weights = []
  for i in df.score:
    weights.append(i/618)

  return weights

#Displaying the weights to illustrate which class has the strongest found correlation
weights = getWeights(globalScores)
globalScores.insert(1,"Weight",weights)
globalScores = globalScores.drop('score',1)
globalScores



Unnamed: 0,Weight
EnergyFromNuclear,1.0
EnergyFromCoal,1.692557
RailwayFreight,2.200647
EnergyFromRenewable,2.234628
EnergyFromOil,2.343042
EnergyFromNaturalGas,2.372168
ElectricPowerTransprotLoss,2.433657
EnergyFromHydro,2.472492
Unemployment,2.618123
EnergyImports,2.686084


In [0]:
# In this ccell we create machine learning model

# Deffining the inital testing and learnign data
train = changeOverTime(2009.0,2014.0).fillna(0)
train = train.drop('Country',1)
test = changeOverTime(1994.0,2014.0).fillna(0)
testLabel1 = test.CO2_Increase
testLabel2 = test.CO2_Decrease
test = test.drop(['CO2_Increase','CO2_Decrease','Country'],1)
y_train = train[['CO2_Increase','CO2_Decrease']]
x_train = train.drop(['CO2_Increase','CO2_Decrease'], axis=1)

# Additional testing data, however further additions to the testing data
# resulted in a large drop in accuracy because of overfitting
train2 = changeOverTime(2004.0,2014.0).fillna(0)
train2 = train2.drop('Country',1)
y_train2 = train2[['CO2_Increase','CO2_Decrease']]
x_train2 = train2.drop(['CO2_Increase','CO2_Decrease'], axis=1)

#Fitting the data and making the prediction
rf = RandomForestClassifier(n_estimators=10000,criterion='entropy')
rf.fit(x_train,y_train)
rf.fit(x_train2,y_train2)
pred = rf.predict(test)

In [0]:
# This cell contains functions that help to check the accuracy of the machine learning model

def getIncreases(pred):
  returns = []

  for i in pred:
    returns.append(i[0])

  return returns

def getDecreased(pred):
  returns = []

  for i in pred:
    returns.append(i[1])

  return returns    

def addPred(pred,df):
  Increase = []
  Decrease = []

  for i in pred:
    Increase.append(i[0])
    Decrease.append(i[1])

  df.insert(0,"Increase_Prediction", Increase)
  df.insert(1,"Decrease_Prediction", Decrease)

  return df

#testCheck = addPred(pred,test)
#testCheck.insert(0,"CO2_Increase",testLabel1)
#testCheck.insert(1,"CO2_Decrease",testLabel2)


In [220]:
# In this cell we find the Accuracy of our model.

increaseAcc = accuracy_score(testLabel1, getIncreases(pred))
decreaseAcc = accuracy_score(testLabel2, getDecreased(pred))
print(increaseAcc, " ", decreaseAcc)

0.8674242424242424   0.8636363636363636
