In [185]:
import camelot
import pandas as pd
import tabula
import re
import numpy as np

In [2]:
"""
takes title of table from first index of header and replaces headers with integers
"""
def replaceHeader(curr_head, df):
    df.iloc[0, 0] = curr_head[0]
    df.columns = range(df.shape[1])
    return df

In [149]:
"""
combines top 2 rows
"""
def topCombine(table):
    header = table.iloc[0,0]
    table = table.drop(0, axis = 0).reset_index(drop=True)
    table.iloc[0,0]= header
    return table

In [292]:
"""
cleans data when multiple values are batched into the same column
"""
def cleanOverlap(table, column = 0, year_row = 0, text_issue = True, financial = False, drop = False):
    
    
    # get number of columns that are wrong by search for years
    column_err = re.findall("20\d\d-?", table.iloc[year_row,column])

    # create lists of similar size
    lists = []
    
    #get the same num nan as starting year_row
    if text_issue == False:
        for nan in range(year_row):
            lists.append(["" for i in range(len(column_err))])
        
    for i in range(year_row, len(table)): 
        # get errors in column names up to 999,999,999
        numbers = re.findall("-?\d*[,.]?\d*[,.]?\d+%?-?",table.iloc[i,column])

        # make sure they match
        try:
            assert len(column_err) == len(numbers)
        except:
            print((column_err, numbers))
            numbers = numbers[1:]
            
        # remove number from original
        num = " ".join(numbers)
        text = re.sub(num, "", table.iloc[i,column])
        if text_issue:
            lists.append([text] + numbers)
        else:
            lists.append(numbers)
    lists = pd.DataFrame(lists)
    
    if financial:
        table = table.dropna(axis = 0, thresh = 3).reset_index(drop = True)
        
    # append columns back into array in the right order
    table = table.dropna(axis = 1, thresh = 7)
    
    if drop:
        table = table.drop(0, axis = 0).reset_index(drop=True)
        
    if text_issue:
        table = pd.concat([lists, table.iloc[:,1:]], axis = 1) # how to insert new columns in the middle
    else:
        table = pd.concat([table.iloc[:,:column], lists, table.iloc[:,(column + 1):]], axis = 1) # how to insert new columns in the middle
    return table

In [230]:
"""
matches numbers in the text and moves them to the right column
"""
def fillBlanks(table, year_row = 0):
        
    for i in range(year_row, len(table)): 
        # get errors in column names up to 999,999,999
        numbers = re.findall("-?\d*[,.]?\d*[,.]?\d+%?-?",table.iloc[i,0])
        
        if numbers:
            text = re.sub(numbers[0], "", table.iloc[i,0])
            table.iloc[i,0] = text
            table.iloc[i,1] = numbers[0]
            
    return table

In [187]:
np.arange(14, 55, 4)

array([14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54])

In [469]:
#tables = camelot.read_pdf('Energy-Revolution-2015-Full.pdf')

left = 21.97
middle = 21+ 274.96
right = 540

ex_page = 33
final_page = 55

config = 2

table1 = tabula.read_pdf('Energy-Revolution-2015-Full.pdf', pages = ex_page, area = (53.02, 21.97, 53.02 + 344.91, 21+ 274.96), columns = [21+274.96])
t1head = table1.columns


#14, 18, 22, 26, 30, 34, 3
if config == 1:
    table1 = table1.iloc[:, :6]
    table1 = cleanOverlap(table1, drop =True)
    table1 = replaceHeader(t1head, table1)
    table1 = table1.iloc[:,:-1]
if config == 2:
    table1 = table1.iloc[:, :7]
    table1 = cleanOverlap(table1, drop =False)
    table1 = replaceHeader(t1head, table1)
    table1 = table1.iloc[:,:-1]
if config == 3:
    table1 = table1.iloc[:, :8]
    #table1 = cleanOverlap(table1, drop =False)
    table1 = replaceHeader(t1head, table1)
    table1 = table1.iloc[:,:-1]
if config == 4:
    table1 = table1.iloc[:, :8]
    #table1 = cleanOverlap(table1, drop =False)
    table1 = replaceHeader(t1head, table1)
    table1 = table1.iloc[:,:-2]
if config == 5:
    table1 = table1.iloc[:, :8]
    table1 = cleanOverlap(table1, drop =False)
    table1 = replaceHeader(t1head, table1)
    table1 = table1.iloc[:,:-2]
table1

(['2012'], ['2', '0'])
(['2012'], ['2', '0'])
(['2012'], [])
(['2012'], ['2', '0'])


Unnamed: 0,0,1,2,3,4,5,6
0,table 13.6.15 middle east: electricity generat...,2012,2020,2025,2030,2040,2050
1,power plants,904,1127,1318,1798,3007,4621
2,hard coal (& non-renewable waste),1,0,0,0,0,0
3,lignite,0,0,0,0,0,0
4,gas,552,721,677,661,367,139
5,of which from H2,0,0,0,0,18,139
6,oil,275,189,132,48,2,0
7,diesel,53,45,25,15,0,0
8,nuclear,2,3,3,3,0,0
9,biomass (& renewable waste),0,5,13,14,15,18


In [470]:
table2 = tabula.read_pdf('Energy-Revolution-2015-Full.pdf', pages = ex_page, area = (410.0, left, 409 + 194, middle))
t2head = table2.columns
if config == 1:
    table2 = cleanOverlap(table2)
    table2 = replaceHeader(t2head, table2)
if config == 2:
    table2 = cleanOverlap(table2)
    table2 = replaceHeader(t2head, table2)
if config == 3:
    table2 = cleanOverlap(table2)
    table2 = replaceHeader(t2head, table2)
if config == 4:
    table2 = cleanOverlap(table2)
    table2 = replaceHeader(t2head, table2)
if config == 5:
    table2 = cleanOverlap(table2)
    table2 = replaceHeader(t2head, table2)
table2

Unnamed: 0,0,1,2,3,4,5,6
0,table 13.6.16 middle east: final energy consum...,2012,2020,2025,2030,2040,2050
1,road,5226,6642,6356,6031,4976,4469
2,fossil fuels,4981,6212,5523,4336,1652,0
3,biofuels,0,127,291,377,461,417
4,synfuels,0,0,0,0,150,81
5,natural gas,245,263,252,217,102,0
6,hydrogen,0,0,49,382,1290,1812
7,electricity,0,41,241,719,1472,2240
8,rail,2,20,44,60,88,89
9,fossil fuels,1,1,1,1,0,0


In [471]:
table3 = tabula.read_pdf('Energy-Revolution-2015-Full.pdf', pages = ex_page, area = (596, left, 596 + 183, middle))
t3head = table3.columns
if config == 1:
    table3 = cleanOverlap(table3)
    table3 = replaceHeader(t3head, table3)
if config == 2:
    #table3 = cleanOverlap(table3)
    table3 = replaceHeader(t3head, table3)
if config == 3:
    #table3 = cleanOverlap(table3)
    table3 = replaceHeader(t3head, table3)
if config == 4:
    #table3 = cleanOverlap(table3)
    table3 = replaceHeader(t3head, table3)
if config == 5:
    #table3 = cleanOverlap(table3)
    table3 = replaceHeader(t3head, table3)
table3

Unnamed: 0,0,1,2,3,4,5,6
0,table 13.6.17 middle east: heat supply pj/a,2012,2020,2025,2030,2040,2050
1,district heating plants,0,0,0,0,0,0
2,fossil fuels,0,0,0,0,0,0
3,biomass,0,0,0,0,0,0
4,solar collectors,0,0,0,0,0,0
5,geothermal,0,0,0,0,0,0
6,heat from chp1,0,57,107,146,333,631
7,fossil fuels,0,12,17,18,23,0
8,biomass,0,17,31,41,95,196
9,geothermal,0,27,56,81,198,344


In [472]:
table4 = tabula.read_pdf('Energy-Revolution-2015-Full.pdf', pages = ex_page, area = (53.02, 22 + 274.96, 53.02 + 145, 540))
table4

Unnamed: 0,table 13.6.15 middle east: electricity generation twh/a,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,table 13.6.18: middle east: installed capacity gw,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,2012,2020,2025,2030,2040,2050,2012,2020,2025,2030,2040,2050
1,power plants 904,1127,1318,1798,3007,4621,total generation 256,355,463,706,1203,1587
2,hard coal (& non-renewable waste) 1,0,0,0,0,0,fossil 241,265,233,225,196,0
3,lignite 0,0,0,0,0,0,hard coal (& non-renewable waste) 0,0,0,0,0,0
4,gas 552,721,677,661,367,139,lignite 0,0,0,0,0,0
5,of which from H2 0,0,0,0,18,139,gas (w/o h2) 166,203,190,208,195,0
6,oil 275,189,132,48,2,0,oil 62,51,36,14,0,0
7,diesel 53,45,25,15,0,0,diesel 12,12,7,4,0,0
8,nuclear 2,3,3,3,0,0,nuclear 1,0,0,0,0,0
9,biomass (& renewable waste) 0,5,13,14,15,18,"hydrogen (fuel cells, gas power plants, gas ch...",0,0,0,11,82


In [457]:
if config == 1:
    table4 = table4.iloc[:, 5:]
    t4head = table4.columns
    table4 = cleanOverlap(table4)
    table4 = replaceHeader(t4head, table4)
if config == 2:
    table4 = table4.iloc[:, -6:]
    t4head = table4.columns[-6:]
    table4 = cleanOverlap(table4)
    table4 = replaceHeader(t4head, table4)
if config == 3:
    table4 = table4.iloc[:, -7:]
    t4head = table4.columns[-7:]
    #table4 = cleanOverlap(table4)
    table4 = replaceHeader(t4head, table4)
if config == 4:
    table4 = table4.iloc[:, -7:]
    t4head = table4.columns[-7:]
    table4 = cleanOverlap(table4)
    table4 = replaceHeader(t4head, table4)
if config == 5:
    table4 = table4.iloc[:, -6:]
    t4head = table4.columns[-6:]
    table4 = cleanOverlap(table4)
    table4 = replaceHeader(t4head, table4)
table4

(['2012'], ['2', '166'])


Unnamed: 0,0,1,2,3,4,5,6
0,table 13.6.10: middle east: installed capacity...,2012,2020,2025,2030,2040,2050
1,total generation,256,352,416,572,923,1392
2,fossil,241,268,230,211,204,222
3,hard coal (& non-renewable waste),0,0,0,0,0,0
4,lignite,0,0,0,0,0,0
5,gas (w/o h2),166,205,187,190,202,220
6,oil,62,51,36,16,1,1
7,diesel,12,12,7,4,0,0
8,nuclear,1,0,0,0,0,0
9,"hydrogen (fuel cells, gas power plants, gas chp)",0,0,0,0,1,2


In [458]:
table5 = tabula.read_pdf('Energy-Revolution-2015-Full.pdf', pages = ex_page, area = (200, middle, 200.5 + 275, right))
t5head = table5.columns

if config == 1:
    table5 = cleanOverlap(table5)
    table5 = replaceHeader(t5head, table5)
if config == 2:
    table5 = cleanOverlap(table5)
    table5 = replaceHeader(t5head, table5)
if config == 3:
    table5 = cleanOverlap(table5)
    table5 = replaceHeader(t5head, table5)
if config == 4:
    table5 = cleanOverlap(table5)
    table5 = replaceHeader(t5head, table5)
if config == 5:
    table5 = cleanOverlap(table5)
    table5 = replaceHeader(t5head, table5)
table5

(['2012'], ['1', '15,306'])


Unnamed: 0,0,1,2,3,4,5,6
0,table 13.6.12: middle east: final energy deman...,2012,2020,2025,2030,2040,2050
1,total (incl. non-energy use),18967,21530,21950,22433,22228,21212
2,total energy use1,15306,17981,18119,18320,17918,17203
3,transport,5293,6792,6696,6600,5885,5150
4,oil products,5028,6346,5935,5153,2918,575
5,natural gas,263,288,295,342,333,335
6,biofuels,0,128,310,447,512,558
7,synfuels,0,0,0,0,0,0
8,electricity,1,30,149,511,1248,2085
9,RES electricity,0,4,46,265,968,1940


In [459]:
table6 = tabula.read_pdf('Energy-Revolution-2015-Full.pdf', pages = ex_page, area = (495, middle, 485 + 190, right))

In [460]:
if config == 1:
    t6head = table6.iloc[0]
    table6 = cleanOverlap(table6, year_row = 1)
    table6 = replaceHeader(t6head, table6)
    #table6 = topCombine(table6)
    table6.iloc[22,1] = table6.iloc[22,2]
    table6 = table6.dropna(axis = 1, thresh = 7)
    table6 = replaceHeader(t6head, table6)
if config == 2:
    t6head = table6.iloc[0]
    #table6 = cleanOverlap(table6, year_row = 1)
    table6 = replaceHeader(t6head, table6)
    table6 = topCombine(table6)
    table6.iloc[22,1] = table6.iloc[22,2]
    table6 = table6.dropna(axis = 1, thresh = 7)
    table6 = replaceHeader(t6head, table6)
if config == 3:
    t6head = table6.iloc[0]
    #table6 = cleanOverlap(table6, year_row = 1)
    table6 = replaceHeader(t6head, table6)
    table6 = topCombine(table6)
    table6.iloc[22,1] = table6.iloc[22,2]
    table6 = table6.dropna(axis = 1, thresh = 7)
    table6 = replaceHeader(t6head, table6)
if config == 4:
    t6head = table6.iloc[0]
    #table6 = cleanOverlap(table6, year_row = 1)
    table6 = replaceHeader(t6head, table6)
    table6 = topCombine(table6)
    table6.iloc[22,1] = table6.iloc[22,2]
    table6 = table6.dropna(axis = 1, thresh = 7)
    table6 = replaceHeader(t6head, table6)
if config == 5:
    t6head = table6.iloc[0]
    #table6 = cleanOverlap(table6, year_row = 1)
    table6 = replaceHeader(t6head, table6)
    table6 = topCombine(table6)
    table6.iloc[22,1] = table6.iloc[22,2]
    table6 = table6.dropna(axis = 1, thresh = 7)
    table6 = replaceHeader(t6head, table6)
table6

Unnamed: 0,0,1,2,3,4,5,6
0,table 13.6.13: middle east: cO2 emissions mill...,2012,2020,2025,2030,2040,2050
1,condensation power plants,609,647,525,412,250,98
2,hard coal (& non-renewable waste),1,0,0,0,0,0
3,lignite,0,0,0,0,0,0
4,gas,327,406,369,340,245,96
5,oil,210,182,122,52,3,2
6,diesel,71,60,33,20,1,0
7,combined heat and power plants,0,3,3,3,4,6
8,hard coal (& non-renewable waste),0,0,0,0,0,0
9,lignite,0,0,0,0,0,0


In [461]:
table7 = tabula.read_pdf('Energy-Revolution-2015-Full.pdf', pages = ex_page, area = (685, middle, 685 + 120, right))

In [462]:
if config == 1:
    t7head = table7.iloc[0]
    table7 = cleanOverlap(table7, year_row = 1, column = 1, text_issue=False)
    table7 = cleanOverlap(table7, year_row = 1, text_issue=True)
    table7 = replaceHeader(t7head, table7)
    #table7 = topCombine(table7)
    
if config == 2:
    t7head = table7.iloc[0]
    table7 = cleanOverlap(table7, year_row = 1, column = 1, text_issue=False)
    table7 = cleanOverlap(table7, year_row = 1, text_issue=True, drop = True)
    table7 = replaceHeader(t7head, table7)
    #table7 = topCombine(table7)
if config == 3:
    t7head = table7.iloc[0]
    table7 = cleanOverlap(table7, year_row = 1, column = 1, text_issue=False)
    table7 = cleanOverlap(table7, year_row = 1, text_issue=True, drop = True)
    table7 = replaceHeader(t7head, table7)
    #table7 = topCombine(table7)
if config == 4:
    t7head = table7.iloc[0]
    table7 = cleanOverlap(table7, year_row = 1, column = 1, text_issue=False)
    table7 = cleanOverlap(table7, year_row = 1, text_issue=True, drop = True)
    table7 = replaceHeader(t7head, table7)
    #table7 = topCombine(table7)
if config == 5:
    t7head = table7.iloc[0]
    table7 = cleanOverlap(table7, year_row = 1, column = 1, text_issue=False)
    table7 = cleanOverlap(table7, year_row = 1, text_issue=True, drop = True)
    table7 = replaceHeader(t7head, table7)
    #table7 = topCombine(table7)
table7


Unnamed: 0,0,1,2,3,4,5,6
0,table 13.6.14: middle east: primary energy dem...,2012,2020,2025,2030,2040,2050
1,total,28272,30971,30115,29974,28943,26297
2,fossil,28133,29509,26914,23973,17085,8809
3,hard coal (& non-renewable waste),124,233,405,582,893,1134
4,lignite,1,0,0,0,0,0
5,natural gas,14151,15206,14290,13490,10115,4805
6,crude oil,13856,14070,12219,9901,6078,2870
7,nuclear,20,33,33,33,0,0
8,renewables,119,1430,3168,5969,11858,17488
9,hydro,80,100,117,135,156,181


In [466]:
page = pd.concat([table1, table2, table3, table4, table5, table6, table7], axis=0)

In [467]:
page

Unnamed: 0,0,1,2,3,4,5,6
0,table 13.6.8 middle east: electricity generati...,2012,2020,2025,2030,2040,2050
1,power plants,904,1119,1263,1596,2413,3400
2,hard coal (& non-renewable waste),1,0,0,0,0,0
3,lignite,0,0,0,0,0,0
4,gas,552,730,721,699,541,229
5,of which from H2,0,0,0,0,0,0
6,oil,275,189,132,58,4,2
7,diesel,53,45,25,15,1,0
8,nuclear,2,3,3,3,0,0
9,biomass (& renewable waste),0,5,8,14,15,18


In [175]:
fin_page=14
table22_title = tabula.read_pdf('Energy-Revolution-2015-Full.pdf', pages = fin_page, area = (50, 21.97, 50 + 40, 21+ 274.96))
table22 = tabula.read_pdf('Energy-Revolution-2015-Full.pdf', pages = fin_page, area = (62, 21.97, 62 + 286, 21+ 274.96), columns = [21+274.96])
table22 = table22.iloc[:, :6].dropna(axis = 0, thresh = 3)
#replaceHeader(table1.columns, table1)
table22 = cleanOverlap(table22, financial = True)
replaceHeader(table22_title.columns, table22)



(['2012-', '2021-', '2031-'], ['3.98', '1,068.3', '963.8', '3,361.5', '643.0', '4,437.7', '567.1'])


TypeError: expected string or bytes-like object

In [None]:
table23_starting = 6
table23 = tabula.read_pdf('Energy-Revolution-2015-Full.pdf', pages = fin_page, area = (55, 300, 55 + 170, 300 + 285))
table23 = table23.iloc[:, table23_starting:].dropna(axis = 0, thresh = 3)
#replaceHeader(table1.columns, table1)
table23 = cleanOverlap(table23, financial = True)
replaceHeader(table23_title.columns[table23_starting:], table23)

In [183]:
table22

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,table 13.1.22: global: investments in electric...,2012-,2021-,2031-,,,,2041-,2012-,unit,annual average,unit 2012- 2021- 2031- 2041-
1,ref,2020,2030,2040,,,,2050,2050,,2012-2050,ref 2020 2030 2040 2050
2,fossil (w/o chp) billion $,1673.6,2205.0,2725.4,,,,2866.8,9470.9,billion $/a,242.8,heat pumps billion $ 131.7 134.3 169.9 184.6
3,nuclear billion $,678.6,740.5,654.9,,,,731.1,2805.1,billion $/a,71.9,deep geothermal billion $ 6.2 0.5 10.1 6.1
4,chp (fossil + renewable) billion $,712.2,651.8,433.5,,,,123.8,1921.3,billion $/a,49.3,solar thermal billion $ 119.2 145.2 216.1 251.4
5,renewables (w/o chp) billion $,2417.8,2274.4,2863.0,,,,2775.8,10331.0,billion $/a,264.9,"biomass billion $ 1,034.4 912.8 427.9 391.7"
6,total billion $,5482.3,5871.7,6676.8,,,,6497.6,24528.3,billion $/a,628.9,"total billion $ 1,291.5 1,192.8 824.0 833.8"
7,conventional (fossil & nuclear) billion $,2996.6,3512.9,3753.8,,,,3680.7,13944.1,billion $/a,357.5,e[r]
8,renewables billion $,2485.7,2358.8,2922.9,,,,2816.9,10584.3,billion $/a,271.4,"heat pumps billion $ 332.5 1,325.4 1,951.0 2,2..."
9,biomass billion $,209.1,229.3,281.1,,,,265.3,984.8,billion $/a,25.3,"deep geothermal billion $ 122.9 252.4 1,133.5 ..."


In [181]:
table24 = tabula.read_pdf('Energy-Revolution-2015-Full.pdf', pages = fin_page, area = (190, middle, 190 + 164, right), columns = [380, 406, 435, 465, 490, 520, 545])
t24header = table24.iloc[8, -9:]
table24 = table24.iloc[8:,-9:].dropna(axis =1, thresh = 7).dropna(axis = 0, thresh = 3).reset_index(drop = True)
#replaceHeader(table1.columns, table1)
#table24 = cleanOverlap(table24, financial = True)
fillBlanks(table24, year_row = 1)
replaceHeader(t24header, table24)
topCombine(table24)



Unnamed: 0,0,1,2,3,4,5,6,7
0,table 13.1.24: global: total employment in the...,2015.0,2020.0,2025.0,2030.0,2020.0,2025.0,2030.0
1,coal,9.76,9.67,8.63,7.7,4.8,3.28,1.97
2,"gas, oil & diesel",3.58,4.16,4.56,4.67,4.0,4.18,
3,nuclear,0.73,0.86,0.83,0.74,0.52,0.52,0.51
4,renewable,14.62,15.41,15.59,14.84,26.91,38.68,41.56
5,total jobs,28.69,30.11,29.62,27.95,36.24,46.65,48.01
6,construction and installation,4.86,5.09,4.6,3.95,8.32,14.59,15.56
7,manufacturing,2.38,2.44,2.23,1.91,5.49,8.87,9.58
8,operations and maintenance,3.23,3.94,4.3,4.27,4.82,6.96,9.0
9,fuel supply (domestic),17.76,18.12,17.93,17.27,17.27,15.97,13.67


In [None]:
financial_page = pd.concat([table22, table23, table24], axis=0)

In [None]:
financial_page

## Storage

In [167]:
#initialize
storage = page.copy()

#### adding

In [None]:
11,12,13, c.0
15,16,17, c.1
19,20,21, c.2
23,24,25, c.2
27,28,29, c.3
31,32, c.4
33 c.5

In [468]:
storage = pd.concat([storage,page], axis=0)

In [None]:
storage = pd.concat([storage,financial_page, axis=0)

In [433]:
storage.to_csv("storage29.csv")