# Data cleaning
## Data structure

In [1]:
import re
import numpy as np
import pandas as pd

In [2]:
def get_category(x, n):
    try:
        return x.split('|')[n]
    except:
        return ''

In [3]:
def change_units(x):
    try:
        return x.split(' ')[0][:2] + 'e+' + x.split(' ')[0][2:] + ' ' + x.split(' ')[1]
    except:
        return ''

## 1. Energy data
### 1.1 Energy Balance
**Datasets:**
- Energy Balance of China (2017)
    - Energy Balance of China (Physical Quantity) -2017
    - Energy Balance of China (Standard Quantity) -2017
- Energy Balance by Region (2017)
    - Energy Balance Table by Region
- Energy Balance by Fuel Type
    - Overall Energy Balance Sheet
    - Coal Balance Sheet
    - Coke Balance Sheet
    - Petroleum Balance Sheet
    - Crude Oil Balance Sheet
    - Fuel Oil Balance Sheet
    - Gasoline  Balance Sheet
    - Kerosene  Balance Sheet
    - Diesel Oil  Balance Sheet
    - LPG Balance Sheet
    - Natural Gas  Balance Sheet
    - Electricity Balance Sheet

#### Read the data
**English**

In [1023]:
df_en = pd.read_excel('../data/Energy data/0Energy Balance by type and by regionƒ‹‘¥∆Ω∫‚±Ì/Energy Balance by region and by energy type-en_final-new.xls')
df_en

Unnamed: 0,Table Name,Year,Region/Country/Province,Indicator,Unit,Value,Unnamed: 6
0,Energy Balance of China (Physical Quantity) -2017,,,,,,
1,,2017.0,China,Input(-)&Output(+)ofTransformation|OtherWashed...,104 tons,14563.7,
2,,2017.0,China,Input(-)&Output(+)ofTransformation|OtherCoking...,104 tons,1077.41,
3,,2017.0,China,Input(-)&Output(+)ofTransformation|Other Gas,108 cu.m,165.197,
4,,2017.0,China,Input(-)&Output(+)ofTransformation|OtherPetrol...,104 tons,3816.85,
...,...,...,...,...,...,...,...
14598,,2017.0,Xinjiang,TotalFinalConsumption|ResidentialConsumption|U...,1010 kJ,15616.9,
14599,,2017.0,Xinjiang,TotalFinalConsumption|ResidentialConsumption|U...,108 cu.m,1.88,
14600,,2017.0,Xinjiang,TotalFinalConsumption|ResidentialConsumption|U...,104 tons,72.6,
14601,,2017.0,Xinjiang,TotalFinalConsumption|ResidentialConsumption|U...,108 kW•h,56.55,


**Chinese**

In [1024]:
df_cn = pd.read_excel('../data/Energy data/0Energy Balance by type and by regionƒ‹‘¥∆Ω∫‚±Ì/Energy Balance by region and by energy type-cn-new.xls')
df_cn

Unnamed: 0,表格名称,国家/省份,年份,指标,单位,数值,Unnamed: 6
0,中国能源平衡表（实物量）-2017,,,,,,
1,,中国,2017.0,加工转换投入(-)产出(+)量|其他洗煤,万吨,14563.7,
2,,中国,2017.0,加工转换投入(-)产出(+)量|其他焦化产品,万吨,1077.41,
3,,中国,2017.0,加工转换投入(-)产出(+)量|其他煤气,亿立方米,165.197,
4,,中国,2017.0,加工转换投入(-)产出(+)量|其他石油制品,万吨,3816.85,
...,...,...,...,...,...,...,...
14598,,新疆,2017.0,终端消费量|生活消费|城镇|热力,万百万千焦,15616.9,
14599,,新疆,2017.0,终端消费量|生活消费|城镇|焦炉煤气,亿立方米,1.88,
14600,,新疆,2017.0,终端消费量|生活消费|城镇|煤合计,万吨,72.6,
14601,,新疆,2017.0,终端消费量|生活消费|城镇|电力,亿千瓦小时,56.55,


**Get table list**

In [1025]:
table_list_en = list(df_en['Table Name'].unique())
table_list_en.remove(np.nan)
table_list_en

['Energy Balance of China (Physical Quantity) -2017',
 'Energy Balance of China (Standard Quantity) -2017',
 'Overall Energy Balance Sheet',
 'Coal Balance Sheet',
 'Coke Balance Sheet',
 'Petroleum Balance Sheet',
 'Crude Oil Balance Sheet',
 'Fuel Oil Balance Sheet',
 'Gasoline Balance Sheet',
 'Kerosene Balance Sheet',
 'Diesel Oil Balance Sheet',
 'LPG Balance Sheet',
 'Natural Gas Balance Sheet',
 'Electricity Balance Sheet',
 'Energy Balance Table by Region']

In [1026]:
table_list_cn = list(df_cn['表格名称'].unique())
table_list_cn.remove(np.nan)
table_list_cn

['中国能源平衡表（实物量）-2017',
 '中国能源平衡表（标准量）-2017',
 '综合能源平衡表',
 '煤炭平衡表',
 '焦炭平衡表',
 '石油平衡表',
 '原油平衡表',
 '燃料油平衡表',
 '汽油平衡表',
 '煤油平衡表',
 '柴油平衡表',
 '液化石油气平衡表',
 '天然气平衡表',
 '电力平衡表',
 '分省能源平衡表']

**Get Country and Provinces**

In [1027]:
df_en.iloc[3876:]['Region/Country/Province'].unique()

array(['China', nan, 'Beijing', 'Tianjin', 'Hebei', 'Shanxi', 'Mongolia',
       'Liaoning', 'Jilin', 'Heilongjiang', 'Shanghai', 'Jiangsu',
       'Zhejiang', 'Anhui', 'Fujian', 'Jiangxi', 'Shandong', 'Henan',
       'Hubei', 'Hunan', 'Guangdong', 'Guangxi', 'Hainan', 'Chongqing',
       'Sichuan', 'Guizhou', 'Yunnan', 'Shaanxi', 'Gansu', 'Qinghai',
       'Ningxia', 'Xinjiang'], dtype=object)

In [1028]:
df_cn.iloc[3876:]['国家/省份'].unique()

array(['中国', nan, ' 北京', ' 天津', ' 河北', ' 山西', ' 内蒙古', ' 辽宁', ' 吉林',
       ' 黑龙江', ' 上海', '  江苏', '  浙江', '  安徽', '  福建', '  江西', '  山东',
       '  河南', '  湖北', '  湖南', '  广东', '  广西', '  海南', '  重庆', '  四川',
       '  贵州', '  云南', '  陕西', '  甘肃', '  青海', '  宁夏', '  新疆'],
      dtype=object)

#### Rearange tables 

In [1029]:
table_index_en = {}
for table in table_list_en:
    table_index_en[table] = df_en[df_en['Table Name'] == table].index[0]
table_index_en

{'Energy Balance of China (Physical Quantity) -2017': 0,
 'Energy Balance of China (Standard Quantity) -2017': 520,
 'Overall Energy Balance Sheet': 1080,
 'Coal Balance Sheet': 1379,
 'Coke Balance Sheet': 1706,
 'Petroleum Balance Sheet': 1972,
 'Crude Oil Balance Sheet': 2016,
 'Fuel Oil Balance Sheet': 2271,
 'Gasoline Balance Sheet': 2556,
 'Kerosene Balance Sheet': 2753,
 'Diesel Oil Balance Sheet': 2953,
 'LPG Balance Sheet': 3232,
 'Natural Gas Balance Sheet': 3428,
 'Electricity Balance Sheet': 3613,
 'Energy Balance Table by Region': 3878}

In [1030]:
table_index_cn = {}
for table in table_list_cn:
    table_index_cn[table] = df_cn[df_cn['表格名称'] == table].index[0]
table_index_cn

{'中国能源平衡表（实物量）-2017': 0,
 '中国能源平衡表（标准量）-2017': 520,
 '综合能源平衡表': 1080,
 '煤炭平衡表': 1379,
 '焦炭平衡表': 1706,
 '石油平衡表': 1972,
 '原油平衡表': 2016,
 '燃料油平衡表': 2271,
 '汽油平衡表': 2556,
 '煤油平衡表': 2753,
 '柴油平衡表': 2953,
 '液化石油气平衡表': 3232,
 '天然气平衡表': 3428,
 '电力平衡表': 3613,
 '分省能源平衡表': 3878}

In [1031]:
table_region_en = {'Energy Balance of China (Physical Quantity) -2017': 'Country',
 'Energy Balance of China (Standard Quantity) -2017': 'Country',
 'Overall Energy Balance Sheet': 'Country',
 'Coal Balance Sheet': 'Country',
 'Coke Balance Sheet': 'Country',
 'Petroleum Balance Sheet': 'Country',
 'Crude Oil Balance Sheet': 'Country',
 'Fuel Oil Balance Sheet': 'Country',
 'Gasoline Balance Sheet': 'Country',
 'Kerosene Balance Sheet': 'Country',
 'Diesel Oil Balance Sheet': 'Country',
 'LPG Balance Sheet': 'Country',
 'Natural Gas Balance Sheet': 'Country',
 'Electricity Balance Sheet': 'Country',
 'Energy Balance Table by Region': 'Province'}

table_region_cn = {'中国能源平衡表（实物量）-2017':'国家',
 '中国能源平衡表（标准量）-2017':'国家',
 '综合能源平衡表':'国家',
 '煤炭平衡表':'国家',
 '焦炭平衡表':'国家',
 '石油平衡表':'国家',
 '原油平衡表':'国家',
 '燃料油平衡表':'国家',
 '汽油平衡表':'国家',
 '煤油平衡表':'国家',
 '柴油平衡表':'国家',
 '液化石油气平衡表':'国家',
 '天然气平衡表':'国家',
 '电力平衡表':'国家',
 '分省能源平衡表':'省份'}

In [1032]:
group = 'Energy'
sub_group = 'Energy Balance'
source = 'Energy Balance by region and by energy type-en_final-new.xls'

indexes = list(table_index_en.values())
df_all_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
for n, table in enumerate(table_list_en):
    if n < 14:
        df_new = df_en.iloc[indexes[n]+1:indexes[n+1]-1].copy()
    else:
        df_new = df_en.iloc[indexes[n]+1:].copy()
        
    df_new = df_new[['Year', 'Region/Country/Province', 'Indicator', 'Unit', 'Value']]
    df_new = df_new.rename(columns={'Region/Country/Province': 'Region'})
    df_new = df_new.astype({'Year': int})
    df_new['Region Type'] = table_region_en[table]
    
    df_new['Category 1'] = df_new['Indicator'].apply(lambda x: get_category(x, 1))
    df_new['Category 2'] = df_new['Indicator'].apply(lambda x: get_category(x, 2))
    df_new['Category 3'] = df_new['Indicator'].apply(lambda x: get_category(x, 3))
    df_new['Category 4'] = df_new['Indicator'].apply(lambda x: get_category(x, 4))
    df_new['Indicator'] = df_new['Indicator'].apply(lambda x: get_category(x, 0))

    df_new['Group'] = group
    df_new['Sub-group'] = sub_group
    df_new['Source'] = source
    df_new['Dataset'] = table
    df_new = df_new[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    df_new['Unit'] = df_new['Unit'].apply(lambda x: change_units(x))
    
    df_all_en = pd.concat([df_all_en,df_new])
    df_all_en.reset_index(drop=True, inplace=True)
    
#Add spaces between words after )
df_all_en['Indicator'] = df_all_en['Indicator'].apply(lambda x: x.replace(')', ') '))
for column in ['Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4']:
    df_all_en[column] = df_all_en[column].apply(lambda x: x.replace('for', ' for'))
    df_all_en[column] = df_all_en[column].apply(lambda x: x.replace('by', ' by'))

#Add spaces between words starting with capital letters
df_all_en['Indicator'] = df_all_en['Indicator'].apply(lambda x: re.sub("([A-Z])", " \\1", x).strip())
df_all_en['Category 1'] = df_all_en['Category 1'].apply(lambda x: re.sub("([A-Z])", " \\1", x).strip())
df_all_en['Category 2'] = df_all_en['Category 2'].apply(lambda x: re.sub("([A-Z])", " \\1", x).strip())
df_all_en['Category 3'] = df_all_en['Category 3'].apply(lambda x: re.sub("([A-Z])", " \\1", x).strip())
df_all_en['Category 4'] = df_all_en['Category 4'].apply(lambda x: re.sub("([A-Z])", " \\1", x).strip())

df_all_en

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Balance,Energy Balance by region and by energy type-en...,Energy Balance of China (Physical Quantity) -2017,Input(-) & Output(+) of Trans formation,Other Washed Coal,,,,China,Country,2017,14563.7,10e+4 tons
1,Energy,Energy Balance,Energy Balance by region and by energy type-en...,Energy Balance of China (Physical Quantity) -2017,Input(-) & Output(+) of Trans formation,Other Coking Products,,,,China,Country,2017,1077.41,10e+4 tons
2,Energy,Energy Balance,Energy Balance by region and by energy type-en...,Energy Balance of China (Physical Quantity) -2017,Input(-) & Output(+) of Trans formation,Other Gas,,,,China,Country,2017,165.197,10e+8 cu.m
3,Energy,Energy Balance,Energy Balance by region and by energy type-en...,Energy Balance of China (Physical Quantity) -2017,Input(-) & Output(+) of Trans formation,Other Petroleum Products,,,,China,Country,2017,3816.85,10e+4 tons
4,Energy,Energy Balance,Energy Balance by region and by energy type-en...,Energy Balance of China (Physical Quantity) -2017,Input(-) & Output(+) of Trans formation,Other Energy,,,,China,Country,2017,-764.35,10e+4 tce
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14569,Energy,Energy Balance,Energy Balance by region and by energy type-en...,Energy Balance Table by Region,Total Final Consumption,Residential Consumption,Urban,Heat,,Xinjiang,Province,2017,15616.9,10e+10 kJ
14570,Energy,Energy Balance,Energy Balance by region and by energy type-en...,Energy Balance Table by Region,Total Final Consumption,Residential Consumption,Urban,Coke Oven Gas,,Xinjiang,Province,2017,1.88,10e+8 cu.m
14571,Energy,Energy Balance,Energy Balance by region and by energy type-en...,Energy Balance Table by Region,Total Final Consumption,Residential Consumption,Urban,Coal Total,,Xinjiang,Province,2017,72.6,10e+4 tons
14572,Energy,Energy Balance,Energy Balance by region and by energy type-en...,Energy Balance Table by Region,Total Final Consumption,Residential Consumption,Urban,Electricity,,Xinjiang,Province,2017,56.55,10e+8 kW•h


In [1033]:
group = '能源'
sub_group = '能源平衡'
source = 'Energy Balance by region and by energy type-cn-new.xls'

indexes = list(table_index_cn.values())
df_all_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
for n, table in enumerate(table_list_cn):
    if n < 14:
        df_new = df_cn.iloc[indexes[n]+1:indexes[n+1]-1].copy()
    else:
        df_new = df_cn.iloc[indexes[n]+1:].copy()

    df_new = df_new[['年份', '国家/省份', '指标', '单位', '数值']]
    df_new = df_new.rename(columns={'年份': 'Year', '国家/省份': 'Region', '指标': 'Indicator', '单位': 'Unit', '数值': 'Value'})
    df_new = df_new.astype({'Year': int})
    df_new['Region Type'] = table_region_cn[table]
    
    df_new['Category 1'] = df_new['Indicator'].apply(lambda x: get_category(x, 1))
    df_new['Category 2'] = df_new['Indicator'].apply(lambda x: get_category(x, 2))
    df_new['Category 3'] = df_new['Indicator'].apply(lambda x: get_category(x, 3))
    df_new['Category 4'] = df_new['Indicator'].apply(lambda x: get_category(x, 4))
    df_new['Indicator'] = df_new['Indicator'].apply(lambda x: get_category(x, 0))

    df_new['Group'] = group
    df_new['Sub-group'] = sub_group
    df_new['Source'] = source
    df_new['Dataset'] = table
    df_new = df_new[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_all_cn = pd.concat([df_all_cn,df_new])
    df_all_cn.reset_index(drop=True, inplace=True)
    
df_all_cn

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源平衡,Energy Balance by region and by energy type-cn...,中国能源平衡表（实物量）-2017,加工转换投入(-)产出(+)量,其他洗煤,,,,中国,国家,2017,14563.7,万吨
1,能源,能源平衡,Energy Balance by region and by energy type-cn...,中国能源平衡表（实物量）-2017,加工转换投入(-)产出(+)量,其他焦化产品,,,,中国,国家,2017,1077.41,万吨
2,能源,能源平衡,Energy Balance by region and by energy type-cn...,中国能源平衡表（实物量）-2017,加工转换投入(-)产出(+)量,其他煤气,,,,中国,国家,2017,165.197,亿立方米
3,能源,能源平衡,Energy Balance by region and by energy type-cn...,中国能源平衡表（实物量）-2017,加工转换投入(-)产出(+)量,其他石油制品,,,,中国,国家,2017,3816.85,万吨
4,能源,能源平衡,Energy Balance by region and by energy type-cn...,中国能源平衡表（实物量）-2017,加工转换投入(-)产出(+)量,其他能源,,,,中国,国家,2017,-764.35,万吨标煤
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14569,能源,能源平衡,Energy Balance by region and by energy type-cn...,分省能源平衡表,终端消费量,生活消费,城镇,热力,,新疆,省份,2017,15616.9,万百万千焦
14570,能源,能源平衡,Energy Balance by region and by energy type-cn...,分省能源平衡表,终端消费量,生活消费,城镇,焦炉煤气,,新疆,省份,2017,1.88,亿立方米
14571,能源,能源平衡,Energy Balance by region and by energy type-cn...,分省能源平衡表,终端消费量,生活消费,城镇,煤合计,,新疆,省份,2017,72.6,万吨
14572,能源,能源平衡,Energy Balance by region and by energy type-cn...,分省能源平衡表,终端消费量,生活消费,城镇,电力,,新疆,省份,2017,56.55,亿千瓦小时


#### Merge tables 
**Energy Balance by Fuel Type:**
- Overall Energy Balance Sheet
- Coal Balance Sheet
- Coke Balance Sheet
- Petroleum Balance Sheet
- Crude Oil Balance Sheet
- Fuel Oil Balance Sheet
- Gasoline  Balance Sheet
- Kerosene  Balance Sheet
- Diesel Oil  Balance Sheet
- LPG Balance Sheet
- Natural Gas  Balance Sheet
- Electricity Balance Sheet

In [1038]:
dataset_name = 'Energy Balance by Fuel Type'
index = list(df_all_en[df_all_en['Dataset'] == 'Overall Energy Balance Sheet'].index)

df_all_en.loc[index[0]:index[-1], 'Category 4'] = df_all_en.loc[index[0]:index[-1], 'Category 3']
df_all_en.loc[index[0]:index[-1], 'Category 3'] = df_all_en.loc[index[0]:index[-1], 'Category 2']
df_all_en.loc[index[0]:index[-1], 'Category 2'] = df_all_en.loc[index[0]:index[-1], 'Category 1']
df_all_en.loc[index[0]:index[-1], 'Category 1'] = 'Overall Energy'
df_all_en.loc[index[0]:index[-1], 'Dataset'] = dataset_name

for fuel in ['Coal', 'Coke', 'Petroleum', 'Crude Oil', 'Fuel Oil', 'Gasoline', 'Kerosene', 'Diesel Oil', 'LPG', 'Natural Gas', 'Electricity']:
    index = list(df_all_en[df_all_en['Dataset'] == f'{fuel} Balance Sheet'].index)

    df_all_en.loc[index[0]:index[-1], 'Indicator'] = df_all_en.loc[index[0]:index[-1], 'Category 1']
    df_all_en.loc[index[0]:index[-1], 'Category 1'] = fuel
    df_all_en.loc[index[0]:index[-1], 'Dataset'] = dataset_name

In [1039]:
dataset_name = '能源平衡按燃料类型'
index = list(df_all_cn[df_all_cn['Dataset'] == '综合能源平衡表'].index)

df_all_cn.loc[index[0]:index[-1], 'Category 4'] = df_all_cn.loc[index[0]:index[-1], 'Category 3']
df_all_cn.loc[index[0]:index[-1], 'Category 3'] = df_all_cn.loc[index[0]:index[-1], 'Category 2']
df_all_cn.loc[index[0]:index[-1], 'Category 2'] = df_all_cn.loc[index[0]:index[-1], 'Category 1']
df_all_cn.loc[index[0]:index[-1], 'Category 1'] = '综合能源'
df_all_cn.loc[index[0]:index[-1], 'Dataset'] = dataset_name

for fuel in ['煤炭', '焦炭', '石油', '原油', '燃料油', '汽油', '煤油', '柴油', '液化石油气', '天然气', '电力']:
    index = list(df_all_cn[df_all_cn['Dataset'] == f'{fuel}平衡表'].index)

    df_all_cn.loc[index[0]:index[-1], 'Indicator'] = df_all_cn.loc[index[0]:index[-1], 'Category 1']
    df_all_cn.loc[index[0]:index[-1], 'Category 1'] = fuel
    df_all_cn.loc[index[0]:index[-1], 'Dataset'] = dataset_name

**Energy Balance of China (2017):**
- Energy Balance of China (Physical Quantity) -2017
- Energy Balance of China (Standard Quantity) -2017

In [1040]:
dataset_name = 'Energy Balance of China (2017)'

for quantity in ['Physical Quantity', 'Standard Quantity']:
    index = list(df_all_en[df_all_en['Dataset'] == f'Energy Balance of China ({quantity}) -2017'].index)

    df_all_en.loc[index[0]:index[-1], 'Category 4'] = df_all_en.loc[index[0]:index[-1], 'Category 3']
    df_all_en.loc[index[0]:index[-1], 'Category 3'] = df_all_en.loc[index[0]:index[-1], 'Category 2']
    df_all_en.loc[index[0]:index[-1], 'Category 2'] = df_all_en.loc[index[0]:index[-1], 'Category 1']
    df_all_en.loc[index[0]:index[-1], 'Category 1'] = quantity
    df_all_en.loc[index[0]:index[-1], 'Dataset'] = dataset_name

In [1041]:
dataset_name = '中国的能源平衡 (2017)'

for quantity in ['实物量', '标准量']:
    index = list(df_all_cn[df_all_cn['Dataset'] == f'中国能源平衡表（{quantity}）-2017'].index)

    df_all_cn.loc[index[0]:index[-1], 'Category 4'] = df_all_cn.loc[index[0]:index[-1], 'Category 3']
    df_all_cn.loc[index[0]:index[-1], 'Category 3'] = df_all_cn.loc[index[0]:index[-1], 'Category 2']
    df_all_cn.loc[index[0]:index[-1], 'Category 2'] = df_all_cn.loc[index[0]:index[-1], 'Category 1']
    df_all_cn.loc[index[0]:index[-1], 'Category 1'] = quantity
    df_all_cn.loc[index[0]:index[-1], 'Dataset'] = dataset_name

**Energy Balance by Region (2017):**
- Energy Balance Table by Region

In [1042]:
dataset_name = 'Energy Balance by Region (2017)'
index = list(df_all_en[df_all_en['Dataset'] == f'Energy Balance Table by Region'].index)
df_all_en.loc[index[0]:index[-1], 'Dataset'] = dataset_name

In [1043]:
dataset_name = '区域能源平衡 (2017)'
index = list(df_all_cn[df_all_cn['Dataset'] == f'分省能源平衡表'].index)
df_all_cn.loc[index[0]:index[-1], 'Dataset'] = dataset_name

**Correct indicators for Energy Balance by Fuel Type**

In [1077]:
def change_indicator(x):
    if x == '可供量':
        return '可供消费的能源总量'
    elif x == '消费量':
        return '能源消费总量'
    else:
        return x
df_all_cn['Indicator'] = df_all_cn['Indicator'].apply(lambda x: change_indicator(x))

### Save tables

In [1080]:
df_all_en.to_csv('../data/Clean_data/Energy/0_Energy_Balance/Energy_Balance_en.csv')
df_all_cn.to_csv('../data/Clean_data/Energy/0_Energy_Balance/Energy_Balance_cn.csv')

### 1.2 Energy Consumption
**Datasets:**
- Energy Consumption by Sector
    - Consumption of Coal and Its Main Varieties by Sector
    - Consumption of Coke and Its Main Varieties by Sector
    - Consumption of Crude Oil and Its Main Varieties by Sector
    - Consumption of Diesel Oil and Its Main Varieties by Sector
    - Consumption of Electricity and Its Main Varieties by Sector
    - Consumption of Fuel Oil and Its Main Varieties by Sector
    - Consumption of Gasoline and Its Main Varieties by Sector
    - Consumption of Kerosene and Its Main Varieties by Sector
    - Consumption of Natural Gas and Its Main Varieties by Sector
    - Consumption of Total Energy and Its Main Varieties by Sector
- Energy Consumption by Region (2017)
    - Energy Consumption by Region-2017
- Energy Consumption Per Capita
    - Energy Consumption Per Capita
- Energy Consumption by Industrial Sector (2017)
    - Final Energy Consumption by Industrial Sector (Physical Quantity) -2017
    - Final Energy Consumption by Industrial Sector (Standard Quantity) -2017
- Growth Rate of Energy Consumption Compared With Growth Rate of GDP
    - Growth Rate of Energy Consumption Compared With Growth Rate of GDP
- Residential Energy Consumption Per Capita
    - Residential Energy Consumption Per Capita
- Total Energy Consumption and Composition
    - Total Energy Consumption and Composition.

#### 1.2.1 Energy Consumption by Sector
**Rearange tables**

In [743]:
fuel_types = {"Coal": '1Consumption of Coal and Its Main Varieties by Sector.xls',
             "Coke": '1Consumption of Coke and Its Main Varieties by Sector.xls',
             "Crude Oil": '1Consumption of Crude Oil and Its Main Varieties by Sector.xls',
             "Diesel Oil": '1Consumption of Diesel Oil and Its Main Varieties by Sector.xls',
             "Electricity": '1Consumption of Electricity and Its Main Varieties by Sector.xls',
             "Fuel Oil": '1Consumption of Fuel Oil and Its Main Varieties by Sector.xls',
             "Gasoline": '1Consumption of Gasoline and Its Main Varieties by Sector.xls',
             "Kerosene": '1Consumption of Kerosene and Its Main Varieties by Sector.xls',
             "Natural Gas": '1Consumption of Natural Gas and Its Main Varieties by Sector.xls',
             "Total Energy": '1Consumption of Total Energy and Its Main Varieties by Sector.xls'}

fuel_types_dict = {"Coal": '煤炭',
                   "Coke": '焦炭',
                   "Crude Oil": '原油',
                   "Diesel Oil": '柴油',
                   "Electricity": '电力',
                   "Fuel Oil": '燃料油',
                   "Gasoline": '汽油',
                   "Kerosene": '煤油',
                   "Natural Gas": '天然气',
                   "Total Energy": '能源合计'}

In [744]:
df_all_sector_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_all_sector_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for fuel_type_en in fuel_types.keys():
    fuel_type_cn = fuel_types_dict[fuel_type_en]
    print(fuel_type_en)
    
    df_sector = pd.read_excel(f'../data/Energy data/1Energy Consumptionƒ‹‘¥œ˚∑—/{fuel_types[fuel_type_en]}', skiprows=3)
    df_sector.drop(index=0, inplace=True)
    
    # Solve long names going to the next row
    df_sector['行    业'] = df_sector['行    业'].fillna('')
    for index in list(df_sector[df_sector['行    业'] == ''].index):
        df_sector.loc[index-1, 'Sector'] = df_sector.loc[index-1, 'Sector']+df_sector.loc[index, 'Sector']
    df_sector = df_sector[df_sector['行    业'] != ''].reset_index().drop(columns='index')
    
    df_sector_en = df_sector.drop(columns='行    业').rename(columns={'Sector': 'Category'})
    df_sector_cn = df_sector.drop(columns='Sector').rename(columns={'行    业': 'Category'})
    
    #Add category columns
    for column in ['Category 2', 'Category 3', 'Category 4']:
        df_sector_en[column] = ''
        df_sector_cn[column] = ''
        
    category1_en_list = []; category1_cn_list = []
    category2_en_list = []; category2_cn_list = []
    category3_en_list = []; category3_cn_list = []
    
    for n, category_en in enumerate(list(df_sector_en['Category'])):
        if  category_en[0] != ' ':
            category1_en = category_en
            category1_cn = df_sector_cn['Category'].iloc[n]
            
            category1_en_list.append(category1_en); category1_cn_list.append(category1_cn)
            category2_en_list.append('All'); category2_cn_list.append('所有')
            category3_en_list.append(''); category3_cn_list.append('')
        elif  (category_en[:2] == '  ') and category_en[:3] != '   ':
            category2_en = category_en
            category2_cn = df_sector_cn['Category'].iloc[n]
            
            category1_en_list.append(category1_en); category1_cn_list.append(category1_cn)
            category2_en_list.append(category2_en); category2_cn_list.append(category2_cn)
            category3_en_list.append('All'); category3_cn_list.append('所有')   
        elif  (category_en[:3] == '   ') and category_en[:4] != '    ':
            category3_en = category_en
            category3_cn = df_sector_cn['Category'].iloc[n]
            
            category1_en_list.append(category1_en); category1_cn_list.append(category1_cn)
            category2_en_list.append(category2_en); category2_cn_list.append(category2_cn)
            category3_en_list.append(category3_en); category3_cn_list.append(category3_cn)  
            
    df_sector_en['Category 2'] = category1_en_list; df_sector_cn['Category 2'] = category1_cn_list
    df_sector_en['Category 3'] = category2_en_list; df_sector_cn['Category 3'] = category2_cn_list
    df_sector_en['Category 4'] = category3_en_list; df_sector_cn['Category 4'] = category3_cn_list
    
    df_sector_en.drop(columns='Category', inplace = True)
    df_sector_cn.drop(columns='Category', inplace = True)
    
    #Unpivot year columns
    df_sector_en = df_sector_en.melt(list(df_sector_en.columns)[12:], var_name='Year', value_name='Value')
    df_sector_cn = df_sector_cn.melt(list(df_sector_cn.columns)[12:], var_name='Year', value_name='Value')
    
    #Add units
    df_sector_en['Unit'] = '10e+4 tons'
    df_sector_cn['Unit'] = '万吨'
    
    #Change value type
    df_sector_en.fillna(-1, inplace=True)
    df_sector_cn.fillna(-1, inplace=True)
    df_sector_en = df_sector_en.astype({'Value': int})
    df_sector_cn = df_sector_cn.astype({'Value': int})
    df_sector_en.replace(-1, np.nan, inplace=True)
    
    #Add additional columns
    df_sector_en['Group'] = 'Energy'
    df_sector_en['Sub-group'] = 'Energy Consumption'
    df_sector_en['Source'] = file_name
    df_sector_en['Dataset'] = 'Energy Consumption by Sector'
    df_sector_en['Indicator'] = 'Energy Consumption'
    df_sector_en['Category 1'] = fuel_type_en
    df_sector_en['Region'] = 'China'
    df_sector_en['Region Type'] = 'Country'
    df_sector_en = df_sector_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_sector_cn['Group'] = '能源'
    df_sector_cn['Sub-group'] = '能源消耗'
    df_sector_cn['Source'] = file_name
    df_sector_cn['Dataset'] = '部门能源消耗'
    df_sector_cn['Indicator'] = '能源消耗'
    df_sector_cn['Category 1'] = fuel_type_cn
    df_sector_cn['Region'] = '中国'
    df_sector_cn['Region Type'] = '国家'
    df_sector_cn = df_sector_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    # Concatenate DataFrames by fuel type
    df_all_sector_en = pd.concat([df_all_sector_en, df_sector_en])
    df_all_sector_cn = pd.concat([df_all_sector_cn, df_sector_cn])

Coal
Coke
Crude Oil
Diesel Oil
Electricity
Fuel Oil
Gasoline
Kerosene
Natural Gas
Total Energy


In [745]:
df_all_sector_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Consumption,Loss Caused by Natural Disasters by region.xls,Energy Consumption by Sector,Energy Consumption,Coal,Total Consumption ...,All,,China,Country,1995,137676.0,10e+4 tons


In [746]:
df_all_sector_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源消耗,Loss Caused by Natural Disasters by region.xls,部门能源消耗,能源消耗,煤炭,消 费 总 量,所有,,中国,国家,1995,137676,万吨


#### 1.2.2 Energy Consumption by Region (2017)
**Rearange tables**

In [747]:
file_name = '1Energy Consumption by Region-2017.xls'
df_region = pd.read_excel(f'../data/Energy data/1Energy Consumptionƒ‹‘¥œ˚∑—/{file_name}', skiprows=3)

## English
df_region_en = df_region.drop(columns='地  区')
df_region_en.drop(index=[0], inplace=True)
df_region_en['Region'] = df_region_en['Region'].fillna('')
df_region_en.columns = [x+ ' ' + y for x, y in zip(list(df_region_en.iloc[0]), list(df_region_en.iloc[1]))]
df_region_en.rename(columns={' ': 'Region'}, inplace=True)
df_region_en.drop(index=[1,2], inplace=True)
df_region_en.reset_index(inplace=True)
df_region_en.drop(columns='index', inplace=True)

#Unpivot year columns
df_region_en = df_region_en.melt(list(df_region_en.columns)[0], var_name='Category 1', value_name='Value')

#Separate Unit from category
df_region_en['Unit'] = df_region_en['Category 1'].apply(lambda x: x.split('(')[1][:-1])
df_region_en['Category 1'] = df_region_en['Category 1'].apply(lambda x: x.split('(')[0][:-1])

#Change unit format
df_region_en['Unit'] = df_region_en['Unit'].apply(lambda x: change_units(x))

#Add additional columns
df_region_en['Group'] = 'Energy'
df_region_en['Sub-group'] = 'Energy Consumption'
df_region_en['Source'] = file_name
df_region_en['Dataset'] = 'Energy Consumption by Region (2017)'
df_region_en['Indicator'] = 'Energy Consumption'
df_region_en['Category 2'] = ''
df_region_en['Category 3'] = ''
df_region_en['Category 4'] = ''
df_region_en['Year'] = 2017
df_region_en['Region Type'] = 'Province'
df_region_en = df_region_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

## Chinese
df_region_cn = df_region.drop(columns='Region')
df_region_cn.drop(index=[1,2], inplace=True)
df_region_cn['地  区'] = df_region_cn['地  区'].fillna('')
df_region_cn.columns = [x+ ' ' + y for x, y in zip(list(df_region_cn.columns), list(df_region_cn.iloc[0]))]
df_region_cn.drop(index=[0], inplace=True)
df_region_cn.rename(columns={'地  区 ':'Region'}, inplace=True)
df_region_cn.reset_index(inplace=True)
df_region_cn.drop(columns='index', inplace=True)

#Unpivot year columns
df_region_cn = df_region_cn.melt(list(df_region_cn.columns)[0], var_name='Category 1', value_name='Value')

#Separate Unit from category
df_region_cn['Unit'] = df_region_cn['Category 1'].apply(lambda x: x.split('(')[1][:-1])
df_region_cn['Category 1'] = df_region_cn['Category 1'].apply(lambda x: x.split('(')[0][:-1])

#Add additional columns
df_region_cn['Group'] = '能源'
df_region_cn['Sub-group'] = '能源消耗'
df_region_cn['Source'] = file_name
df_region_cn['Dataset'] = '各地区能源消耗 (2017)'
df_region_cn['Indicator'] = '能源消耗'
df_region_cn['Category 2'] = ''
df_region_cn['Category 3'] = ''
df_region_cn['Category 4'] = ''
df_region_cn['Year'] = 2017
df_region_cn['Region Type'] = '省份'
df_region_cn = df_region_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [748]:
df_region_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Consumption,1Energy Consumption by Region-2017.xls,Energy Consumption by Region (2017),Energy Consumption,Total Energy,,,,Beijing,Province,2017,7132.84,10e+4 tce


In [749]:
df_region_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源消耗,1Energy Consumption by Region-2017.xls,各地区能源消耗 (2017),能源消耗,能源合计,,,,北 京,省份,2017,7132.84,万吨标煤


#### 1.2.3 Energy Consumption Per Capita
**Rearange tables**

In [750]:
file_name = '1Energy Consumption Per Capita.xls'
df_capita = pd.read_excel(f'../data/Energy data/1Energy Consumptionƒ‹‘¥œ˚∑—/{file_name}', skiprows=5)

## English
df_capita_en = df_capita.drop(index=[0])
df_capita_en['年  份'] = df_capita_en['年  份'].fillna('')
df_capita_en.columns = [x+ ' ' + y for x, y in zip(list(df_capita_en.iloc[0]), list(df_capita_en.iloc[1]))]
df_capita_en.rename(columns={'Year ': 'Year'}, inplace=True)
df_capita_en.drop(index=[1,2], inplace=True)
df_capita_en.reset_index(inplace=True)
df_capita_en.drop(columns='index', inplace=True)

#Unpivot fuel type columns
df_capita_en = df_capita_en.melt(list(df_capita_en.columns)[0], var_name='Category 1', value_name='Value')

#Separate Unit from category
df_capita_en['Unit'] = df_capita_en['Category 1'].apply(lambda x: x.split('(')[1][:-1])
df_capita_en['Category 1'] = df_capita_en['Category 1'].apply(lambda x: x.split('(')[0][:-1])

#Add additional columns
df_capita_en['Group'] = 'Energy'
df_capita_en['Sub-group'] = 'Energy Consumption'
df_capita_en['Source'] = file_name
df_capita_en['Dataset'] = 'Energy Consumption Per Capita'
df_capita_en['Indicator'] = 'Energy Consumption'
df_capita_en['Category 2'] = ''
df_capita_en['Category 3'] = ''
df_capita_en['Category 4'] = ''
df_capita_en['Region'] = 'China'
df_capita_en['Region Type'] = 'Country'
df_capita_en = df_capita_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

## Chinese
df_capita_cn = df_capita.drop(index=[1,2])
df_capita_cn['年  份'] = df_capita_cn['年  份'].fillna('')
df_capita_cn.columns = [x+ ' ' + y for x, y in zip(list(df_capita_cn.columns), list(df_capita_cn.iloc[0]))]
df_capita_cn.rename(columns={'年  份 ': 'Year'}, inplace=True)
df_capita_cn.drop(index=[0], inplace=True)
df_capita_cn.reset_index(inplace=True)
df_capita_cn.drop(columns='index', inplace=True)

#Unpivot fuel type columns
df_capita_cn = df_capita_cn.melt(list(df_capita_cn.columns)[0], var_name='Category 1', value_name='Value')

#Separate Unit from category
df_capita_cn['Unit'] = df_capita_cn['Category 1'].apply(lambda x: x.split('(')[1][:-1])
df_capita_cn['Category 1'] = df_capita_cn['Category 1'].apply(lambda x: x.split('(')[0][:-1])

#Add additional columns
df_capita_cn['Group'] = '能源'
df_capita_cn['Sub-group'] = '能源消耗'
df_capita_cn['Source'] = file_name
df_capita_cn['Dataset'] = '人均能源消费量'
df_capita_cn['Indicator'] = '能源消耗'
df_capita_cn['Category 2'] = ''
df_capita_cn['Category 3'] = ''
df_capita_cn['Category 4'] = ''
df_capita_cn['Region'] = '中国'
df_capita_cn['Region Type'] = '国家'
df_capita_cn = df_capita_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [751]:
df_capita_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Consumption,1Energy Consumption Per Capita.xls,Energy Consumption Per Capita,Energy Consumption,Total Energy,,,,China,Country,1980,614.277,kgce


In [752]:
df_capita_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源消耗,1Energy Consumption Per Capita.xls,人均能源消费量,能源消耗,能源总量,,,,中国,国家,1980,614.277,千克标准煤


#### 1.2.4 Energy Consumption by Industrial Sector (2017)
**Rearange tables**

In [753]:
quantity_types = {'Physical Quantity': '1Final Energy Consumption by Industrial Sector (Physical Quantity) -2017.xls',
                  "Standard Quantity": '1Final Energy Consumption by Industrial Sector (Standard Quantity) -2017.xls'}

quantity_types_dict = {"Physical Quantity": '实物量',
                       "Standard Quantity": '标准量'}

In [754]:
df_all_industrial_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_all_industrial_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for quantity_en in quantity_types.keys():
    quantity_cn = quantity_types_dict[quantity_en]
    print(quantity_en)
    
    df_industrial = pd.read_excel(f'../data/Energy data/1Energy Consumptionƒ‹‘¥œ˚∑—/{quantity_types[quantity_en]}', skiprows=3)
    
    ## English
    df_industrial_en = df_industrial.drop(index=[0])
    df_industrial_en = df_industrial_en.drop(columns='行    业')
    df_industrial_en['Item'] = df_industrial_en['Item'].fillna('')
    df_industrial_en.columns = [x+ ' ' + y for x, y in zip(list(df_industrial_en.iloc[0]), list(df_industrial_en.iloc[1]))]
    df_industrial_en.rename(columns={' ': 'Category'}, inplace=True)
    df_industrial_en.drop(index=[1,2], inplace=True)
    df_industrial_en.reset_index(inplace=True)
    df_industrial_en.drop(columns='index', inplace=True)
    
    ## Chinese
    df_industrial_cn = df_industrial.drop(index=[1,2])
    df_industrial_cn = df_industrial_cn.drop(columns='Item')
    df_industrial_cn['行    业'] = df_industrial_cn['行    业'].fillna('')
    df_industrial_cn.columns = [x+ ' ' + y for x, y in zip(list(df_industrial_cn.columns), list(df_industrial_cn.iloc[0]))]
    df_industrial_cn.rename(columns={'行    业 ': 'Category'}, inplace=True)
    df_industrial_cn.drop(index=[0], inplace=True)
    df_industrial_cn.reset_index(inplace=True)
    df_industrial_cn.drop(columns='index', inplace=True)
    
    # Solve long names going to the next row
    for index in list(df_industrial_cn[df_industrial_cn['Category'] == ''].index):
        df_industrial_en.loc[index-1, 'Category'] = df_industrial_en.loc[index-1, 'Category']+df_industrial_en.loc[index, 'Category']
        df_industrial_en.loc[index, 'Category'] = ''
        
    df_industrial_en = df_industrial_en[df_industrial_cn['Category'] != ''].reset_index().drop(columns='index')
    df_industrial_cn = df_industrial_cn[df_industrial_cn['Category'] != ''].reset_index().drop(columns='index')
    
    #Add category columns
    for column in ['Category 3', 'Category 4']:
        df_industrial_en[column] = ''
        df_industrial_cn[column] = ''
        
    category2_en_list = []; category2_cn_list = []
    category3_en_list = []; category3_cn_list = []
    
    for n, category_en in enumerate(list(df_industrial_en['Category'])):
        if  category_en[:2] != '  ':
            category2_en = category_en
            category2_cn = df_industrial_cn['Category'].iloc[n]
            
            category2_en_list.append(category2_en); category2_cn_list.append(category2_cn)
            category3_en_list.append('All'); category3_cn_list.append('所有')
    
        elif  (category_en[:3] == '   ') and category_en[:4] != '   ':
            category3_en = category_en
            category3_cn = df_industrial_cn['Category'].iloc[n]
            
            category2_en_list.append(category2_en); category2_cn_list.append(category2_cn)
            category3_en_list.append(category3_en); category3_cn_list.append(category3_cn)
            
    df_industrial_en['Category 3'] = category2_en_list; df_industrial_cn['Category 3'] = category2_cn_list
    df_industrial_en['Category 4'] = category3_en_list; df_industrial_cn['Category 4'] = category3_cn_list
    
    df_industrial_en.drop(columns='Category', inplace = True)
    df_industrial_cn.drop(columns='Category', inplace = True)
    
    #Unpivot fuel type columns
    df_industrial_en = df_industrial_en.melt(list(df_industrial_en.columns)[-2:], var_name='Category 2', value_name='Value')
    df_industrial_cn = df_industrial_cn.melt(list(df_industrial_cn.columns)[-2:], var_name='Category 2', value_name='Value')
    
    #Separate Unit from category
    df_industrial_en['Unit'] = df_industrial_en['Category 2'].apply(lambda x: x.split('(')[1][:-1])
    df_industrial_en['Category 2'] = df_industrial_en['Category 2'].apply(lambda x: x.split('(')[0][:-1])
    
    df_industrial_cn['Unit'] = df_industrial_cn['Category 2'].apply(lambda x: x.split('(')[1][:-1])
    df_industrial_cn['Category 2'] = df_industrial_cn['Category 2'].apply(lambda x: x.split('(')[0][:-1])
    
    #Change unit format
    df_industrial_en['Unit'] = df_industrial_en['Unit'].apply(lambda x: change_units(x))
    
    #Add additional columns
    df_industrial_en['Group'] = 'Energy'
    df_industrial_en['Sub-group'] = 'Energy Consumption'
    df_industrial_en['Source'] = quantity_types[quantity_en]
    df_industrial_en['Dataset'] = 'Energy Consumption by Industrial Sector (2017)'
    df_industrial_en['Indicator'] = 'Energy Consumption'
    df_industrial_en['Category 1'] = quantity_en
    df_industrial_en['Region'] = 'China'
    df_industrial_en['Region Type'] = 'Country'
    df_industrial_en['Year'] = 2017
    df_industrial_en = df_industrial_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_industrial_cn['Group'] = '能源'
    df_industrial_cn['Sub-group'] = '能源消耗'
    df_industrial_cn['Source'] = quantity_types[quantity_en]
    df_industrial_cn['Dataset'] = '工业分行业终端能源消费量 (2017)'
    df_industrial_cn['Indicator'] = '能源消耗'
    df_industrial_cn['Category 1'] = quantity_cn
    df_industrial_cn['Region'] = '中国'
    df_industrial_cn['Region Type'] = '国家'
    df_industrial_cn['Year'] = '2017'
    df_industrial_cn = df_industrial_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_all_industrial_en = pd.concat([df_all_industrial_en, df_industrial_en])
    df_all_industrial_cn = pd.concat([df_all_industrial_cn, df_industrial_cn])

Physical Quantity
Standard Quantity


In [755]:
df_all_industrial_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Consumption,1Final Energy Consumption by Industrial Sector...,Energy Consumption by Industrial Sector (2017),Energy Consumption,Physical Quantity,Coal Total,Industry ...,All,China,Country,2017,71438,10e+4 tons


In [756]:
df_all_industrial_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源消耗,1Final Energy Consumption by Industrial Sector...,工业分行业终端能源消费量 (2017),能源消耗,实物量,煤合计,工业,所有,中国,国家,2017,71438,万吨


#### 1.2.5 Growth Rate of Energy Consumption Compared With Growth Rate of GDP
**Rearange tables**

In [757]:
def split_indicator(x):
    if x[-1] == '度':
        return '增长速度'
    elif x[-1] == '数':
        return '弹性系数'

In [758]:
def split_category(x):
    if x[-1] == '度':
        return x.split('增长速度')[0]
    elif x[-1] == '数':
        return x.split('弹性系数')[0]

In [759]:
file_name = '1Growth Rate of Energy Consumption Compared With Growth Rate of GDP.xls'
df_growth = pd.read_excel(f'../data/Energy data/1Energy Consumptionƒ‹‘¥œ˚∑—/{file_name}', skiprows=3)

## English
df_growth_en = df_growth.drop(index=[0])
df_growth_en['年 份'] = df_growth_en['年 份'].fillna('')
df_growth_en.columns = [x+ ' ' + y for x, y in zip(list(df_growth_en.iloc[0]), list(df_growth_en.iloc[1]))]
df_growth_en.rename(columns={'Year ': 'Year'}, inplace=True)
df_growth_en.drop(index=[1,2], inplace=True)
df_growth_en.reset_index(inplace=True)
df_growth_en.drop(columns='index', inplace=True)

#Unpivot fuel type columns
df_growth_en = df_growth_en.melt(list(df_growth_en.columns)[0], var_name='Category 1', value_name='Value')

#Separate Unit from category
df_growth_en['Unit'] = df_growth_en['Category 1'].apply(lambda x: x.split('(')[1][:-1])
df_growth_en['Category 1'] = df_growth_en['Category 1'].apply(lambda x: x.split('(')[0][:-1])

#Separate Indicator from Category 1
df_growth_en['Indicator'] = df_growth_en['Category 1'].apply(lambda x: x.split('of')[0][:-1])
df_growth_en['Category 1'] = df_growth_en['Category 1'].apply(lambda x: x.split('of')[1][1:])

#Add additional columns
df_growth_en['Group'] = 'Energy'
df_growth_en['Sub-group'] = 'Energy Consumption'
df_growth_en['Source'] = file_name
df_growth_en['Dataset'] = 'Growth Rate of Energy Consumption Compared With Growth Rate of GDP'
df_growth_en['Category 2'] = ''
df_growth_en['Category 3'] = ''
df_growth_en['Category 4'] = ''
df_growth_en['Region'] = 'China'
df_growth_en['Region Type'] = 'Country'
df_growth_en = df_growth_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

## Chinese
df_growth_cn = df_growth.drop(index=[1,2])
df_growth_cn['年 份'] = df_growth_cn['年 份'].fillna('')
df_growth_cn.columns = [x+ ' ' + y for x, y in zip(list(df_growth_cn.columns), list(df_growth_cn.iloc[0]))]
df_growth_cn.rename(columns={'年 份 ': 'Year'}, inplace=True)
df_growth_cn.drop(index=[0], inplace=True)
df_growth_cn.reset_index(inplace=True)
df_growth_cn.drop(columns='index', inplace=True)

#Unpivot fuel type columns
df_growth_cn = df_growth_cn.melt(list(df_growth_cn.columns)[0], var_name='Category 1', value_name='Value')

#Separate Unit from category
df_growth_cn['Unit'] = df_growth_cn['Category 1'].apply(lambda x: x.split('(')[1][:-1])
df_growth_cn['Category 1'] = df_growth_cn['Category 1'].apply(lambda x: x.split('(')[0][:-1])

#Separate Indicator from Category 1
df_growth_cn['Indicator'] = df_growth_cn['Category 1'].apply(lambda x: split_indicator(x))
df_growth_cn['Category 1'] = df_growth_cn['Category 1'].apply(lambda x: split_category(x))

#Add additional columns
df_growth_cn['Group'] = '能源'
df_growth_cn['Sub-group'] = '能源消耗'
df_growth_cn['Source'] = file_name
df_growth_cn['Dataset'] = '能源消费与国内生产总值增长速度'
df_growth_cn['Category 2'] = ''
df_growth_cn['Category 3'] = ''
df_growth_cn['Category 4'] = ''
df_growth_cn['Region'] = '中国'
df_growth_cn['Region Type'] = '国家'
df_growth_cn = df_growth_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [760]:
df_growth_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Consumption,1Growth Rate of Energy Consumption Compared Wi...,Growth Rate of Energy Consumption Compared Wit...,Growth Rate,GDP,,,,China,Country,1980,7.8,%


In [761]:
df_growth_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源消耗,1Growth Rate of Energy Consumption Compared Wi...,能源消费与国内生产总值增长速度,增长速度,国内生产总值,,,,中国,国家,1980,7.8,%


#### 1.2.6 Residential Energy Consumption Per Capita
**Rearange tables**

In [762]:
file_name = '1Residential Energy Consumption Per Capita.xls'
df_residential = pd.read_excel(f'../data/Energy data/1Energy Consumptionƒ‹‘¥œ˚∑—/{file_name}', skiprows=3)

## English
df_residential_en = df_residential.drop(index=[0])
df_residential_en['年  份'] = df_residential_en['年  份'].fillna('')
df_residential_en.columns = [x+ ' ' + y for x, y in zip(list(df_residential_en.iloc[0]), list(df_residential_en.iloc[1]))]
df_residential_en.rename(columns={'Year ': 'Year'}, inplace=True)
df_residential_en.drop(index=[1,2], inplace=True)
df_residential_en.reset_index(inplace=True)
df_residential_en.drop(columns='index', inplace=True)

#Unpivot fuel type columns
df_residential_en = df_residential_en.melt(list(df_residential_en.columns)[0], var_name='Category 1', value_name='Value')

#Separate Unit from category
df_residential_en['Unit'] = df_residential_en['Category 1'].apply(lambda x: x.split('(')[1][:-1])
df_residential_en['Category 1'] = df_residential_en['Category 1'].apply(lambda x: x.split('(')[0][:-1])

#Add additional columns
df_residential_en['Group'] = 'Energy'
df_residential_en['Sub-group'] = 'Energy Consumption'
df_residential_en['Source'] = file_name
df_residential_en['Dataset'] = 'Residential Energy Consumption Per Capita'
df_residential_en['Indicator'] = 'Energy Consumption'
df_residential_en['Category 2'] = ''
df_residential_en['Category 3'] = ''
df_residential_en['Category 4'] = ''
df_residential_en['Region'] = 'China'
df_residential_en['Region Type'] = 'Country'
df_residential_en = df_residential_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

## Chinese
df_residential_cn = df_residential.drop(index=[1,2])
df_residential_cn['年  份'] = df_residential_cn['年  份'].fillna('')
df_residential_cn.columns = [x+ ' ' + y for x, y in zip(list(df_residential_cn.columns), list(df_residential_cn.iloc[0]))]
df_residential_cn.rename(columns={'年  份 ': 'Year'}, inplace=True)
df_residential_cn.drop(index=[0], inplace=True)
df_residential_cn.reset_index(inplace=True)
df_residential_cn.drop(columns='index', inplace=True)

#Unpivot fuel type columns
df_residential_cn = df_residential_cn.melt(list(df_residential_cn.columns)[0], var_name='Category 1', value_name='Value')

#Separate Unit from category
df_residential_cn['Unit'] = df_residential_cn['Category 1'].apply(lambda x: x.split('(')[1][:-1])
df_residential_cn['Category 1'] = df_residential_cn['Category 1'].apply(lambda x: x.split('(')[0][:-1])

#Add additional columns
df_residential_cn['Group'] = '能源'
df_residential_cn['Sub-group'] = '能源消耗'
df_residential_cn['Source'] = quantity_types[quantity_en]
df_residential_cn['Dataset'] = '人均生活用能量'
df_residential_cn['Indicator'] = '能源消耗'
df_residential_cn['Category 2'] = ''
df_residential_cn['Category 3'] = ''
df_residential_cn['Category 4'] = ''
df_residential_cn['Region'] = '中国'
df_residential_cn['Region Type'] = '国家'
df_residential_cn = df_residential_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [763]:
df_residential_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Consumption,1Residential Energy Consumption Per Capita.xls,Residential Energy Consumption Per Capita,Energy Consumption,Annual Average,,,,China,Country,1980,112.256,kgce


In [764]:
df_residential_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源消耗,1Final Energy Consumption by Industrial Sector...,人均生活用能量,能源消耗,全国人均,,,,中国,国家,1980,112.256,千克标准煤


#### 1.2.7 Total Energy Consumption and Composition
**Rearange tables**

In [765]:
file_name = '1Total Energy Consumption and Composition.xls'
df_composition = pd.read_excel(f'../data/Energy data/1Energy Consumptionƒ‹‘¥œ˚∑—/{file_name}', skiprows=3)

df_calorific_value_tot = df_composition[['Unnamed: 0', '电热当量计算法  calorific value calculation']]
df_calorific_value_per = df_composition[['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7']]

df_coal_equivalent_tot = df_composition[['Unnamed: 0', '发电煤耗计算法  coal equivalent calculation']]
df_coal_equivalent_per = df_composition[['Unnamed: 0', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14']]


category_1_df = {'calorific value calculation': {'Total': df_calorific_value_tot, 'Percentage': df_calorific_value_per},
                 'coal equivalent calculation': {'Total': df_calorific_value_tot, 'Percentage': df_calorific_value_per}}

category_1_dic = {'calorific value calculation': '电热当量计算法',
                 'coal equivalent calculation': '发电煤耗计算法'}

category_2_dic = {'Total Energy Consumption (104 tce)': '能源消费总量 (万吨标准煤)',
                 'As percentage of primary energy consumption (%)': '占能源消费总量的比重 (%)'}

### Total Energy Consumption
df_total_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_total_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for category_1 in category_1_df.keys():
    df_tot = category_1_df[category_1]['Total']
    
    ## English
    df_tot_en = df_tot.drop(index=[0, 1, 4, 5, 6, 7])
    df_tot_en['Unnamed: 0'] = df_tot_en['Unnamed: 0'].fillna('')
    df_tot_en.columns = [x+ ' ' + y for x, y in zip(list(df_tot_en.iloc[0]), list(df_tot_en.iloc[1]))]
    df_tot_en.rename(columns={' ': 'Year'}, inplace=True)
    df_tot_en.drop(index=[2,3], inplace=True)
    df_tot_en.reset_index(inplace=True)
    df_tot_en.drop(columns='index', inplace=True)
    
    #Unpivot fuel type columns
    df_tot_en = df_tot_en.melt(list(df_tot_en.columns)[0], var_name='Category 2', value_name='Value')
    
    #Separate Unit from category
    df_tot_en['Unit'] = df_tot_en['Category 2'].apply(lambda x: x.split('(')[1][:-1])
    df_tot_en['Category 2'] = df_tot_en['Category 2'].apply(lambda x: x.split('(')[0][:-1])
    
    #Change unit format
    df_tot_en['Unit'] = df_tot_en['Unit'].apply(lambda x: change_units(x))
    
    #Add additional columns
    df_tot_en['Group'] = 'Energy'
    df_tot_en['Sub-group'] = 'Energy Consumption'
    df_tot_en['Source'] = file_name
    df_tot_en['Dataset'] = 'Total Energy Consumption and Composition'
    df_tot_en['Indicator'] = 'Energy Consumption'
    df_tot_en['Category 1'] = category_1
    df_tot_en['Category 3'] = ''
    df_tot_en['Category 4'] = ''
    df_tot_en['Region'] = 'China'
    df_tot_en['Region Type'] = 'Country'
    df_tot_en = df_tot_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_total_en = pd.concat([df_total_en, df_tot_en])
    
    ## Chinese
    df_tot_cn = df_tot.drop(index=[2, 3, 4, 5, 6, 7])
    df_tot_cn['Unnamed: 0'] = df_tot_cn['Unnamed: 0'].fillna('')
    df_tot_cn.columns = [x+ ' ' + y for x, y in zip(list(df_tot_cn.iloc[0]), list(df_tot_cn.iloc[1]))]
    df_tot_cn.rename(columns={' 年  份': 'Year'}, inplace=True)
    df_tot_cn.drop(index=[0,1], inplace=True)
    df_tot_cn.reset_index(inplace=True)
    df_tot_cn.drop(columns='index', inplace=True)
    
    #Unpivot fuel type columns
    df_tot_cn = df_tot_cn.melt(list(df_tot_cn.columns)[0], var_name='Category 2', value_name='Value')
    
    #Separate Unit from category
    df_tot_cn['Unit'] = df_tot_cn['Category 2'].apply(lambda x: x.split('(')[1][:-1])
    df_tot_cn['Category 2'] = df_tot_cn['Category 2'].apply(lambda x: x.split('(')[0][:-1])
    
    #Add additional columns
    df_tot_cn['Group'] = '能源'
    df_tot_cn['Sub-group'] = '能源消耗'
    df_tot_cn['Source'] = file_name
    df_tot_cn['Dataset'] = '能源消费总量和构成'
    df_tot_cn['Indicator'] = '能源消耗'
    df_tot_cn['Category 1'] = category_1_dic[category_1]
    df_tot_cn['Category 3'] = ''
    df_tot_cn['Category 4'] = ''
    df_tot_cn['Region'] = '中国'
    df_tot_cn['Region Type'] = '国家'
    df_tot_cn = df_tot_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_total_cn = pd.concat([df_total_cn, df_tot_cn])
    
### Percentage of Primary Energy Consumption
df_percentage_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_percentage_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for category_1 in category_1_df.keys():
    df_per = category_1_df[category_1]['Percentage']

    ## English
    df_per_en = df_per.drop(index=[0, 1, 2, 3, 4, 6])
    df_per_en.fillna('', inplace=True)
    df_per_en.columns = [x+ ' ' + y for x, y in zip(list(df_per_en.iloc[0]), list(df_per_en.iloc[1]))]
    df_per_en.rename(columns={' ': 'Year'}, inplace=True)
    df_per_en.drop(index=[5,7], inplace=True)
    df_per_en.reset_index(inplace=True)
    df_per_en.drop(columns='index', inplace=True)
    
    #Unpivot fuel type columns
    df_per_en = df_per_en.melt(list(df_per_en.columns)[0], var_name='Category 3', value_name='Value')
    
    #Rearange categories
    category_4_en = {'Coal ': '', 'Petroleum ': '', 'Natural Gas ': '',
           'Primary Electricty and Other Energy ': 'All', ' Hydro Power': 'Hydro Power',
           ' Nuclear Power': 'Nuclear Power'}
    
    category_3_en = {'Coal ': 'Coal', 'Petroleum ': 'Petroleum', 'Natural Gas ': 'Natural Gas',
           'Primary Electricty and Other Energy ': 'Primary Electricty and Other Energy', ' Hydro Power': 'Primary Electricty and Other Energy',
           ' Nuclear Power': 'Primary Electricty and Other Energy'}
    
    df_per_en['Category 4'] = df_per_en['Category 3'].apply(lambda x: category_4_en[x])
    df_per_en['Category 3'] = df_per_en['Category 3'].apply(lambda x: category_3_en[x])
    
    #Add additional columns
    df_per_en['Group'] = 'Energy'
    df_per_en['Sub-group'] = 'Energy Consumption'
    df_per_en['Source'] = file_name
    df_per_en['Dataset'] = 'Total Energy Consumption and Composition'
    df_per_en['Indicator'] = 'Energy Consumption'
    df_per_en['Category 1'] = category_1
    df_per_en['Category 2'] = 'percentage of primary energy consumption'
    df_per_en['Region'] = 'China'
    df_per_en['Region Type'] = 'Country'
    df_per_en['Unit'] = '%'
    df_per_en = df_per_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
        
    df_percentage_en = pd.concat([df_percentage_en, df_per_en])
    
    ## Chinese
    df_per_cn = df_per.drop(index=[0, 1, 2, 3, 5, 7])
    df_per_cn.fillna('', inplace=True)
    df_per_cn.columns = [x+ ' ' + y for x, y in zip(list(df_per_cn.iloc[0]), list(df_per_cn.iloc[1]))]
    df_per_cn.rename(columns={' Year': 'Year'}, inplace=True)
    df_per_cn.drop(index=[4,6], inplace=True)
    df_per_cn.reset_index(inplace=True)
    df_per_cn.drop(columns='index', inplace=True)
    
    #Unpivot fuel type columns
    df_per_cn = df_per_cn.melt(list(df_per_cn.columns)[0], var_name='Category 3', value_name='Value')
    
    #Rearange categories
    category_4_cn = {'煤  炭 ': '', '石  油 ': '', '天然气 ': '', '一次电力及 其他能源 ':'所有', ' 水电': '水电', ' 核电': '核电'}
    category_3_cn = {'煤  炭 ': '煤炭', '石  油 ': '石油', '天然气 ': '天然气', '一次电力及 其他能源 ':'一次电力及 其他能源', ' 水电': '一次电力及 其他能源', ' 核电': '一次电力及 其他能源'}
    
    df_per_cn['Category 4'] = df_per_cn['Category 3'].apply(lambda x: category_4_cn[x])
    df_per_cn['Category 3'] = df_per_cn['Category 3'].apply(lambda x: category_3_cn[x])
    
    #Add additional columns
    df_per_cn['Group'] = '能源'
    df_per_cn['Sub-group'] = '能源消耗'
    df_per_cn['Source'] = file_name
    df_per_cn['Dataset'] = '能源消费总量和构成'
    df_per_cn['Indicator'] = '能源消耗'
    df_per_cn['Category 1'] = category_1_dic[category_1]
    df_per_cn['Category 2'] = '占能源消费总量的比重'
    df_per_cn['Region'] = '中国'
    df_per_cn['Region Type'] = '国家'
    df_per_cn['Unit'] = '%'
    df_per_cn = df_per_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_percentage_cn = pd.concat([df_percentage_cn, df_per_cn])
    
df_composition_en = pd.concat([df_total_en, df_percentage_en])
df_composition_cn = pd.concat([df_total_cn, df_percentage_cn])

In [766]:
df_composition_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Consumption,1Total Energy Consumption and Composition.xls,Total Energy Consumption and Composition,Energy Consumption,calorific value calculation,Total Energy Consumption,,,China,Country,1980,58587,10e+4 tce


In [767]:
df_composition_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源消耗,1Total Energy Consumption and Composition.xls,能源消费总量和构成,能源消耗,电热当量计算法,能源消费总量,,,中国,国家,1980,58587,万吨标准煤


#### **Concatenate all datasets**

In [768]:
df_consumption_en = pd.concat([df_all_sector_en, df_region_en, df_capita_en, df_all_industrial_en, df_growth_en, df_residential_en, df_composition_en])
df_consumption_en.reset_index(inplace=True)
df_consumption_en.drop(columns='index', inplace=True)

df_consumption_cn = pd.concat([df_all_sector_cn, df_region_cn, df_capita_cn, df_all_industrial_cn, df_growth_cn, df_residential_cn, df_composition_cn])
df_consumption_cn.reset_index(inplace=True)
df_consumption_cn.drop(columns='index', inplace=True)

In [769]:
df_consumption_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Consumption,Loss Caused by Natural Disasters by region.xls,Energy Consumption by Sector,Energy Consumption,Coal,Total Consumption ...,All,,China,Country,1995,137676,10e+4 tons


In [770]:
df_consumption_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源消耗,Loss Caused by Natural Disasters by region.xls,部门能源消耗,能源消耗,煤炭,消 费 总 量,所有,,中国,国家,1995,137676,万吨


**Save tables**

In [154]:
df_consumption_en.to_csv('../data/Clean_data/Energy/1_Energy_Consumption/Energy_Consumption_en.csv')
df_consumption_cn.to_csv('../data/Clean_data/Energy/1_Energy_Consumption/Energy_Consumption_cn.csv')

### 1.3 Energy Supply
**Datasets:**
- Basic Statistics on Heating Supply in Cities by Region
    - Basic Statistics on Heating Supply in Cities by Region
- Basic Statistics on Supply in Cities by Fuel Type and Region
    - Basic Statistics on Supply of Coal Gas in Cities by Region
    - Basic Statistics on Supply of LPG in Cities by Region
    - Basic Statistics on Supply of Natural Gas in Cities by Region
- Production by Fuel Type and Region    
    - Coke Production by Region
    - Crude Oil Production by Region
    - Diesel Oil Production by Region
    - Fuel Oil Production by Region
    - Gasoline Production by Region
    - Kerosene Production by Region
    - Natural Gas Production by Region
    - Raw Coal Production by Region
- Energy Production Per Capita
    - Energy Production Per Capita
- Primary Energy Production and Composition    
    - Primary Energy Production and Composition
- Growth Rate of Energy Production Compared With Growth Rate of GDP    
    - Growth Rate of Energy Production Compared With Growth Rate of GDP

#### 1.3.1 Basic Statistics on Heating Supply in Cities by Region
**Rearange tables**

In [281]:
file_name = '2Basic Statistics on Heating Supply in Cities by Region.xls'
df_heating = pd.read_excel(f'../data/Energy data/2Energy Supplyƒ‹‘¥π©”¶/{file_name}', skiprows=3)
df_heating.rename(columns=dict(zip(['Unnamed: 0', 'Unnamed: 1'], list(df_heating[['Unnamed: 0', 'Unnamed: 1']].iloc[0]))), inplace=True)
                
df_steam = df_heating[['地   区', 'Region', '蒸汽供应能力(吨/小时)', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11']]
df_steam = df_steam.drop(index=[0])
df_steam['地   区'] = df_steam['地   区'].fillna('地区')
df_steam['Region'] = df_steam['Region'].fillna('Region')
df_steam.columns = list(df_steam.iloc[0])
df_steam = df_steam.drop(index=[1])

df_water = df_heating[['地   区', 'Region', '热水供应能力(兆瓦)', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21']]
df_water = df_water.drop(index=[0])
df_water['地   区'] = df_water['地   区'].fillna('地区')
df_water['Region'] = df_water['Region'].fillna('Region')
df_water.columns = list(df_water.iloc[0])
df_water = df_water.drop(index=[1])

indicator_df = {'Capacity of Steam Supply': df_steam,
               'Capacity of Hot Water Supply': df_water}
indicator_dic = {'Capacity of Steam Supply': '蒸汽供应能力',
               'Capacity of Hot Water Supply': '热水供应能力'}
indicator_unit_en = {'Capacity of Steam Supply': 'ton/hour',
                  'Capacity of Hot Water Supply': '10e+6 W'}
indicator_unit_cn = {'Capacity of Steam Supply': '吨/小时',
                  'Capacity of Hot Water Supply': '兆瓦'}


df_heating_supply_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_heating_supply_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for indicator in indicator_df.keys():
    df_heating = indicator_df[indicator]
    ### English
    df_heating_en = df_heating[['Region', 2000, 2005.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0, 2017.0]]
    df_heating_en['Region Type'] = ['Country']+['Province']*int(len(df_heating_en)-1)
    df_heating_en = df_heating_en[['Region', 'Region Type', 2000, 2005.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0, 2017.0]]
    
    #Unpivot fuel type columns
    df_heating_en = df_heating_en.melt(list(df_heating_en.columns)[:2], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_heating_en['Group'] = 'Energy'
    df_heating_en['Sub-group'] = 'Energy Supply'
    df_heating_en['Source'] = file_name
    df_heating_en['Dataset'] = 'Basic Statistics on Heating Supply in Cities by Region'
    df_heating_en['Indicator'] = indicator
    df_heating_en['Category 1'] = ''
    df_heating_en['Category 2'] = ''
    df_heating_en['Category 3'] = ''
    df_heating_en['Category 4'] = ''
    df_heating_en['Unit'] = indicator_unit_en[indicator]
    df_heating_en = df_heating_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_heating_supply_en = pd.concat([df_heating_supply_en, df_heating_en])
     
    ### Chinese
    df_heating_cn = df_heating[['地区', 2000, 2005.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0, 2017.0]]
    df_heating_cn.rename(columns={'地区': 'Region'}, inplace=True)
    df_heating_cn['Region Type'] = ['国家']+['省份']*int(len(df_heating_cn)-1)
    df_heating_cn = df_heating_cn[['Region', 'Region Type', 2000, 2005.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0, 2017.0]]
    
    #Unpivot fuel type columns
    df_heating_cn = df_heating_cn.melt(list(df_heating_cn.columns)[:2], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_heating_cn['Group'] = '能源'
    df_heating_cn['Sub-group'] = '能源供应'
    df_heating_cn['Source'] = file_name
    df_heating_cn['Dataset'] = '分地区城市集中供热情况'
    df_heating_cn['Indicator'] = indicator_dic[indicator]
    df_heating_cn['Category 1'] = ''
    df_heating_cn['Category 2'] = ''
    df_heating_cn['Category 3'] = ''
    df_heating_cn['Category 4'] = ''
    df_heating_cn['Unit'] = indicator_unit_cn[indicator]
    df_heating_cn = df_heating_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
        
    df_heating_supply_cn = pd.concat([df_heating_supply_cn, df_heating_cn])
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [284]:
df_heating_supply_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Supply,2Basic Statistics on Heating Supply in Cities ...,Basic Statistics on Heating Supply in Cities b...,Capacity of Steam Supply,,,,,China,Country,2000,74148,ton/hour


In [286]:
df_heating_supply_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源供应,2Basic Statistics on Heating Supply in Cities ...,分地区城市集中供热情况,蒸汽供应能力,,,,,中国,国家,2000,74148,吨/小时


#### 1.3.2 Basic Statistics on Supply in Cities by Fuel Type and Region
**Rearange tables**

In [307]:
category_1_dic = {'Coal Gas': '煤气',
                  'LPG': '液化石油气',
                  'Natural Gas': '天然气'}

category_1_unit_en = {'Coal Gas': {'Total Gas Supply': '10e+4 cu.m','Population with Access': '10e+4 persons'},
                  'LPG': {'Total Gas Supply': 'ton','Population with Access': '10e+4 persons'},
                  'Natural Gas': {'Total Gas Supply': '10e+4 cu.m','Population with Access': '10e+4 persons'}}

category_1_unit_cn = {'Coal Gas': {'Total Gas Supply': '万立方米','Population with Access': '万人'},
                  'LPG': {'Total Gas Supply': '吨','Population with Access': '万人'},
                  'Natural Gas': {'Total Gas Supply': '万立方米','Population with Access': '万人'}}
    
df_fuel_supply_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_fuel_supply_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
    
for category_1 in category_1_dic.keys():
    file_name = f'2Basic Statistics on Supply of {category_1} in Cities by Region.xls'
    
    df_supply = pd.read_excel(f'../data/Energy data/2Energy Supplyƒ‹‘¥π©”¶/{file_name}', skiprows=3)
    df_supply.rename(columns=dict(zip(['Unnamed: 0', 'Unnamed: 1'], list(df_supply[['Unnamed: 0', 'Unnamed: 1']].iloc[0]))), inplace=True)
                    
    df_total = df_supply[['地  区', 'Region', '供气总量('+category_1_unit_cn[category_1]['Total Gas Supply']+')', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11']]
    df_total = df_total.drop(index=[0])
    df_total['地  区'] = df_total['地  区'].fillna('地区')
    df_total['Region'] = df_total['Region'].fillna('Region')
    df_total.columns = list(df_total.iloc[0])
    df_total = df_total.drop(index=[1])
    
    df_population = df_supply[['地  区', 'Region', '用气人口(万人)', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21']]
    df_population = df_population.drop(index=[0])
    df_population['地  区'] = df_population['地  区'].fillna('地区')
    df_population['Region'] = df_population['Region'].fillna('Region')
    df_population.columns = list(df_population.iloc[0])
    df_population = df_population.drop(index=[1])
    
    indicator_df = {'Total Gas Supply': df_total,
                   'Population with Access': df_population}
    indicator_dic = {'Total Gas Supply': '供气总量',
                   'Population with Access': '用气人口'}
    indicator_unit_en = category_1_unit_en[category_1]
    indicator_unit_cn = category_1_unit_cn[category_1]
    
    
    df_supply_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
    df_supply_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
    
    for indicator in indicator_df.keys():
        df = indicator_df[indicator]
        ### English
        df_en = df[['Region', 2000, 2005.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0, 2017.0]]
        df_en['Region Type'] = ['Country']+['Province']*int(len(df_en)-1)
        df_en = df_en[['Region', 'Region Type', 2000, 2005.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0, 2017.0]]
        
        #Unpivot fuel type columns
        df_en = df_en.melt(list(df_en.columns)[:2], var_name='Year', value_name='Value')
        
        #Add additional columns
        df_en['Group'] = 'Energy'
        df_en['Sub-group'] = 'Energy Supply'
        df_en['Source'] = file_name
        df_en['Dataset'] = 'Basic Statistics on Heating Supply in Cities by Region'
        df_en['Indicator'] = indicator
        df_en['Category 1'] = category_1
        df_en['Category 2'] = ''
        df_en['Category 3'] = ''
        df_en['Category 4'] = ''
        df_en['Unit'] = indicator_unit_en[indicator]
        df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
        
        df_supply_en = pd.concat([df_supply_en, df_en])
         
        ### Chinese
        df_cn = df[['地区', 2000, 2005.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0, 2017.0]]
        df_cn.rename(columns={'地区': 'Region'}, inplace=True)
        df_cn['Region Type'] = ['国家']+['省份']*int(len(df_cn)-1)
        df_cn = df_cn[['Region', 'Region Type', 2000, 2005.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0, 2017.0]]
        
        #Unpivot fuel type columns
        df_cn = df_cn.melt(list(df_cn.columns)[:2], var_name='Year', value_name='Value')
        
        #Add additional columns
        df_cn['Group'] = '能源'
        df_cn['Sub-group'] = '能源供应'
        df_cn['Source'] = file_name
        df_cn['Dataset'] = '分地区城市集中供热情况'
        df_cn['Indicator'] = indicator_dic[indicator]
        df_cn['Category 1'] = category_1_dic[category_1]
        df_cn['Category 2'] = ''
        df_cn['Category 3'] = ''
        df_cn['Category 4'] = ''
        df_cn['Unit'] = indicator_unit_cn[indicator]
        df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
            
        df_supply_cn = pd.concat([df_supply_cn, df_cn])
        
    df_fuel_supply_en = pd.concat([df_fuel_supply_en, df_supply_en])
    df_fuel_supply_cn = pd.concat([df_fuel_supply_cn, df_supply_cn])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [314]:
df_fuel_supply_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Supply,2Basic Statistics on Supply of Coal Gas in Cit...,Basic Statistics on Heating Supply in Cities b...,Total Gas Supply,Coal Gas,,,,China,Country,2000,1523615,10e+4 cu.m


In [315]:
df_fuel_supply_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源供应,2Basic Statistics on Supply of Coal Gas in Cit...,分地区城市集中供热情况,供气总量,煤气,,,,中国,国家,2000,1523615,万立方米


#### 1.3.3 Production by Fuel Type and Region  
**Rearange tables**

In [340]:
fuel_types = {"Coke": '2Coke Production by Region.xls',
              "Raw Coal": '2Raw Coal Production by Region.xls',
              "Crude Oil": '2Crude Oil Production by Region.xls',
              "Diesel Oil": '2Diesel Oil Production by Region.xls',
              "Fuel Oil": '2Fuel Oil Production by Region.xls',
              "Gasoline": '2Gasoline Production by Region.xls',
              "Kerosene": '2Kerosene Production by Region.xls',
              "Natural Gas": '2Natural Gas Production by Region.xls'}

fuel_types_dict = {"Coke": '焦炭',
                   "Raw Coal": '原煤',
                   "Crude Oil": '原油',
                   "Diesel Oil": '柴油',
                   "Fuel Oil": '燃料油',
                   "Gasoline": '汽油',
                   "Kerosene": '煤油',
                   "Natural Gas": '天然气'}

fuel_types_unit_en = {"Coke": '10e+4 tons',
                      "Raw Coal": '10e+4 tons',
                      "Crude Oil": '10e+4 tons',
                      "Diesel Oil": '10e+4 tons',
                      "Fuel Oil": '10e+4 tons',
                      "Gasoline": '10e+4 tons',
                      "Kerosene": '10e+4 tons',
                      "Natural Gas": '10e+8 cu.m'}

fuel_types_unit_cn = {"Coke": '万吨',
                      "Raw Coal": '万吨',
                      "Crude Oil": '万吨',
                      "Diesel Oil": '万吨',
                      "Fuel Oil": '万吨',
                      "Gasoline": '万吨',
                      "Kerosene": '万吨',
                      "Natural Gas": '亿立方米'}

In [342]:
df_fuel_production_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_fuel_production_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for category_1_en in fuel_types.keys():
    category_1_cn = fuel_types_dict[category_1_en]
    print(category_1_en)
    
    file_name = f'2{category_1_en} Production by Region.xls'
    
    df = pd.read_excel(f'../data/Energy data/2Energy Supplyƒ‹‘¥π©”¶/{file_name}', skiprows=4)
    
    ### English
    df_en = df.drop(columns='地  区')
    
    #Unpivot fuel type columns
    df_en = df_en.melt(list(df_en.columns)[0], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'Energy Supply'
    df_en['Source'] = file_name
    df_en['Dataset'] = 'Production by Fuel Type and Region'
    df_en['Indicator'] = 'Production'
    df_en['Category 1'] = category_1_en
    df_en['Category 2'] = ''
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Unit'] = fuel_types_unit_en[category_1_en]
    df_en['Region Type'] = 'Province'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
            
    df_fuel_production_en = pd.concat([df_fuel_production_en, df_en])
    
    ### Chinese
    df_cn = df.drop(columns='Region')
    
    #Unpivot fuel type columns
    df_cn = df_cn.melt(list(df_cn.columns)[0], var_name='Year', value_name='Value')
    df_cn.rename(columns={'地  区': 'Region'}, inplace=True)
    
    #Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '能源供应'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = '按燃料类型和地区生产量'
    df_cn['Indicator'] = '生产量'
    df_cn['Category 1'] = category_1_cn
    df_cn['Category 2'] = ''
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Unit'] = fuel_types_unit_cn[category_1_en]
    df_cn['Region Type'] = '省份'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
       
    df_fuel_production_cn = pd.concat([df_fuel_production_cn, df_cn])

Coke
Raw Coal
Crude Oil
Diesel Oil
Fuel Oil
Gasoline
Kerosene
Natural Gas


In [347]:
df_fuel_production_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Supply,2Coke Production by Region.xls,Production by Fuel Type and Region,Production,Coke,,,,Beijing,Province,1995,400.8664,10e+4 tons


In [349]:
df_fuel_production_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Supply,2Coke Production by Region.xls,Production by Fuel Type and Region,Production,Coke,,,,Beijing,Province,1995,400.8664,10e+4 tons


#### 1.3.4 Energy Production Per Capita 
**Rearange tables**

In [418]:
file_name = f'2Energy Production Per Capita.xls'

df = pd.read_excel(f'../data/Energy data/2Energy Supplyƒ‹‘¥π©”¶/{file_name}', skiprows=5)

## English
df_capita_en = df.drop(index=[0])
df_capita_en['年  份'] = df_capita_en['年  份'].fillna('')
df_capita_en.columns = [x+ ' ' + y for x, y in zip(list(df_capita_en.iloc[0]), list(df_capita_en.iloc[1]))]
df_capita_en.rename(columns={'Year ': 'Year'}, inplace=True)
df_capita_en.drop(index=[1,2], inplace=True)
df_capita_en.reset_index(inplace=True)
df_capita_en.drop(columns='index', inplace=True)

#Unpivot fuel type columns
df_capita_en = df_capita_en.melt(list(df_capita_en.columns)[0], var_name='Category 1', value_name='Value')

#Separate Unit from category
df_capita_en['Unit'] = df_capita_en['Category 1'].apply(lambda x: x.split('(')[1][:-1])
df_capita_en['Category 1'] = df_capita_en['Category 1'].apply(lambda x: x.split('(')[0][:-1])

#Add additional columns
df_capita_en['Group'] = 'Energy'
df_capita_en['Sub-group'] = 'Energy Supply'
df_capita_en['Source'] = file_name
df_capita_en['Dataset'] = 'Energy Production Per Capita'
df_capita_en['Indicator'] = 'Per-Capita Energy Production'
df_capita_en['Category 2'] = ''
df_capita_en['Category 3'] = ''
df_capita_en['Category 4'] = ''
df_capita_en['Region'] = 'China'
df_capita_en['Region Type'] = 'Country'
df_capita_en = df_capita_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
       
        
## Chinese
df_capita_cn = df.drop(index=[1, 2])
df_capita_cn['年  份'] = df_capita_cn['年  份'].fillna('')
df_capita_cn.columns = [x+ ' ' + y for x, y in zip(list(df_capita_cn.columns), list(df_capita_cn.iloc[0]))]
df_capita_cn.rename(columns={'年  份 ': 'Year'}, inplace=True)
df_capita_cn.drop(index=[0], inplace=True)
df_capita_cn.reset_index(inplace=True)
df_capita_cn.drop(columns='index', inplace=True)

#Unpivot fuel type columns
df_capita_cn = df_capita_cn.melt(list(df_capita_cn.columns)[0], var_name='Category 1', value_name='Value')

#Separate Unit from category
df_capita_cn['Unit'] = df_capita_cn['Category 1'].apply(lambda x: x.split('(')[1][:-1])
df_capita_cn['Category 1'] = df_capita_cn['Category 1'].apply(lambda x: x.split('(')[0][:-1])

#Add additional columns
df_capita_cn['Group'] = '能源'
df_capita_cn['Sub-group'] = '能源供应'
df_capita_cn['Source'] = file_name
df_capita_cn['Dataset'] = '人均能源生产量'
df_capita_cn['Indicator'] = '人均能源生产量'
df_capita_cn['Category 2'] = ''
df_capita_cn['Category 3'] = ''
df_capita_cn['Category 4'] = ''
df_capita_cn['Region'] = '中国'
df_capita_cn['Region Type'] = '国家'
df_capita_cn = df_capita_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]


In [419]:
df_capita_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Supply,2Energy Production Per Capita.xls,Energy Production Per Capita,Per-Capita Energy Production,Total Energy,,,,China,Country,1980,649.539,kgce


In [420]:
df_capita_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源供应,2Energy Production Per Capita.xls,人均能源生产量,人均能源生产量,能源总量,,,,中国,国家,1980,649.539,千克标准煤


#### 1.3.5 Primary Energy Production and Composition 
**Rearange tables**

In [400]:
file_name = '2Primary Energy Production and Composition.xls'
df_composition = pd.read_excel(f'../data/Energy data/2Energy Supplyƒ‹‘¥π©”¶/{file_name}', skiprows=3)

df_calorific_value_tot = df_composition[['Unnamed: 0', '电热当量计算法  calorific value calculation']]
df_calorific_value_per = df_composition[['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7']]

df_coal_equivalent_tot = df_composition[['Unnamed: 0', '发电煤耗计算法  coal equivalent calculation']]
df_coal_equivalent_per = df_composition[['Unnamed: 0', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14']]


category_1_df = {'calorific value calculation': {'Total': df_calorific_value_tot, 'Percentage': df_calorific_value_per},
                 'coal equivalent calculation': {'Total': df_calorific_value_tot, 'Percentage': df_calorific_value_per}}

category_1_dic = {'calorific value calculation': '电热当量计算法',
                 'coal equivalent calculation': '发电煤耗计算法'}

category_2_dic = {'Primary Energy Production (104 tce)': '一次能源生产量 (万吨标准煤)',
                 'As percentage of primary energy production (%)': '占能源生产总量的比重 (%)'}

### Total Energy Consumption
df_total_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_total_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for category_1 in category_1_df.keys():
    df_tot = category_1_df[category_1]['Total']
    
    ## English
    df_tot_en = df_tot.drop(index=[0, 1, 4, 5, 6, 7])
    df_tot_en['Unnamed: 0'] = df_tot_en['Unnamed: 0'].fillna('')
    df_tot_en.columns = [x+ ' ' + y for x, y in zip(list(df_tot_en.iloc[0]), list(df_tot_en.iloc[1]))]
    df_tot_en.rename(columns={' ': 'Year'}, inplace=True)
    df_tot_en.drop(index=[2,3], inplace=True)
    df_tot_en.reset_index(inplace=True)
    df_tot_en.drop(columns='index', inplace=True)
    
    #Unpivot fuel type columns
    df_tot_en = df_tot_en.melt(list(df_tot_en.columns)[0], var_name='Category 2', value_name='Value')
    
    #Separate Unit from category
    df_tot_en['Unit'] = df_tot_en['Category 2'].apply(lambda x: x.split('(')[1][:-1])
    df_tot_en['Category 2'] = df_tot_en['Category 2'].apply(lambda x: x.split('(')[0][:-1])
    
    #Change unit format
    df_tot_en['Unit'] = df_tot_en['Unit'].apply(lambda x: change_units(x))
    
    #Add additional columns
    df_tot_en['Group'] = 'Energy'
    df_tot_en['Sub-group'] = 'Energy Supply'
    df_tot_en['Source'] = file_name
    df_tot_en['Dataset'] = 'Primary Energy Production and Composition'
    df_tot_en['Indicator'] = 'Energy Production'
    df_tot_en['Category 1'] = category_1
    df_tot_en['Category 3'] = ''
    df_tot_en['Category 4'] = ''
    df_tot_en['Region'] = 'China'
    df_tot_en['Region Type'] = 'Country'
    df_tot_en = df_tot_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_total_en = pd.concat([df_total_en, df_tot_en])
    
    ## Chinese
    df_tot_cn = df_tot.drop(index=[2, 3, 4, 5, 6, 7])
    df_tot_cn['Unnamed: 0'] = df_tot_cn['Unnamed: 0'].fillna('')
    df_tot_cn.columns = [x+ ' ' + y for x, y in zip(list(df_tot_cn.iloc[0]), list(df_tot_cn.iloc[1]))]
    df_tot_cn.rename(columns={' 年  份': 'Year'}, inplace=True)
    df_tot_cn.drop(index=[0,1], inplace=True)
    df_tot_cn.reset_index(inplace=True)
    df_tot_cn.drop(columns='index', inplace=True)
    
    #Unpivot fuel type columns
    df_tot_cn = df_tot_cn.melt(list(df_tot_cn.columns)[0], var_name='Category 2', value_name='Value')
    
    #Separate Unit from category
    df_tot_cn['Unit'] = df_tot_cn['Category 2'].apply(lambda x: x.split('(')[1][:-1])
    df_tot_cn['Category 2'] = df_tot_cn['Category 2'].apply(lambda x: x.split('(')[0][:-1])
    
    #Add additional columns
    df_tot_cn['Group'] = '能源'
    df_tot_cn['Sub-group'] = '能源供应'
    df_tot_cn['Source'] = file_name
    df_tot_cn['Dataset'] = '一次能源生产量和构成'
    df_tot_cn['Indicator'] = '能源生产'
    df_tot_cn['Category 1'] = category_1_dic[category_1]
    df_tot_cn['Category 3'] = ''
    df_tot_cn['Category 4'] = ''
    df_tot_cn['Region'] = '中国'
    df_tot_cn['Region Type'] = '国家'
    df_tot_cn = df_tot_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_total_cn = pd.concat([df_total_cn, df_tot_cn])
    
### Percentage of Primary Energy Consumption
df_percentage_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_percentage_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for category_1 in category_1_df.keys():
    df_per = category_1_df[category_1]['Percentage']

    ## English
    df_per_en = df_per.drop(index=[0, 1, 2, 3, 4, 6])
    df_per_en.fillna('', inplace=True)
    df_per_en.columns = [x+ ' ' + y for x, y in zip(list(df_per_en.iloc[0]), list(df_per_en.iloc[1]))]
    df_per_en.rename(columns={' ': 'Year'}, inplace=True)
    df_per_en.drop(index=[5,7], inplace=True)
    df_per_en.reset_index(inplace=True)
    df_per_en.drop(columns='index', inplace=True)
    
    #Unpivot fuel type columns
    df_per_en = df_per_en.melt(list(df_per_en.columns)[0], var_name='Category 3', value_name='Value')
    
    #Rearange categories
    category_4_en = {'Raw Coal ': '', 'Crude Oil ': '', 'Natural Gas ': '',
           'Primary Electricty and Other Energy ': 'All', ' Hydro Power': 'Hydro Power',
           ' Nuclear Power': 'Nuclear Power'}
    
    category_3_en = {'Raw Coal ': 'Raw Coal', 'Crude Oil ': 'Crude Oil', 'Natural Gas ': 'Natural Gas',
           'Primary Electricty and Other Energy ': 'Primary Electricty and Other Energy', ' Hydro Power': 'Primary Electricty and Other Energy',
           ' Nuclear Power': 'Primary Electricty and Other Energy'}
    
    df_per_en['Category 4'] = df_per_en['Category 3'].apply(lambda x: category_4_en[x])
    df_per_en['Category 3'] = df_per_en['Category 3'].apply(lambda x: category_3_en[x])
    
    #Add additional columns
    df_per_en['Group'] = 'Energy'
    df_per_en['Sub-group'] = 'Energy Supply'
    df_per_en['Source'] = file_name
    df_per_en['Dataset'] = 'Primary Energy Production and Composition'
    df_per_en['Indicator'] = 'Energy Production'
    df_per_en['Category 1'] = category_1
    df_per_en['Category 2'] = 'percentage of primary energy consumption'
    df_per_en['Region'] = 'China'
    df_per_en['Region Type'] = 'Country'
    df_per_en['Unit'] = '%'
    df_per_en = df_per_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
        
    df_percentage_en = pd.concat([df_percentage_en, df_per_en])
    
    ## Chinese
    df_per_cn = df_per.drop(index=[0, 1, 2, 3, 5, 7])
    df_per_cn.fillna('', inplace=True)
    df_per_cn.columns = [x+ ' ' + y for x, y in zip(list(df_per_cn.iloc[0]), list(df_per_cn.iloc[1]))]
    df_per_cn.rename(columns={' Year': 'Year'}, inplace=True)
    df_per_cn.drop(index=[4,6], inplace=True)
    df_per_cn.reset_index(inplace=True)
    df_per_cn.drop(columns='index', inplace=True)
    
    #Unpivot fuel type columns
    df_per_cn = df_per_cn.melt(list(df_per_cn.columns)[0], var_name='Category 3', value_name='Value')
    
    #Rearange categories
    category_4_cn = {'原  煤 ': '', '原  油 ': '', '天然气 ': '', '一次电力及 其他能源 ':'所有', ' 水电': '水电', ' 核电': '核电'}
    category_3_cn = {'原  煤 ': '原煤', '原  油 ': '原 油', '天然气 ': '天然气', '一次电力及 其他能源 ':'一次电力及 其他能源', ' 水电': '一次电力及 其他能源', ' 核电': '一次电力及 其他能源'}
    
    df_per_cn['Category 4'] = df_per_cn['Category 3'].apply(lambda x: category_4_cn[x])
    df_per_cn['Category 3'] = df_per_cn['Category 3'].apply(lambda x: category_3_cn[x])
    
    #Add additional columns
    df_per_cn['Group'] = '能源'
    df_per_cn['Sub-group'] = '能源供应'
    df_per_cn['Source'] = file_name
    df_per_cn['Dataset'] = '一次能源生产量和构成'
    df_per_cn['Indicator'] = '能源生产'
    df_per_cn['Category 1'] = category_1_dic[category_1]
    df_per_cn['Category 2'] = '占能源消费总量的比重'
    df_per_cn['Region'] = '中国'
    df_per_cn['Region Type'] = '国家'
    df_per_cn['Unit'] = '%'
    df_per_cn = df_per_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_percentage_cn = pd.concat([df_percentage_cn, df_per_cn])
    
df_composition_en = pd.concat([df_total_en, df_percentage_en])
df_composition_cn = pd.concat([df_total_cn, df_percentage_cn])

In [403]:
df_composition_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Supply,2Primary Energy Production and Composition.xls,Primary Energy Production and Composition,Energy Production,calorific value calculation,Primary Energy Production,,,China,Country,1980,62046,10e+4 tce


In [404]:
df_composition_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源供应,2Primary Energy Production and Composition.xls,一次能源生产量和构成,能源生产,电热当量计算法,一次能源生产量,,,中国,国家,1980,62046,万吨标准煤


#### 1.3.6 Growth Rate of Energy Production Compared With Growth Rate of GDP 
**Rearange tables**

In [None]:
file_name = '2Primary Energy Production and Composition.xls'
df_composition = pd.read_excel(f'../data/Energy data/2Energy Supplyƒ‹‘¥π©”¶/{file_name}', skiprows=3)


In [413]:
file_name = '2Growth Rate of Energy Production Compared With Growth Rate of GDP.xls'
df_growth = pd.read_excel(f'../data/Energy data/2Energy Supplyƒ‹‘¥π©”¶/{file_name}', skiprows=3)

## English
df_growth_en = df_growth.drop(index=[0])
df_growth_en['年 份'] = df_growth_en['年 份'].fillna('')
df_growth_en.columns = [x+ ' ' + y for x, y in zip(list(df_growth_en.iloc[0]), list(df_growth_en.iloc[1]))]
df_growth_en.rename(columns={'Year ': 'Year'}, inplace=True)
df_growth_en.drop(index=[1,2], inplace=True)
df_growth_en.reset_index(inplace=True)
df_growth_en.drop(columns='index', inplace=True)

#Unpivot fuel type columns
df_growth_en = df_growth_en.melt(list(df_growth_en.columns)[0], var_name='Category 1', value_name='Value')

#Separate Unit from category
df_growth_en['Unit'] = df_growth_en['Category 1'].apply(lambda x: x.split('(')[1][:-1])
df_growth_en['Category 1'] = df_growth_en['Category 1'].apply(lambda x: x.split('(')[0][:-1])

#Separate Indicator from Category 1
df_growth_en['Indicator'] = df_growth_en['Category 1'].apply(lambda x: x.split('of')[0][:-1])
df_growth_en['Category 1'] = df_growth_en['Category 1'].apply(lambda x: x.split('of')[1][1:])

#Add additional columns
df_growth_en['Group'] = 'Energy'
df_growth_en['Sub-group'] = 'Energy Supply'
df_growth_en['Source'] = file_name
df_growth_en['Dataset'] = 'Growth Rate of Energy Production Compared With Growth Rate of GDP'
df_growth_en['Category 2'] = ''
df_growth_en['Category 3'] = ''
df_growth_en['Category 4'] = ''
df_growth_en['Region'] = 'China'
df_growth_en['Region Type'] = 'Country'
df_growth_en = df_growth_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

## Chinese
df_growth_cn = df_growth.drop(index=[1,2])
df_growth_cn['年 份'] = df_growth_cn['年 份'].fillna('')
df_growth_cn.columns = [x+ ' ' + y for x, y in zip(list(df_growth_cn.columns), list(df_growth_cn.iloc[0]))]
df_growth_cn.rename(columns={'年 份 ': 'Year'}, inplace=True)
df_growth_cn.drop(index=[0], inplace=True)
df_growth_cn.reset_index(inplace=True)
df_growth_cn.drop(columns='index', inplace=True)

#Unpivot fuel type columns
df_growth_cn = df_growth_cn.melt(list(df_growth_cn.columns)[0], var_name='Category 1', value_name='Value')

#Separate Unit from category
df_growth_cn['Unit'] = df_growth_cn['Category 1'].apply(lambda x: x.split('(')[1][:-1])
df_growth_cn['Category 1'] = df_growth_cn['Category 1'].apply(lambda x: x.split('(')[0][:-1])

#Separate Indicator from Category 1
df_growth_cn['Indicator'] = df_growth_cn['Category 1'].apply(lambda x: split_indicator(x))
df_growth_cn['Category 1'] = df_growth_cn['Category 1'].apply(lambda x: split_category(x))

#Add additional columns
df_growth_cn['Group'] = '能源'
df_growth_cn['Sub-group'] = '能源供应'
df_growth_cn['Source'] = file_name
df_growth_cn['Dataset'] = '能源生产与国内生产总值增长速度'
df_growth_cn['Category 2'] = ''
df_growth_cn['Category 3'] = ''
df_growth_cn['Category 4'] = ''
df_growth_cn['Region'] = '中国'
df_growth_cn['Region Type'] = '国家'
df_growth_cn = df_growth_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [416]:
df_growth_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Supply,2Growth Rate of Energy Production Compared Wit...,Growth Rate of Energy Production Compared With...,Growth Rate,GDP,,,,China,Country,1980,7.8,%


In [417]:
df_growth_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源供应,2Growth Rate of Energy Production Compared Wit...,能源生产与国内生产总值增长速度,增长速度,国内生产总值,,,,中国,国家,1980,7.8,%


#### **Concatenate all datasets**

In [421]:
df_supply_en = pd.concat([df_heating_supply_en, df_fuel_supply_en, df_fuel_production_en, df_capita_en, df_composition_en, df_growth_en])
df_supply_en.reset_index(inplace=True)
df_supply_en.drop(columns='index', inplace=True)

df_supply_cn = pd.concat([df_heating_supply_cn, df_fuel_supply_cn, df_fuel_production_cn, df_capita_cn, df_composition_cn, df_growth_cn])
df_supply_cn.reset_index(inplace=True)
df_supply_cn.drop(columns='index', inplace=True)

**Save tables**

In [422]:
df_supply_en.to_csv('../data/Clean_data/Energy/2_Energy_Supply/Energy_Supply_en.csv')
df_supply_cn.to_csv('../data/Clean_data/Energy/2_Energy_Supply/Energy_Supply_cn.csv')

### 1.4 Energy Efficiency
**Datasets:**
- Efficiency of Energy Transformation
- Energy Intensity by GDP
- Energy efficiency

#### 1.4.1 Efficiency of Energy Transformation
**Rearange tables**

In [681]:
file_name = '3Efficiency of Energy Transformation.xls'
df_trasp = pd.read_excel(f'../data/Energy data/3Energy Efficiencyƒ‹‘¥–ß¬ /{file_name}', skiprows=4)

## English
df_trasp_en = df_trasp.copy()
df_trasp_en.columns = list(df_trasp_en.iloc[0])
df_trasp_en = df_trasp_en.drop(index=[0])
df_trasp_en.reset_index(inplace=True)
df_trasp_en.drop(columns='index', inplace=True)

#Unpivot categories from columns
df_trasp_en = df_trasp_en.melt(list(df_trasp_en.columns)[0], var_name='Category 1', value_name='Value')

#Add additional columns
df_trasp_en['Group'] = 'Energy'
df_trasp_en['Sub-group'] = 'Energy Efficiency'
df_trasp_en['Source'] = file_name
df_trasp_en['Dataset'] = 'Efficiency of Energy Transformation'
df_trasp_en['Indicator'] = 'Energy Transformation'
df_trasp_en['Category 2'] = ''
df_trasp_en['Category 3'] = ''
df_trasp_en['Category 4'] = ''
df_trasp_en['Region'] = 'China'
df_trasp_en['Region Type'] = 'Country'
df_trasp_en['Unit'] = '%'
df_trasp_en = df_trasp_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

## Chinese
df_trasp_cn = df_trasp.drop(index=[0])
df_trasp_cn.rename(columns = {'年 份': 'Year'}, inplace=True)

#Unpivot categories from columns
df_trasp_cn = df_trasp_cn.melt(list(df_trasp_cn.columns)[0], var_name='Category 1', value_name='Value')

#Add additional columns
df_trasp_cn['Group'] = '能源'
df_trasp_cn['Sub-group'] = '能源效率'
df_trasp_cn['Source'] = file_name
df_trasp_cn['Dataset'] = '能源加工转换效率'
df_trasp_cn['Indicator'] = '能源转型'
df_trasp_cn['Category 2'] = ''
df_trasp_cn['Category 3'] = ''
df_trasp_cn['Category 4'] = ''
df_trasp_cn['Region'] = '中国'
df_trasp_cn['Region Type'] = '国家'
df_trasp_cn['Unit'] = '%'
df_trasp_cn = df_trasp_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [682]:
df_trasp_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Efficiency,3Efficiency of Energy Transformation.xls,Efficiency of Energy Transformation,Energy Transformation,Total Efficiency,,,,China,Country,1980,69.54,%


In [683]:
df_trasp_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源效率,3Efficiency of Energy Transformation.xls,能源加工转换效率,能源转型,总效率,,,,中国,国家,1980,69.54,%


#### 1.4.2 Energy Intensity by GDP
**Rearange tables**

In [684]:
file_name = '3Energy Intensity by GDP.xls'
df_intensity = pd.read_excel(f'../data/Energy data/3Energy Efficiencyƒ‹‘¥–ß¬ /{file_name}', skiprows=3)

category_1_en = {'GDP is calculated at 1980 constant prices': [2,12],
                'GDP is calculated at 1990 constant prices': [15,25],
                'GDP is calculated at 2000 constant prices': [28,33],
                'GDP is calculated at 2010 constant prices': [36,41],
                'GDP is calculated at 2015 constant prices': [44,49],
                'GDP is calculated at 2015 constant prices': [52,54]}

category_1_dic = {'GDP is calculated at 1980 constant prices': '国内生产总值按1980年可比价格计算',
                'GDP is calculated at 1990 constant prices': '国内生产总值按1990年可比价格计算',
                'GDP is calculated at 2000 constant prices': '国内生产总值按2000年可比价格计算',
                'GDP is calculated at 2010 constant prices': '国内生产总值按2005年可比价格计算',
                'GDP is calculated at 2015 constant prices': '国内生产总值按2010年可比价格计算',
                'GDP is calculated at 2015 constant prices': '国内生产总值按2015年可比价格计算'}

## English
df_all_en = df_intensity.drop(index=[0])
df_all_en['年  份'] = df_all_en['年  份'].fillna('')
df_all_en.columns = [x+ ' ' + y for x, y in zip(list(df_all_en.iloc[0]), list(df_all_en.iloc[1]))]
df_all_en.rename(columns={'Year ': 'Year'}, inplace=True)
df_all_en.drop(index=[1,2], inplace=True)
df_all_en.reset_index(inplace=True)
df_all_en.drop(columns='index', inplace=True)

## Chinese
df_all_cn = df_intensity.drop(index=[1,2])
df_all_cn['年  份'] = df_all_cn['年  份'].fillna('')
df_all_cn.columns = [x+ ' ' + y for x, y in zip(list(df_all_cn.columns), list(df_all_cn.iloc[0]))]
df_all_cn.rename(columns={'年  份 ': 'Year'}, inplace=True)
df_all_cn.drop(index=[0], inplace=True)
df_all_cn.reset_index(inplace=True)
df_all_cn.drop(columns='index', inplace=True)

df_intensity_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_intensity_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for category_1 in category_1_en.keys():
    index = category_1_en[category_1]
    ## English
    df_en = df_all_en.iloc[index[0]:index[1]+1]
    
    #Unpivot fuel type columns
    df_en = df_en.melt(list(df_en.columns)[0], var_name='Category 2', value_name='Value')
    
    #Separate Unit from category
    df_en['Unit'] = df_en['Category 2'].apply(lambda x: x.split('(')[1][:-1])
    df_en['Category 2'] = df_en['Category 2'].apply(lambda x: x.split('(')[0][:-1])

    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'Energy Efficiency'
    df_en['Source'] = file_name
    df_en['Dataset'] = 'Energy Intensity by GDP'
    df_en['Indicator'] = 'Energy Intensity by GDP'
    df_en['Category 1'] = category_1
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Region'] = 'China'
    df_en['Region Type'] = 'Country'
    df_en['Unit'] = '%'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

    ## Chinese
    df_cn = df_all_cn.iloc[index[0]:index[1]+1]
    
    #Unpivot fuel type columns
    df_cn = df_cn.melt(list(df_cn.columns)[0], var_name='Category 2', value_name='Value')
    
    #Separate Unit from category
    df_cn['Unit'] = df_cn['Category 2'].apply(lambda x: x.split('(')[1][:-1])
    df_cn['Category 2'] = df_cn['Category 2'].apply(lambda x: x.split('(')[0][:-1])
    
    #Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '能源效率'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = '平均每万元国内生产总值能源消费量'
    df_cn['Indicator'] = '平均每万元国内生产总值能源消费量'
    df_cn['Category 1'] = category_1_dic[category_1]
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Region'] = '中国'
    df_cn['Region Type'] = '国家'
    df_cn['Unit'] = '%'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_intensity_en = pd.concat([df_intensity_en, df_en])
    df_intensity_cn = pd.concat([df_intensity_cn, df_cn])
    

In [685]:
df_intensity_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Efficiency,3Energy Intensity by GDP.xls,Energy Intensity by GDP,Energy Intensity by GDP,GDP is calculated at 1980 constant prices,Total Energy,,,China,Country,1980,13.14,%


In [686]:
df_intensity_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源效率,3Energy Intensity by GDP.xls,平均每万元国内生产总值能源消费量,平均每万元国内生产总值能源消费量,国内生产总值按1980年可比价格计算,总能量,,,中国,国家,1980,13.14,%


#### 1.4.3 Energy efficiency
**Rearange tables**

In [687]:
file_name = 'Energy efficiency°™°™Wang Qingyi.xlsx'

## English
df_efficiency_en = pd.read_excel(f'../data/Energy data/3Energy Efficiencyƒ‹‘¥–ß¬ /{file_name}', sheet_name='English-V1')
df_efficiency_en.rename(columns = {'Unnamed: 0': 'Dataset', 'Variable': 'Indicator'}, inplace=True)

## Chinese
df_efficiency_cn = pd.read_excel(f'../data/Energy data/3Energy Efficiencyƒ‹‘¥–ß¬ /{file_name}', sheet_name='Chinese-V1')
#df_efficiency_cn.rename(columns = {'类别': 'Dataset', '变量': 'Indicator', '单位': 'Unit'}, inplace=True)

In [688]:
pd.merge(df_efficiency_cn, df_efficiency_en, on = [2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018])

Unnamed: 0,类别,变量,单位,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,Dataset,Indicator,Unit
0,能源效率,全社会节能量,Mtce,,,,,,630.0,71.3,135.1,135.0,214.8,240.9,218.1,166.2,142.4,Energy Efficiency,Total energy saving compared to the level of p...,Mtce
1,能源效率,技术节能量占比,%,,,,,,82.0,67.8,51.2,75.9,39.7,41.3,36.7,60.0,74.7,Energy Efficiency,Total energy saving/Technical energy saving,%
2,能源效率,制造业节能量占比,%,,,,,,,37.7,25.2,31.3,14.2,16.0,10.3,18.0,25.1,Energy Efficiency,Total energy saving/Technical energy saving/Ma...,%
3,能源效率,交通运输节能量占比,%,,,,,,,11.3,4.4,13.7,6.8,5.5,4.5,5.9,5.8,Energy Efficiency,Total energy saving/Technical energy saving/Tr...,%
4,能源效率,建筑节能量占比,%,,,,,,,18.8,21.6,30.8,18.6,19.8,21.8,36.1,43.8,Energy Efficiency,Total energy saving/Technical energy saving/Bu...,%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,能源效率,烧碱综合能耗,kgce/ t,1297.0,,1203.0,1124.0,1040.0,1006.0,1060.0,986.0,972.0,949.0,897.0,878.0,875.0,871.0,Energy Efficiency,Full energy consumption for caustic soda,kgce/t
81,能源效率,纯碱综合能耗,kgce/ t,396.0,,398.0,355.0,323.0,385.0,384.0,376.0,337.0,336.0,329.0,336.0,333.0,331.0,Energy Efficiency,Full energy consumption for sodium carbonate,kgce/t
82,能源效率,电石电耗,kWh/ t,3450.0,,3465.0,3440.0,3395.0,3340.0,3450.0,3360.0,3423.0,3272.0,3303.0,3224.0,3279.0,3208.0,Energy Efficiency,Electricity consumption for calcium carbide,kWh/t
83,能源效率,纸和纸板全行业/综合能耗,kgce/ t,528.0,,467.0,440.0,395.0,390.0,380.0,366.0,353.0,340.0,339.0,333.0,326.0,318.0,Energy Efficiency,Whole paper and paperboard industry/Full energ...,kgce/t


#### **Concatenate all datasets**

In [689]:
df_efficiency_en = pd.concat([df_trasp_en, df_intensity_en])
df_efficiency_en.reset_index(inplace=True)
df_efficiency_en.drop(columns='index', inplace=True)

df_efficiency_cn = pd.concat([df_trasp_cn, df_intensity_cn])
df_efficiency_cn.reset_index(inplace=True)
df_efficiency_cn.drop(columns='index', inplace=True)

**Save tables**

In [692]:
df_efficiency_en.to_csv('../data/Clean_data/Energy/3_Energy_Efficiency/Energy_Efficiency_en.csv')
df_efficiency_cn.to_csv('../data/Clean_data/Energy/3_Energy_Efficiency/Energy_Efficiency_cn.csv')

### 1.5 Energy Trade
**Datasets:**
- Imports and Exports of Major Energy Products

#### 1.5.1 Imports and Exports of Major Energy Products
**Rearange tables**

In [1206]:
file_name = '4Imports and Exports of Major Energy Products.xls'
df_trade = pd.read_excel(f'../data/Energy data/4Energy Tradeƒ‹‘¥√≥“◊/{file_name}', skiprows=3)

indicator_en = {'Import volume': [1,11],
                'Export volume': [13,23]}

indicator_dic = {'Import volume': '进口量',
                'Export volume': '出口量'}

## English
df_all_en = df_trade.drop(columns = '指  标')

## Chinese
df_all_cn = df_trade.drop(columns = 'Item')
df_all_cn.rename(columns={'指  标': 'Item'}, inplace=True)


df_trade_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_trade_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for indicator in indicator_en.keys():
    index = indicator_en[indicator]
    ## English
    df_en = df_all_en.iloc[index[0]:index[1]+1]
    
    #Separate Unit from category
    df_en['Category 1'] = df_en['Item'].apply(lambda x: x.split('(')[0][:-1])
    df_en['Unit'] = df_en['Item'].apply(lambda x: x.split('(')[1][:-1])
    df_en.drop(columns=['Item'], inplace=True)
    
    #Unpivot years
    df_en = df_en.melt(list(df_en.columns)[-2:], var_name='Year', value_name='Value')
    
    #Change unit format
    df_en['Unit'] = df_en['Unit'].apply(lambda x: change_units(x))
    
    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'Energy Trade'
    df_en['Source'] = file_name
    df_en['Dataset'] = 'Imports and Exports of Major Energy Products'
    df_en['Indicator'] = indicator
    df_en['Category 2'] = ''
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Region'] = 'China'
    df_en['Region Type'] = 'Country'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

    df_trade_en = pd.concat([df_trade_en, df_en])
    
    ## Chinese
    df_cn = df_all_cn.iloc[index[0]:index[1]+1]
    
    #Separate Unit from category
    df_cn['Category 1'] = df_cn['Item'].apply(lambda x: x.split('(')[0][:-1])
    df_cn['Unit'] = df_cn['Item'].apply(lambda x: x.split('(')[1][:-1])
    df_cn.drop(columns=['Item'], inplace=True)
    
    #Unpivot years
    df_cn = df_cn.melt(list(df_cn.columns)[-2:], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '能源贸易'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = '主要能源品种进、出口量'
    df_cn['Indicator'] = indicator_dic[indicator]
    df_cn['Category 2'] = ''
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Region'] = '中国'
    df_cn['Region Type'] = '国家'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

    df_trade_cn = pd.concat([df_trade_cn, df_cn])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_en['Category 1'] = df_en['Item'].apply(lambda x: x.split('(')[0][:-1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_en['Unit'] = df_en['Item'].apply(lambda x: x.split('(')[1][:-1])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

In [1209]:
df_trade_en

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Trade,4Imports and Exports of Major Energy Products.xls,Imports and Exports of Major Energy Products,Import volume,Coal,,,,China,Country,2000,218.00,10e+4 tons
1,Energy,Energy Trade,4Imports and Exports of Major Energy Products.xls,Imports and Exports of Major Energy Products,Import volume,Coke,,,,China,Country,2000,,10e+4 tons
2,Energy,Energy Trade,4Imports and Exports of Major Energy Products.xls,Imports and Exports of Major Energy Products,Import volume,Crude Oil,,,,China,Country,2000,7027.00,10e+4 tons
3,Energy,Energy Trade,4Imports and Exports of Major Energy Products.xls,Imports and Exports of Major Energy Products,Import volume,Gasoline,,,,China,Country,2000,,10e+4 tons
4,Energy,Energy Trade,4Imports and Exports of Major Energy Products.xls,Imports and Exports of Major Energy Products,Import volume,Kerosene,,,,China,Country,2000,255.47,10e+4 tons
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,Energy,Energy Trade,4Imports and Exports of Major Energy Products.xls,Imports and Exports of Major Energy Products,Export volume,Fuel Oil,,,,China,Country,2017,1109.00,10e+4 tons
106,Energy,Energy Trade,4Imports and Exports of Major Energy Products.xls,Imports and Exports of Major Energy Products,Export volume,LPG,,,,China,Country,2017,132.00,10e+4 tons
107,Energy,Energy Trade,4Imports and Exports of Major Energy Products.xls,Imports and Exports of Major Energy Products,Export volume,Other Petroleum Products,,,,China,Country,2017,338.00,10e+4 tons
108,Energy,Energy Trade,4Imports and Exports of Major Energy Products.xls,Imports and Exports of Major Energy Products,Export volume,Natural Gas,,,,China,Country,2017,35.00,10e+8 cu.m


#### **Concatenate all datasets**

**Save tables**

In [1248]:
df_trade_en.to_csv('../data/Clean_data/Energy/4_Energy_Trade/Energy_Trade_en.csv')
df_trade_cn.to_csv('../data/Clean_data/Energy/4_Energy_Trade/Energy_Trade_cn.csv')

### 1.6 Energy Investment
**Datasets:**
- Investment in Energy Industry
    - Investment in Energy Industry
    - Investment in Energy Industry by Proportions
    
- Investment In Fixed Assets of State-Owned Units in Energy Industry 
    - Investment In Fixed Assets of State-Owned Units in Energy Industry 
    - Proportions of Investment in Fixed Assets of State-Owned Units in Energy Industry
    
- Energy Investment by Region
    - Investment in Coal Mining and Processing by Region
    - Investment in Electricity, Steam, Hot Water Production and Supply by Region
    - Investment in Energy Industry by Region
    - Investment in Gas Production and Supply by Region   
    - Investment in Petroleum and Natural Gas Extraction by Region    
    - Investment in Petroleum Processing and Coking by Region 
    
- Investment in Fixed Assets of State-Owned Units by Region
    - Investment in Fixed Assets of State-Owned Units in Coal Mining and Processing by Region
    - Investment in Fixed Assets of State-Owned Units in Electricity, Steam, Hot Water Production and Supply by Region
    - Investment in Fixed Assets of State-Owned Units in Energy Industry by Region  
    - Investment in Fixed Assets of State-Owned Units in Gas Production and Supply by Region 
    - Investment in Fixed Assets of State-Owned Units in Petroleum and Natural Gas Extraction by Region
    - Investment in Fixed Assets of State-Owned Units in Petroleum Processing and Coking by Region
  
#### 1.6.1 Investment in Energy Industry
**Rearange tables**

In [720]:
indicators = {"Investment": '5Investment in Energy Industry.xls',
         "Investment ratio": '5Investment in Energy Industry by Proportions.xls'}

indicators_dict = {"Investment": '投资',
              "Investment ratio": '投资比例'}

unit_en = {"Investment": '100 million yuan',
           "Investment ratio": '%'}

unit_cn = {"Investment": '亿元',
           "Investment ratio": '%'}

dataset_en = {"Investment": 'Investment in Energy Industry',
           "Investment ratio": 'Investment Ratio in Energy Industry'}

dataset_cn = {"Investment": '能源工业分行业投资',
           "Investment ratio": '能源工业分行业投资构成'}

In [721]:
df_invesment_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_invesment_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for indicator in indicators.keys():
    print(indicator)
    
    file_name = indicators[indicator]
    
    df = pd.read_excel(f'../data/Energy data/5Energy Investmentƒ‹‘¥Õ∂◊ /{file_name}', skiprows=4)
    
    ### English
    df_en = df.drop(columns='项目')
    df_en.rename(columns={'Item': 'Category 1'}, inplace=True)
    
    #Unpivot fuel type columns
    df_en = df_en.melt(list(df_en.columns)[0], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'Energy Investment'
    df_en['Source'] = file_name
    df_en['Dataset'] = dataset_en[indicator]
    df_en['Indicator'] = indicator
    df_en['Category 2'] = ''
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Unit'] = unit_en[indicator]
    df_en['Region'] = 'China'
    df_en['Region Type'] = 'Country'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
            
    df_invesment_en = pd.concat([df_invesment_en, df_en])
        
    ### Chinese
    df_cn = df.drop(columns='Item')
    df_cn.rename(columns={'项目': 'Category 1'}, inplace=True)
    
    #Unpivot fuel type columns
    df_cn = df_cn.melt(list(df_cn.columns)[0], var_name='Year', value_name='Value')
       
    #Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '能源投资'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = dataset_cn[indicator]
    df_cn['Indicator'] = indicators_dict[indicator]
    df_cn['Category 2'] = ''
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Unit'] = unit_cn[indicator]
    df_cn['Region'] = '中国'
    df_cn['Region Type'] = '国家'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
       
    df_invesment_cn = pd.concat([df_invesment_cn, df_cn])

Investment
Investment ratio


In [722]:
df_invesment_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Investment,5Investment in Energy Industry.xls,Investment in Energy Industry,Investment,Energy Industry,,,,China,Country,1995,2369.16,100 million yuan


In [723]:
df_invesment_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源投资,5Investment in Energy Industry.xls,能源工业分行业投资,投资,能源工业,,,,中国,国家,1995,2369.16,亿元


#### 1.6.2 Investment In Fixed Assets of State-Owned Units in Energy Industry
**Rearange tables**

In [724]:
indicators = {"Investment": '5Investment In Fixed Assets of State-Owned Units in Energy Industry.xls',
         "Investment ratio": '5Proportions of Investment in Fixed Assets of State-Owned Units in Energy Industry.xls'}

indicators_dict = {"Investment": '投资',
              "Investment ratio": '投资比例'}

unit_en = {"Investment": '100 million yuan',
           "Investment ratio": '%'}

unit_cn = {"Investment": '亿元',
           "Investment ratio": '%'}

dataset_en = {"Investment": 'Investment in Fixed Assets of State-Owned Units in Energy Industry',
           "Investment ratio": 'Investment Ratio in Fixed Assets of State-Owned Units in Energy Industry'}

dataset_cn = {"Investment": '国有经济能源工业分行业固定资产投资',
           "Investment ratio": '国有经济能源工业分行业固定资产投资构成'}

In [725]:
df_invesment_fixed_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_invesment_fixed_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for indicator in indicators.keys():
    print(indicator)
    
    file_name = indicators[indicator]
    
    df = pd.read_excel(f'../data/Energy data/5Energy Investmentƒ‹‘¥Õ∂◊ /{file_name}', skiprows=4)
    
    ### English
    df_en = df.drop(columns='项目')
    df_en.rename(columns={'Item': 'Category 1'}, inplace=True)
    
    #Unpivot fuel type columns
    df_en = df_en.melt(list(df_en.columns)[0], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'Energy Investment'
    df_en['Source'] = file_name
    df_en['Dataset'] = dataset_en[indicator]
    df_en['Indicator'] = indicator
    df_en['Category 2'] = ''
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Unit'] = unit_en[indicator]
    df_en['Region'] = 'China'
    df_en['Region Type'] = 'Country'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
            
    df_invesment_fixed_en = pd.concat([df_invesment_fixed_en, df_en])
        
    ### Chinese
    df_cn = df.drop(columns='Item')
    df_cn.rename(columns={'项目': 'Category 1'}, inplace=True)
    
    #Unpivot fuel type columns
    df_cn = df_cn.melt(list(df_cn.columns)[0], var_name='Year', value_name='Value')
       
    #Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '能源投资'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = dataset_cn[indicator]
    df_cn['Indicator'] = indicators_dict[indicator]
    df_cn['Category 2'] = ''
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Unit'] = unit_cn[indicator]
    df_cn['Region'] = '中国'
    df_cn['Region Type'] = '国家'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
       
    df_invesment_fixed_cn = pd.concat([df_invesment_fixed_cn, df_cn])

Investment
Investment ratio


In [726]:
df_invesment_fixed_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Investment,5Investment In Fixed Assets of State-Owned Uni...,Investment in Fixed Assets of State-Owned Unit...,Investment,Energy Industry,,,,China,Country,1995,2025.28,100 million yuan


In [727]:
df_invesment_fixed_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源投资,5Investment In Fixed Assets of State-Owned Uni...,国有经济能源工业分行业固定资产投资,投资,能源工业,,,,中国,国家,1995,2025.28,亿元


#### 1.6.3 Energy Investment by Region
**Rearange tables**

In [728]:
types = {"Coal Mining and Processing": '5Investment in Coal Mining and Processing by Region.xls',
         "Electricity, Steam, Hot Water Production and Supply": '5Investment in Electricity, Steam, Hot Water Production and Supply by Region.xls',
         "Energy Industry": '5Investment in Energy Industry by Region.xls',
         "Gas Production and Supply": '5Investment in Gas Production and Supply by Region.xls',
         "Petroleum and Natural Gas Extraction": '5Investment in Petroleum and Natural Gas Extraction by Region.xls',
         "Petroleum Processing and Coking": '5Investment in Petroleum Processing and Coking by Region.xls'}

types_dict = {"Coal Mining and Processing": '煤炭采选',
              "Electricity, Steam, Hot Water Production and Supply": '电力、蒸汽、热水生产和供应',
              "Energy Industry": '能源工',
              "Gas Production and Supply": '煤气生产和供应',
              "Petroleum and Natural Gas Extraction": '石油和天然气开采',
              "Petroleum Processing and Coking": '石油加工及炼焦'}

unit_en = '100 million yuan'
unit_cn = '亿元'

In [729]:
df_invesment_region_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_invesment_region_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for category_1_en in types.keys():
    category_1_cn = types_dict[category_1_en]
    print(category_1_en)
    
    file_name = f'5Investment in {category_1_en} by Region.xls'
    
    df = pd.read_excel(f'../data/Energy data/5Energy Investmentƒ‹‘¥Õ∂◊ /{file_name}', skiprows=4)
    
    ### English
    df_en = df.drop(columns='地  区')
    
    #Unpivot fuel type columns
    df_en = df_en.melt(list(df_en.columns)[0], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'Energy Investment'
    df_en['Source'] = file_name
    df_en['Dataset'] = 'Energy Investment by Region'
    df_en['Indicator'] = 'Investment'
    df_en['Category 1'] = category_1_en
    df_en['Category 2'] = ''
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Unit'] = unit_en
    df_en['Region Type'] = 'Province'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
            
    df_invesment_region_en = pd.concat([df_invesment_region_en, df_en])
    
    ### Chinese
    df_cn = df.drop(columns='Region')
    
    #Unpivot fuel type columns
    df_cn = df_cn.melt(list(df_cn.columns)[0], var_name='Year', value_name='Value')
    df_cn.rename(columns={'地  区': 'Region'}, inplace=True)
    
    #Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '能源投资'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = '分地区能源投资'
    df_cn['Indicator'] = '投资'
    df_cn['Category 1'] = category_1_cn
    df_cn['Category 2'] = ''
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Unit'] = unit_cn
    df_cn['Region Type'] = '省份'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
       
    df_invesment_region_cn = pd.concat([df_invesment_region_cn, df_cn])

Coal Mining and Processing
Electricity, Steam, Hot Water Production and Supply
Energy Industry
Gas Production and Supply
Petroleum and Natural Gas Extraction
Petroleum Processing and Coking


In [730]:
df_invesment_region_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Investment,5Investment in Coal Mining and Processing by R...,Energy Investment by Region,Investment,Coal Mining and Processing,,,,Beijing,Province,1995,0.6186,100 million yuan


In [731]:
df_invesment_region_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源投资,5Investment in Coal Mining and Processing by R...,分地区能源投资,投资,煤炭采选,,,,北 京,省份,1995,0.6186,亿元


#### 1.6.4 Investment in Fixed Assets of State-Owned Units by Region
**Rearange tables**

In [732]:
types = {"Coal Mining and Processing": '5Investment in Fixed Assets of State-Owned Units in Coal Mining and Processing by Region.xls',
         "Electricity, Steam, Hot Water Production and Supply": '5Investment in Fixed Assets of State-Owned Units in Electricity, Steam, Hot Water Production and Supply by Region.xls',
         "Energy Industry": '5Investment in Fixed Assets of State-Owned Units in Energy Industry by Region.xls',
         "Gas Production and Supply": '5Investment in Fixed Assets of State-Owned Units in Gas Production and Supply by Region.xls',
         "Petroleum and Natural Gas Extraction": '5Investment in Fixed Assets of State-Owned Units in Petroleum and Natural Gas Extraction by Region.xls',
         "Petroleum Processing and Coking": '5Investment in Fixed Assets of State-Owned Units in Petroleum Processing and Coking by Region.xls'}

types_dict = {"Coal Mining and Processing": '煤炭采选',
              "Electricity, Steam, Hot Water Production and Supply": '电力、蒸汽、热水生产和供应',
              "Energy Industry": '能源工',
              "Gas Production and Supply": '煤气生产和供应',
              "Petroleum and Natural Gas Extraction": '石油和天然气开采',
              "Petroleum Processing and Coking": '石油加工及炼焦'}

unit_en = '100 million yuan'
unit_cn = '亿元'

In [733]:
df_invesment_fixed_region_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_invesment_fixed_region_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for category_1_en in types.keys():
    category_1_cn = types_dict[category_1_en]
    print(category_1_en)
    
    file_name = f'5Investment in Fixed Assets of State-Owned Units in {category_1_en} by Region.xls'
    
    df = pd.read_excel(f'../data/Energy data/5Energy Investmentƒ‹‘¥Õ∂◊ /{file_name}', skiprows=4)
    
    ### English
    df_en = df.drop(columns='地  区')
    
    #Unpivot fuel type columns
    df_en = df_en.melt(list(df_en.columns)[0], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'Energy Investment'
    df_en['Source'] = file_name
    df_en['Dataset'] = 'Investment in Fixed Assets of State-Owned Units by Region'
    df_en['Indicator'] = 'Investment'
    df_en['Category 1'] = category_1_en
    df_en['Category 2'] = ''
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Unit'] = unit_en
    df_en['Region Type'] = 'Province'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
            
    df_invesment_fixed_region_en = pd.concat([df_invesment_fixed_region_en, df_en])
    
    ### Chinese
    df_cn = df.drop(columns='Region')
    
    #Unpivot fuel type columns
    df_cn = df_cn.melt(list(df_cn.columns)[0], var_name='Year', value_name='Value')
    df_cn.rename(columns={'地  区': 'Region'}, inplace=True)
    
    #Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '能源投资'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = '各地区国有单位固定资产投资'
    df_cn['Indicator'] = '投资'
    df_cn['Category 1'] = category_1_cn
    df_cn['Category 2'] = ''
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Unit'] = unit_cn
    df_cn['Region Type'] = '省份'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
       
    df_invesment_fixed_region_cn = pd.concat([df_invesment_fixed_region_cn, df_cn])

Coal Mining and Processing
Electricity, Steam, Hot Water Production and Supply
Energy Industry
Gas Production and Supply
Petroleum and Natural Gas Extraction
Petroleum Processing and Coking


In [734]:
df_invesment_fixed_region_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Investment,5Investment in Fixed Assets of State-Owned Uni...,Investment in Fixed Assets of State-Owned Unit...,Investment,Coal Mining and Processing,,,,Beijing,Province,1995,0.62,100 million yuan


In [735]:
df_invesment_fixed_region_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源投资,5Investment in Fixed Assets of State-Owned Uni...,各地区国有单位固定资产投资,投资,煤炭采选,,,,北 京,省份,1995,0.62,亿元


#### **Concatenate all datasets**

In [736]:
df_investment_en = pd.concat([df_invesment_en, df_invesment_fixed_en, df_invesment_region_en, df_invesment_fixed_region_en])
df_investment_en.reset_index(inplace=True)
df_investment_en.drop(columns='index', inplace=True)

df_investment_cn = pd.concat([df_invesment_cn, df_invesment_fixed_cn, df_invesment_region_cn, df_invesment_fixed_region_cn])
df_investment_cn.reset_index(inplace=True)
df_investment_cn.drop(columns='index', inplace=True)

**Save tables**

In [737]:
df_investment_en.to_csv('../data/Clean_data/Energy/5_Energy_Investment/Energy_Investment_en.csv')
df_investment_cn.to_csv('../data/Clean_data/Energy/5_Energy_Investment/Energy_Investment_cn.csv')

### 1.7 Electricity
**Datasets:**
- Power Generation 
    - Power Generation by Region
    - Thermal Power Generation by Region
    - Hydro Power Generation by Region
    - Thermal Power Generation by Region
  
#### 1.7.1 Power Generation
**Rearange tables**

In [145]:
types = {"All": '6Power Generation by Region.xls',
         "Thermal Power Generation": '6Thermal Power Generation by Region.xls',
         "Hydro Power Generation": '6Hydro Power Generation by Region.xls'}

types_dict = {"All": '全部',
              "Thermal Power Generation": '火力发电量',
              "Hydro Power Generation": '水力发电量'}

datasets_en = {"All": 'Power Generation by Region',
              "Thermal Power Generation": 'Thermal Power Generation by Region',
              "Hydro Power Generation": 'Hydro Power Generation by Region'}

datasets_cn = {"All": '分地区发电量',
              "Thermal Power Generation": '分地区火力发电量',
              "Hydro Power Generation": '分地区水力发电量'}

unit_en = '10e+8 kW•h'
unit_cn = '亿千瓦小时'

In [5]:
df_power_generation_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_power_generation_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for category_1_en in types.keys():
    print(category_1_en)
    
    file_name = types[category_1_en]
    
    df = pd.read_excel(f'../data/Energy data/6ElectricityµÁ¡¶/{file_name}', skiprows=4)
    
    ### English
    df_en = df.drop(columns='地  区')
    
    #Unpivot fuel type columns
    df_en = df_en.melt(list(df_en.columns)[0], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'Electricity'
    df_en['Source'] = file_name
    df_en['Dataset'] = datasets_en[category_1_en]
    df_en['Indicator'] = 'Power Generation'
    df_en['Category 1'] = category_1_en
    df_en['Category 2'] = ''
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Unit'] = unit_en
    df_en['Region Type'] = 'Province'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
            
    df_power_generation_en = pd.concat([df_power_generation_en, df_en])
    
    ### Chinese
    df_cn = df.drop(columns='Region')
    
    #Unpivot fuel type columns
    df_cn = df_cn.melt(list(df_cn.columns)[0], var_name='Year', value_name='Value')
    df_cn.rename(columns={'地  区': 'Region'}, inplace=True)
    
    #Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '电'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = datasets_cn[category_1_en]
    df_cn['Indicator'] = '发电量'
    df_cn['Category 1'] = types_dict[category_1_en]
    df_cn['Category 2'] = ''
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Unit'] = unit_cn
    df_cn['Region Type'] = '省份'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
       
    df_power_generation_cn = pd.concat([df_power_generation_cn, df_cn])

All
Thermal Power Generation
Hydro Power Generation


In [60]:
types_columns = {"Nuclear Power Generation": ['核能发电量', 'Unnamed: 3', 'Unnamed: 4'],
                 "Wind Power Generation": ['风力发电量', 'Unnamed: 6', 'Unnamed: 7'],
                 "Solar Power Generation": ['太阳能发电量', 'Unnamed: 9', 'Unnamed: 10']}

types_dict = {"Nuclear Power Generation": '核能发电量',
              "Wind Power Generation": '风力发电量',
              "Solar Power Generation": '太阳能发电量'}

In [61]:
file_name = '6Nuclear,Wind,Solar Power Generation by Region.xls'

df = pd.read_excel(f'../data/Energy data/6ElectricityµÁ¡¶/{file_name}', skiprows=4)

df_other_generation_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_other_generation_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for category_1_en in types_columns.keys():
    print(category_1_en)
    ### English
    df_en = df.drop(columns='地  区').copy()
    df_en = df_en[['Region']+types_columns[category_1_en]]
    df_en.rename(columns=dict(zip(types_columns[category_1_en], ['', '', ''])), inplace=True)
    df_en['Region'] = df_en['Region'].fillna('')
    df_en.columns = [x+ '' + y for x, y in zip(list(df_en.columns), list(df_en.iloc[1].astype(str)))]
    df_en.drop(index=[0,1], inplace=True)
    df_en.reset_index(inplace=True)
    df_en.drop(columns='index', inplace=True)
        
    #Unpivot fuel type columns
    df_en = df_en.melt(list(df_en.columns)[0], var_name='Year', value_name='Value')
    df_en['Year'] = df_en['Year'].astype(float).astype(int)
    
    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'Electricity'
    df_en['Source'] = file_name
    df_en['Dataset'] = 'Nuclear,Wind,Solar Power Generation by Region'
    df_en['Indicator'] = 'Power Generation'
    df_en['Category 1'] = category_1_en
    df_en['Category 2'] = ''
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Unit'] = unit_en
    df_en['Region Type'] = 'Province'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
        
    df_other_generation_en = pd.concat([df_other_generation_en, df_en])
    
    ### Chinese
    df_cn = df.drop(columns='Region').copy()
    df_cn = df_cn[['地  区']+types_columns[category_1_en]]
    df_cn.rename(columns=dict(zip(types_columns[category_1_en], ['', '', ''])), inplace=True)
    df_cn.rename(columns={'地  区': 'Region'}, inplace=True)
    df_cn['Region'] = df_cn['Region'].fillna('')
    df_cn.columns = [x+ '' + y for x, y in zip(list(df_cn.columns), list(df_cn.iloc[1].astype(str)))]
    df_cn.drop(index=[0,1], inplace=True)
    df_cn.reset_index(inplace=True)
    df_cn.drop(columns='index', inplace=True)
    
    #Unpivot fuel type columns
    df_cn = df_cn.melt(list(df_cn.columns)[0], var_name='Year', value_name='Value')
    df_cn['Year'] = df_cn['Year'].astype(float).astype(int)
    
    #Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '电'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = '分地区核能、风力、太阳能发电量'
    df_cn['Indicator'] = '发电量'
    df_cn['Category 1'] = types_dict[category_1_en]
    df_cn['Category 2'] = ''
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Unit'] = unit_cn
    df_cn['Region Type'] = '省份'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_other_generation_cn = pd.concat([df_other_generation_cn, df_cn])

Nuclear Power Generation
Wind Power Generation
Solar Power Generation


#### **Concatenate all datasets**

In [65]:
df_power_generation_en = pd.concat([df_power_generation_en, df_other_generation_en])
df_power_generation_cn = pd.concat([df_power_generation_cn, df_other_generation_cn])

In [68]:
df_power_generation_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Electricity,6Power Generation by Region.xls,Power Generation by Region,Power Generation,All,,,,Beijing,Province,1995,132.211,10e+8 kW•h


In [69]:
df_power_generation_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,电,6Power Generation by Region.xls,分地区发电量,发电量,全部,,,,北 京,省份,1995,132.211,亿千瓦小时


**Save tables**

In [70]:
df_power_generation_en.to_csv('../data/Clean_data/Energy/6_Electricity/Electricity_en.csv')
df_power_generation_cn.to_csv('../data/Clean_data/Energy/6_Electricity/Electricity_cn.csv')

### 1.8 International Comparison
**Datasets:**
- Alternating Current Power Consumption of Electrolytic Aluminium by country  
    - Alternating Current Power Consumption of Electrolytic Aluminium by country 
- Coal Production by Country  
    - Coal Production by Country
- Comparable Energy Consumption of Steel by country  
    - Comparable Energy Consumption of Steel by country
- Total energy consumption by commodity and country
    - Full Energy Consumption of Paper and Paperboard by country
    - Full Energy Consumption of Cement by country
    - Full Energy Consumption of Ethylene by country
    - Full Energy Consumption of Sythetic Ammonia by country
- Coal Consumption Rate for Fossil-Fired Power Plant by country
    - Gross Coal Consumption Rate for Fossil-Fired Power Plant by country
    - Net Coal Consumption Rate for Fossil-fired Power Plant by country
- Net Import by fuel type and country
    - Net Import of Coal by country
    - Net Import of Gas by country
    - Net Import of Oil by country
- Power Generation by Source and country
    - Power Generation by Source and country
- Primary Supply by fuel type and country
    - Primary Supply of Coal by country
    - Primary Supply of Gas by country
    - Primary Supply of Oil by country
- Production by fuel type and country
    - Production of Crude Oil, NGL and Additives by country
    - Production of Natural Gas by country
- Ratios by fuel type and country
    - Ratio of Electricity Consumption to GDP by country
    - Ratio of Electricity Consumption to Population by country
    - Ratio of TPES to GDP by country
    - Ratio of TPES to Population by country
    - Ration of Energy Production to TPES -self-sufficiency by country
- Total Electricity Generation by country
    - Total Electricity Generation by country
- Total Final Consumption of Energy by country
    - Total Final Consumption of Energy by country
- Total Production of Energy by country
    - Total Production of Energy by country
    
#### 1.8.1 Alternating Current Power Consumption of Electrolytic Aluminium by country 
**Rearange tables**

In [133]:
file_name = '9Alternating Current Power Consumption for Electrolytic Aluminium by country.xls'

df = pd.read_excel(f'../data/Energy data/9Internantional Comparisonπ˙º ±»Ωœ/{file_name}', skiprows=4)

### English
df_alt_en = df.drop(columns='国家').copy()
df_alt_en.rename(columns={'Country': 'Region'}, inplace=True)
   
#Unpivot fuel type columns
df_alt_en = df_alt_en.melt(list(df_alt_en.columns)[0], var_name='Year', value_name='Value')

#Add additional columns
df_alt_en['Group'] = 'Energy'
df_alt_en['Sub-group'] = 'International Comparison'
df_alt_en['Source'] = file_name
df_alt_en['Dataset'] = 'Alternating Current Power Consumption for Electrolytic Aluminium'
df_alt_en['Indicator'] = 'Power Consumption'
df_alt_en['Category 1'] = ''
df_alt_en['Category 2'] = ''
df_alt_en['Category 3'] = ''
df_alt_en['Category 4'] = ''
df_alt_en['Unit'] = 'kW·h/t'
df_alt_en['Region Type'] = 'Country'
df_alt_en = df_alt_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

### Chinese
df_alt_cn = df.drop(columns='Country').copy()
df_alt_cn.rename(columns={'国家': 'Region'}, inplace=True)

#Unpivot fuel type columns
df_alt_cn = df_alt_cn.melt(list(df_alt_cn.columns)[0], var_name='Year', value_name='Value')

#Add additional columns
df_alt_cn['Group'] = '能源'
df_alt_cn['Sub-group'] = '国际比较'
df_alt_cn['Source'] = file_name
df_alt_cn['Dataset'] = '电解铝交流电耗'
df_alt_cn['Indicator'] = '能量消耗'
df_alt_cn['Category 1'] = ''
df_alt_cn['Category 2'] = ''
df_alt_cn['Category 3'] = ''
df_alt_cn['Category 4'] = ''
df_alt_cn['Unit'] = '千瓦时/吨'
df_alt_cn['Region Type'] = '国家'
df_alt_cn = df_alt_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [134]:
df_alt_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,International Comparison,9Alternating Current Power Consumption for Ele...,Alternating Current Power Consumption for Elec...,Power Consumption,,,,,China,Country,1990,17100.0,kW·h/t


In [135]:
df_alt_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,国际比较,9Alternating Current Power Consumption for Ele...,电解铝交流电耗,能量消耗,,,,,中国,国家,1990,17100.0,千瓦时/吨


#### 1.8.2 Coal Production by Country
**Rearange tables**

In [239]:
file_name = '9Coal Production by Country.xls'

df = pd.read_excel(f'../data/Energy data/9Internantional Comparisonπ˙º ±»Ωœ/{file_name}', skiprows=4)

### English
df_coal_en = df.drop(columns=['国家和地区', '比重']).copy()
df_coal_en.rename(columns={'Contury or Area': 'Region', 'Percent of World': 'Value'}, inplace=True)

#Split value/percentage
df_coal_value_en = df_coal_en.drop(columns=['Value']).copy()
df_coal_per_en = df_coal_en[['Region', 'Value']].copy()

#Unpivot values
df_coal_value_en = df_coal_value_en.melt(list(df_coal_value_en.columns)[0], var_name='Year', value_name='Value')

#Concatenate value/percentage
df_coal_value_en['Indicator'] = 'Coal Production'
df_coal_value_en['Unit'] = 'Mtoe'
df_coal_per_en['Indicator'] = 'Coal Production Proportion'
df_coal_per_en['Year'] = df_coal_value_en['Year'].unique().max()
df_coal_per_en['Unit'] = '%'
df_coal_en = pd.concat([df_coal_value_en, df_coal_per_en])

#Add additional columns
df_coal_en['Group'] = 'Energy'
df_coal_en['Sub-group'] = 'International Comparison'
df_coal_en['Source'] = file_name
df_coal_en['Dataset'] = 'Coal Production'
df_coal_en['Indicator'] = 'Coal Production'
df_coal_en['Category 1'] = ''
df_coal_en['Category 2'] = ''
df_coal_en['Category 3'] = ''
df_coal_en['Category 4'] = ''
df_coal_en['Region Type'] = 'Countries and regions'
df_coal_en = df_coal_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]


### Chinese
df_coal_cn = df.drop(columns=['Contury or Area', 'Percent of World']).copy()
df_coal_cn.rename(columns={'国家和地区': 'Region', '比重': 'Value'}, inplace=True)

#Split value/percentage
df_coal_value_cn = df_coal_cn.drop(columns=['Value']).copy()
df_coal_per_cn = df_coal_cn[['Region', 'Value']].copy()

#Unpivot values
df_coal_value_cn = df_coal_value_cn.melt(list(df_coal_value_cn.columns)[0], var_name='Year', value_name='Value')

#Concatenate value/percentage
df_coal_value_cn['Indicator'] = '煤炭生产'
df_coal_value_cn['Unit'] = '百万吨标准油'
df_coal_per_cn['Indicator'] = '煤炭生产比重'
df_coal_per_cn['Year'] = df_coal_value_cn['Year'].unique().max()
df_coal_per_cn['Unit'] = '%'
df_coal_cn = pd.concat([df_coal_value_cn, df_coal_per_cn])

#Add additional columns
df_coal_cn['Group'] = '能源'
df_coal_cn['Sub-group'] = '国际比较'
df_coal_cn['Source'] = file_name
df_coal_cn['Dataset'] = '煤生产量'
df_coal_cn['Indicator'] = '煤炭产量'
df_coal_cn['Category 1'] = ''
df_coal_cn['Category 2'] = ''
df_coal_cn['Category 3'] = ''
df_coal_cn['Category 4'] = ''
df_coal_cn['Region Type'] = '国家和地区'
df_coal_cn = df_coal_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [241]:
df_coal_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,International Comparison,9Coal Production by Country.xls,Coal Production,Coal Production,,,,,World,Countries and regions,1973,1474.0,Mtoe


In [242]:
df_coal_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,国际比较,9Coal Production by Country.xls,煤生产量,煤炭产量,,,,,世界总计,国家和地区,1973,1474.0,百万吨标准油


#### 1.8.3 Comparable Energy Consumption of Steel by country 
**Rearange tables**

In [128]:
file_name = '9Comparable Energy Consumption for Steel by country.xls'

df = pd.read_excel(f'../data/Energy data/9Internantional Comparisonπ˙º ±»Ωœ/{file_name}', skiprows=4)

### English
df_steel_en = df.drop(columns='国家').copy()
df_steel_en.rename(columns={'Country': 'Region'}, inplace=True)
   
#Unpivot fuel type columns
df_steel_en = df_steel_en.melt(list(df_steel_en.columns)[0], var_name='Year', value_name='Value')

#Add additional columns
df_steel_en['Group'] = 'Energy'
df_steel_en['Sub-group'] = 'International Comparison'
df_steel_en['Source'] = file_name
df_steel_en['Dataset'] = 'Comparable Energy Consumption for Steel'
df_steel_en['Indicator'] = 'Energy Consumption'
df_steel_en['Category 1'] = ''
df_steel_en['Category 2'] = ''
df_steel_en['Category 3'] = ''
df_steel_en['Category 4'] = ''
df_steel_en['Unit'] = 'kgce/tn'
df_steel_en['Region Type'] = 'Country'
df_steel_en = df_steel_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

### Chinese
df_steel_cn = df.drop(columns='Country').copy()
df_steel_cn.rename(columns={'国家': 'Region'}, inplace=True)

#Unpivot fuel type columns
df_steel_cn = df_steel_cn.melt(list(df_steel_cn.columns)[0], var_name='Year', value_name='Value')

#Add additional columns
df_steel_cn['Group'] = '能源'
df_steel_cn['Sub-group'] = '国际比较'
df_steel_cn['Source'] = file_name
df_steel_cn['Dataset'] = '钢可比能耗'
df_steel_cn['Indicator'] = '能源消耗'
df_steel_cn['Category 1'] = ''
df_steel_cn['Category 2'] = ''
df_steel_cn['Category 3'] = ''
df_steel_cn['Category 4'] = ''
df_steel_cn['Unit'] = '千克标准煤/吨'
df_steel_cn['Region Type'] = '国家'
df_steel_cn = df_steel_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [130]:
df_steel_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,International Comparison,9Comparable Energy Consumption for Steel by co...,Comparable Energy Consumption for Steel,Energy Consumption,,,,,China,Country,1990,997.0,kgce/tn


In [132]:
df_steel_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,国际比较,9Comparable Energy Consumption for Steel by co...,钢可比能耗,能源消耗,,,,,中国,国家,1990,997.0,千克标准煤/吨


#### 1.8.4 Total energy consumption by commodity and country
**Rearange tables**

In [146]:
types = {"Cement": '9Full Energy Consumption for Cement by country.xls',
         "Ethylene": '9Full Energy Consumption for Ethylene by country.xls',
         "Paper and Paperboard": '9Full Energy Consumption for Paper and Paperboard by country.xls',
         "Sythetic Ammonia": '9Full Energy Consumption for Sythetic Ammonia by country.xls'}

types_dict = {"Cement": '水泥',
              "Ethylene": '乙烯',
              "Paper and Paperboard": '纸和纸板',
              "Sythetic Ammonia": '合成氨'}

unit_en = 'kgce/tn'
unit_cn = '千克标准煤/吨'

In [150]:
df_commodity_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_commodity_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for category_1_en in types.keys():
    print(category_1_en)
    
    file_name = types[category_1_en]

    df = pd.read_excel(f'../data/Energy data/9Internantional Comparisonπ˙º ±»Ωœ/{file_name}', skiprows=4)

    ### English
    df_full_en = df.drop(columns='国家').copy()
    df_full_en.rename(columns={'Country': 'Region'}, inplace=True)
       
    #Unpivot fuel type columns
    df_full_en = df_full_en.melt(list(df_full_en.columns)[0], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_full_en['Group'] = 'Energy'
    df_full_en['Sub-group'] = 'International Comparison'
    df_full_en['Source'] = file_name
    df_full_en['Dataset'] = 'Total energy consumption by commodity'
    df_full_en['Indicator'] = 'Energy Consumption'
    df_full_en['Category 1'] = category_1_en
    df_full_en['Category 2'] = ''
    df_full_en['Category 3'] = ''
    df_full_en['Category 4'] = ''
    df_full_en['Unit'] = unit_en
    df_full_en['Region Type'] = 'Country'
    df_full_en = df_full_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_commodity_en = pd.concat([df_commodity_en, df_full_en])
    
    ### Chinese
    df_full_cn = df.drop(columns='Country').copy()
    df_full_cn.rename(columns={'国家': 'Region'}, inplace=True)
    
    #Unpivot fuel type columns
    df_full_cn = df_full_cn.melt(list(df_full_cn.columns)[0], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_full_cn['Group'] = '能源'
    df_full_cn['Sub-group'] = '国际比较'
    df_full_cn['Source'] = file_name
    df_full_cn['Dataset'] = '商品总能源消耗'
    df_full_cn['Indicator'] = '能源消耗'
    df_full_cn['Category 1'] = types_dict[category_1_en]
    df_full_cn['Category 2'] = ''
    df_full_cn['Category 3'] = ''
    df_full_cn['Category 4'] = ''
    df_full_cn['Unit'] = unit_cn
    df_full_cn['Region Type'] = '国家'
    df_full_cn = df_full_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_commodity_cn = pd.concat([df_commodity_cn, df_full_cn])

Cement
Ethylene
Paper and Paperboard
Sythetic Ammonia


In [153]:
df_commodity_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,International Comparison,9Full Energy Consumption for Cement by country...,Total energy consumption by commodity,Energy Consumption,Cement,,,,China,Country,1990,201.0,kgce/tn


In [154]:
df_commodity_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,国际比较,9Full Energy Consumption for Cement by country...,商品总能源消耗,能源消耗,水泥,,,,中国,国家,1990,201.0,千克标准煤/吨


#### 1.8.5 Coal Consumption Rate for Fossil-Fired Power Plant by country
**Rearange tables**

In [155]:
types = {"Gross": '9Gross Coal Consumption Rate for Fossil-Fired Power Plant by country.xls',
         "Net": '9Net Coal Consumption Rate for Fossil-fired Power Plant by country.xls'}

types_dict = {"Gross": '总',
              "Net": '净'}

unit_en = 'gce/kW·h'
unit_cn = '克标准煤/千瓦小时'

In [156]:
df_fossil_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_fossil_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for category_1_en in types.keys():
    print(category_1_en)
    
    file_name = types[category_1_en]

    df = pd.read_excel(f'../data/Energy data/9Internantional Comparisonπ˙º ±»Ωœ/{file_name}', skiprows=4)

    ### English
    df_en = df.drop(columns='国家').copy()
    df_en.rename(columns={'Country': 'Region'}, inplace=True)
       
    #Unpivot fuel type columns
    df_en = df_en.melt(list(df_en.columns)[0], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'International Comparison'
    df_en['Source'] = file_name
    df_en['Dataset'] = 'Total energy consumption by commodity'
    df_en['Indicator'] = 'Energy Consumption'
    df_en['Category 1'] = category_1_en
    df_en['Category 2'] = ''
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Unit'] = unit_en
    df_en['Region Type'] = 'Country'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_fossil_en = pd.concat([df_fossil_en, df_en])
    
    ### Chinese
    df_cn = df.drop(columns='Country').copy()
    df_cn.rename(columns={'国家': 'Region'}, inplace=True)
    
    #Unpivot fuel type columns
    df_cn = df_cn.melt(list(df_cn.columns)[0], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '国际比较'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = '商品总能源消耗'
    df_cn['Indicator'] = '能源消耗'
    df_cn['Category 1'] = types_dict[category_1_en]
    df_cn['Category 2'] = ''
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Unit'] = unit_cn
    df_cn['Region Type'] = '国家'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_fossil_cn = pd.concat([df_fossil_cn, df_cn])

Gross
Net


In [158]:
df_fossil_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,International Comparison,9Gross Coal Consumption Rate for Fossil-Fired ...,Total energy consumption by commodity,Energy Consumption,Gross,,,,China,Country,1990,392.0,gce/kW·h


In [159]:
df_fossil_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,国际比较,9Gross Coal Consumption Rate for Fossil-Fired ...,商品总能源消耗,能源消耗,总,,,,中国,国家,1990,392.0,克标准煤/千瓦小时


#### 1.8.6 Net Import by fuel type and country
**Rearange tables**

In [161]:
types = {"Coal": '9Net Import of Coal by Country.xls',
         "Gas": '9Net Import of Gas by country.xls',
         "Oil": '9Net Import of Oil by country.xls'}

types_dict = {"Coal": '煤炭',
              "Gas": '气体',
              "Oil": '油'}

unit_en = 'Mtoe'
unit_cn = '百万吨标准油'

In [166]:
df_net_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_net_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for category_1_en in types.keys():
    print(category_1_en)
    
    file_name = types[category_1_en]

    df = pd.read_excel(f'../data/Energy data/9Internantional Comparisonπ˙º ±»Ωœ/{file_name}', skiprows=4)

    ### English
    df_en = df.drop(columns='国家和地区').copy()
    df_en.rename(columns={'Contury or Area': 'Region'}, inplace=True)
       
    #Unpivot fuel type columns
    df_en = df_en.melt(list(df_en.columns)[0], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'International Comparison'
    df_en['Source'] = file_name
    df_en['Dataset'] = 'Net Imports by Fuel Type'
    df_en['Indicator'] = 'Net Import'
    df_en['Category 1'] = category_1_en
    df_en['Category 2'] = ''
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Unit'] = unit_en
    df_en['Region Type'] = 'Countries and regions'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_net_en= pd.concat([df_net_en, df_en])
    
    ### Chinese
    df_cn = df.drop(columns='Contury or Area').copy()
    df_cn.rename(columns={'国家和地区': 'Region'}, inplace=True)
    
    #Unpivot fuel type columns
    df_cn = df_cn.melt(list(df_cn.columns)[0], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '国际比较'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = '净进口量燃料种类'
    df_cn['Indicator'] = '净进口'
    df_cn['Category 1'] = types_dict[category_1_en]
    df_cn['Category 2'] = ''
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Unit'] = unit_cn
    df_cn['Region Type'] = '国家和地区'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_net_cn = pd.concat([df_net_cn, df_cn])

Coal
Gas
Oil


In [171]:
df_net_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,International Comparison,9Net Import of Coal by Country.xls,Net Imports by Fuel Type,Net Import,Coal,,,,China,Countries and regions,1973,-2.11,Mtoe


In [170]:
df_net_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,国际比较,9Net Import of Coal by Country.xls,净进口量燃料种类,净进口,煤炭,,,,中国,国家和地区,1973,-2.11,百万吨标准油


#### 1.8.7 Power Generation by Source and country
**Rearange tables**

In [221]:
file_name = '9Power Generation by Source by country.xls'

df = pd.read_excel(f'../data/Energy data/9Internantional Comparisonπ˙º ±»Ωœ/{file_name}', skiprows=4)
    
### English
df_source_en = df.drop(columns='国家').copy()
df_source_en.rename(columns={'Country': 'Region'}, inplace=True)
df_source_en['Region'] = df_source_en['Region'].fillna('')
df_source_en.columns = [x+ '' + y for x, y in zip(list(df_source_en.columns), list(df_source_en.iloc[0]))]
df_source_en.columns = [df_source_en.columns[0]]+[x.split('(')[1].split(')')[0] for x in list(df_source_en.columns[1:])]
df_source_en.drop(index=[0], inplace=True)
df_source_en.reset_index(inplace=True)
df_source_en.drop(columns='index', inplace=True)

#Unpivot fuel type columns
df_source_en = df_source_en.melt(list(df_source_en.columns)[0], var_name='Category 1', value_name='Value')

#Add additional columns
df_source_en['Group'] = 'Energy'
df_source_en['Sub-group'] = 'International Comparison'
df_source_en['Source'] = file_name
df_source_en['Dataset'] = 'Power Generation by Source'
df_source_en['Indicator'] = 'Power Generation'
df_source_en['Category 2'] = ''
df_source_en['Category 3'] = ''
df_source_en['Category 4'] = ''
df_source_en['Unit'] = '%'
df_source_en['Year'] = 2017
df_source_en['Region Type'] = 'Country'
df_source_en = df_source_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
### Chinese
df_source_cn = df.drop(columns='Country').copy()
df_source_cn.rename(columns={'国家': 'Region'}, inplace=True)
df_source_cn.drop(index=[0], inplace=True)
df_source_cn.reset_index(inplace=True)
df_source_cn.drop(columns='index', inplace=True)

#Unpivot fuel type columns
df_source_cn = df_source_cn.melt(list(df_source_cn.columns)[0], var_name='Category 1', value_name='Value')

#Add additional columns
df_source_cn['Group'] = '能源'
df_source_cn['Sub-group'] = '国际比较'
df_source_cn['Source'] = file_name
df_source_cn['Dataset'] = '发电量'
df_source_cn['Indicator'] = '发电'
df_source_cn['Category 2'] = ''
df_source_cn['Category 3'] = ''
df_source_cn['Category 4'] = ''
df_source_cn['Unit'] = '%'
df_source_cn['Year'] = 2017
df_source_cn['Region Type'] = '国家'
df_source_cn = df_source_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [224]:
df_source_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,International Comparison,9Power Generation by Source by country.xls,Power Generation by Source,Power Generation,Petroleum,,,,China,Country,2017,0.2,%


In [225]:
df_source_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,国际比较,9Power Generation by Source by country.xls,发电量,发电,石油,,,,中国,国家,2017,0.2,%


#### 1.8.8 Primary Supply by fuel type and country
**Rearange tables**

In [245]:
types = {"Coal": '9Primary Supply of Coal by country.xls',
         "Gas": '9Primary Supply of Gas by country.xls',
         "Oil": '9Primary Supply of Oil by country.xls'}

types_dict = {"Coal": '煤炭',
              "Gas": '气体',
              "Oil": '油'}

In [255]:
df_supply_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_supply_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for category_1_en in types.keys():
    print(category_1_en)
    
    file_name = types[category_1_en]

    df = pd.read_excel(f'../data/Energy data/9Internantional Comparisonπ˙º ±»Ωœ/{file_name}', skiprows=4)
    
    ### English
    df_en = df.drop(columns=['国家和地区', '比重']).copy()
    df_en.rename(columns={'Contury or Area': 'Region', 'Percent of World': 'Value'}, inplace=True)
    
    #Split value/percentage
    df_value_en = df_en.drop(columns=['Value']).copy()
    df_per_en = df_en[['Region', 'Value']].copy()
    
    #Unpivot values
    df_value_en = df_value_en.melt(list(df_value_en.columns)[0], var_name='Year', value_name='Value')
    
    #Concatenate value/percentage
    df_value_en['Indicator'] = 'Coal Production'
    df_value_en['Unit'] = 'Mtoe'
    df_per_en['Indicator'] = 'Coal Production Proportion'
    df_per_en['Year'] = df_value_en['Year'].unique().max()
    df_per_en['Unit'] = '%'
    df_en = pd.concat([df_value_en, df_per_en])
    
    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'International Comparison'
    df_en['Source'] = file_name
    df_en['Dataset'] = 'Primary Supply by Fuel Type'
    df_en['Indicator'] = 'Primary Supply'
    df_en['Category 1'] = ''
    df_en['Category 2'] = ''
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Region Type'] = 'Countries and regions'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

    df_supply_en= pd.concat([df_supply_en, df_en])
    
    ### Chinese
    df_cn = df.drop(columns=['Contury or Area', 'Percent of World']).copy()
    df_cn.rename(columns={'国家和地区': 'Region', '比重': 'Value'}, inplace=True)
    
    #Split value/percentage
    df_value_cn = df_cn.drop(columns=['Value']).copy()
    df_per_cn = df_cn[['Region', 'Value']].copy()
    
    #Unpivot values
    df_value_cn = df_value_cn.melt(list(df_value_cn.columns)[0], var_name='Year', value_name='Value')
    
    #Concatenate value/percentage
    df_value_cn['Indicator'] = '煤炭生产'
    df_value_cn['Unit'] = '百万吨标准油'
    df_per_cn['Indicator'] = '煤炭生产比重'
    df_per_cn['Year'] = df_value_cn['Year'].unique().max()
    df_per_cn['Unit'] = '%'
    df_cn = pd.concat([df_value_cn, df_per_cn])
    
    #Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '国际比较'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = '按燃料类型的主要供应'
    df_cn['Indicator'] = '主要供应'
    df_cn['Category 1'] = ''
    df_cn['Category 2'] = ''
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Region Type'] = '国家和地区'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_supply_cn= pd.concat([df_supply_cn, df_cn])

Coal
Gas
Oil


In [258]:
df_supply_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,International Comparison,9Primary Supply of Coal by country.xls,Primary Supply by Fuel Type,Primary Supply,,,,,World,Countries and regions,1973,1496.2,Mtoe


In [259]:
df_supply_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,国际比较,9Primary Supply of Coal by country.xls,按燃料类型的主要供应,主要供应,,,,,世界,国家和地区,1973,1496.2,百万吨标准油


#### 1.8.9 Production by fuel type and country
**Rearange tables**

In [260]:
types = {"Crude Oil, NGL and Additives": '9Production of Crude Oil, NGL and Additives by country.xls',
         "Natural Gas": '9Production of Natural Gas by country.xls'}

types_dict = {"Crude Oil, NGL and Additives": '原油和天然气凝析液',
              "Natural Gas": '天然气'}

In [261]:
df_production_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_production_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for category_1_en in types.keys():
    print(category_1_en)
    
    file_name = types[category_1_en]

    df = pd.read_excel(f'../data/Energy data/9Internantional Comparisonπ˙º ±»Ωœ/{file_name}', skiprows=4)
    
    ### English
    df_en = df.drop(columns=['国家和地区', '比重']).copy()
    df_en.rename(columns={'Contury or Area': 'Region', 'Percent of World': 'Value'}, inplace=True)
    
    #Split value/percentage
    df_value_en = df_en.drop(columns=['Value']).copy()
    df_per_en = df_en[['Region', 'Value']].copy()
    
    #Unpivot values
    df_value_en = df_value_en.melt(list(df_value_en.columns)[0], var_name='Year', value_name='Value')
    
    #Concatenate value/percentage
    df_value_en['Indicator'] = 'Coal Production'
    df_value_en['Unit'] = 'Mtoe'
    df_per_en['Indicator'] = 'Coal Production Proportion'
    df_per_en['Year'] = df_value_en['Year'].unique().max()
    df_per_en['Unit'] = '%'
    df_en = pd.concat([df_value_en, df_per_en])
    
    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'International Comparison'
    df_en['Source'] = file_name
    df_en['Dataset'] = 'Production by Fuel Type'
    df_en['Indicator'] = 'Production'
    df_en['Category 1'] = ''
    df_en['Category 2'] = ''
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Region Type'] = 'Countries and regions'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

    df_production_en = pd.concat([df_production_en, df_en])
    
    ### Chinese
    df_cn = df.drop(columns=['Contury or Area', 'Percent of World']).copy()
    df_cn.rename(columns={'国家和地区': 'Region', '比重': 'Value'}, inplace=True)
    
    #Split value/percentage
    df_value_cn = df_cn.drop(columns=['Value']).copy()
    df_per_cn = df_cn[['Region', 'Value']].copy()
    
    #Unpivot values
    df_value_cn = df_value_cn.melt(list(df_value_cn.columns)[0], var_name='Year', value_name='Value')
    
    #Concatenate value/percentage
    df_value_cn['Indicator'] = '煤炭生产'
    df_value_cn['Unit'] = '百万吨标准油'
    df_per_cn['Indicator'] = '煤炭生产比重'
    df_per_cn['Year'] = df_value_cn['Year'].unique().max()
    df_per_cn['Unit'] = '%'
    df_cn = pd.concat([df_value_cn, df_per_cn])
    
    #Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '国际比较'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = '按燃料类型生产'
    df_cn['Indicator'] = '生产'
    df_cn['Category 1'] = ''
    df_cn['Category 2'] = ''
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Region Type'] = '国家和地区'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_production_cn= pd.concat([df_production_cn, df_cn])

Crude Oil, NGL and Additives
Natural Gas


In [266]:
df_production_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,International Comparison,"9Production of Crude Oil, NGL and Additives by...",Production by Fuel Type,Production,,,,,World,Countries and regions,1973,2938.39,Mtoe


In [267]:
df_production_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,国际比较,"9Production of Crude Oil, NGL and Additives by...",按燃料类型生产,生产,,,,,世界总计,国家和地区,1973,2938.39,百万吨标准油


#### 1.8.10 Ratios
**Rearange tables**

In [272]:
datasets = {"Electricity Consumption/GDP (2010 US$)": '9Ratio of Electricity Consumption to GDP by country.xls',
            "Electricity Consumption/Population": '9Ratio of Electricity Consumption to Population by country.xls',
            "TPES/GDP(2010 US$)": '9Ratio of TPES to GDP by country.xls',
            "TPES/Population": '9Ratio of TPES to Population by country.xls',
            "Energy Production/TPES (self-sufficiency)": '9Ration of Energy Production to TPES -self-sufficiency by country.xls'}

datasets_dict = {"Electricity Consumption/GDP (2010 US$)": '国内生产总值电耗(2010年价)',
                 "Electricity Consumption/Population": '人均电力消费量',
                 "TPES/GDP(2010 US$)": '能源供应量/GDP(2010年价格)',
                 "TPES/Population": '人均能源供应量',
                 "Energy Production/TPES (self-sufficiency)": '能源生产量/一次能源供应量(能源自给率)'}

indicators_en = {"Electricity Consumption/GDP (2010 US$)": 'Electricity Consumption/GDP (2010 US$)',
            "Electricity Consumption/Population": 'Electricity Consumption/Population',
            "TPES/GDP(2010 US$)": 'TPES/GDP(2010 US$)',
            "TPES/Population": 'TPES/Population',
            "Energy Production/TPES (self-sufficiency)": 'Energy Production/TPES (self-sufficiency)'}

indicators_cn = {"Electricity Consumption/GDP (2010 US$)": '国内生产总值电耗(2010年价)',
            "Electricity Consumption/Population": '人均电力消费量',
            "TPES/GDP(2010 US$)": '能源供应量/GDP(2010年价格)',
            "TPES/Population": '人均能源供应量',
            "Energy Production/TPES (self-sufficiency)": '能源生产量/一次能源供应量(能源自给率)'}

units_en = {"Electricity Consumption/GDP (2010 US$)": 'kW•h per US$',
            "Electricity Consumption/Population": 'kW•h per capita',
            "TPES/GDP(2010 US$)": 'toe per thousand US$',
            "TPES/Population": 'toe per capita',
            "Energy Production/TPES (self-sufficiency)": ''}

units_cn = {"Electricity Consumption/GDP (2010 US$)": '千瓦小时/美元',
            "Electricity Consumption/Population": '千瓦小时/人',
            "TPES/GDP(2010 US$)": '吨标准油/千美元',
            "TPES/Population": '吨标准油/人',
            "Energy Production/TPES (self-sufficiency)": ''}

In [280]:
df_rations_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_rations_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for dataset in datasets.keys():
    print(dataset)
    
    file_name = datasets[dataset]

    df = pd.read_excel(f'../data/Energy data/9Internantional Comparisonπ˙º ±»Ωœ/{file_name}', skiprows=4)

    ### English
    df_en = df.drop(columns='国家和地区').copy()
    df_en.rename(columns={'Contury or Area': 'Region'}, inplace=True)
       
    #Unpivot fuel type columns
    df_en = df_en.melt(list(df_en.columns)[0], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'International Comparison'
    df_en['Source'] = file_name
    df_en['Dataset'] = dataset
    df_en['Indicator'] = indicators_en[dataset]
    df_en['Category 1'] = ''
    df_en['Category 2'] = ''
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Unit'] = units_en[dataset]
    df_en['Region Type'] = 'Countries and regions'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_rations_en= pd.concat([df_rations_en, df_en])
    
    ### Chinese
    df_cn = df.drop(columns='Contury or Area').copy()
    df_cn.rename(columns={'国家和地区': 'Region'}, inplace=True)
    
    #Unpivot fuel type columns
    df_cn = df_cn.melt(list(df_cn.columns)[0], var_name='Year', value_name='Value')
    
    #Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '国际比较'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = datasets_dict[dataset]
    df_cn['Indicator'] = indicators_cn[dataset]
    df_cn['Category 1'] = ''
    df_cn['Category 2'] = ''
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Unit'] = units_cn[dataset]
    df_cn['Region Type'] = '国家和地区'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_rations_cn = pd.concat([df_rations_cn, df_cn])

Electricity Consumption/GDP (2010 US$)
Electricity Consumption/Population
TPES/GDP(2010 US$)
TPES/Population
Energy Production/TPES (self-sufficiency)


In [281]:
df_rations_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,International Comparison,9Ratio of Electricity Consumption to GDP by co...,Electricity Consumption/GDP (2010 US$),Electricity Consumption/GDP (2010 US$),,,,,World,Countries and regions,1973,0.248,kW•h per US$


In [282]:
df_rations_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,国际比较,9Ratio of Electricity Consumption to GDP by co...,国内生产总值电耗(2010年价),国内生产总值电耗(2010年价),,,,,世界,国家和地区,1973,0.248,千瓦小时/美元


#### 1.8.11 Totals
**Rearange tables**

In [287]:
datasets = {"Total Electricity Generation": '9Total Electricity Generation by country.xls',
            "Total Final Consumption of Energy": '9Total Final Consumption of Energy by country.xls',
            "Total Production of Energy": '9Total Production of Energy by country.xls'}

datasets_dict = {"Total Electricity Generation": '总发电量',
            "Total Final Consumption of Energy": '终端能源消费量',
            "Total Production of Energy": '能源生产总量'}

units_en = {"Total Electricity Generation": 'GWh',
            "Total Final Consumption of Energy": 'Mtoe',
            "Total Production of Energy": 'Mtoe'}

units_cn = {"Total Electricity Generation": '百万千瓦小时',
            "Total Final Consumption of Energy": '百万吨标准油',
            "Total Production of Energy": '百万吨标准油'}

In [288]:
df_totals_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_totals_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for dataset in datasets.keys():
    print(dataset)
    
    file_name = datasets[dataset]

    df = pd.read_excel(f'../data/Energy data/9Internantional Comparisonπ˙º ±»Ωœ/{file_name}', skiprows=4)
    
    ### English
    df_en = df.drop(columns=['国家和地区', '比重']).copy()
    df_en.rename(columns={'Contury or Area': 'Region', 'Percent of World': 'Value'}, inplace=True)
    
    #Split value/percentage
    df_value_en = df_en.drop(columns=['Value']).copy()
    df_per_en = df_en[['Region', 'Value']].copy()
    
    #Unpivot values
    df_value_en = df_value_en.melt(list(df_value_en.columns)[0], var_name='Year', value_name='Value')
    
    #Concatenate value/percentage
    df_value_en['Indicator'] = dataset
    df_value_en['Unit'] = units_en[dataset]
    df_per_en['Indicator'] = dataset+' Proportion'
    df_per_en['Year'] = df_value_en['Year'].unique().max()
    df_per_en['Unit'] = '%'
    df_en = pd.concat([df_value_en, df_per_en])
    
    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'International Comparison'
    df_en['Source'] = file_name
    df_en['Dataset'] = dataset
    df_en['Category 1'] = ''
    df_en['Category 2'] = ''
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Region Type'] = 'Countries and regions'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

    df_totals_en = pd.concat([df_totals_en, df_en])
    
    ### Chinese
    df_cn = df.drop(columns=['Contury or Area', 'Percent of World']).copy()
    df_cn.rename(columns={'国家和地区': 'Region', '比重': 'Value'}, inplace=True)
    
    #Split value/percentage
    df_value_cn = df_cn.drop(columns=['Value']).copy()
    df_per_cn = df_cn[['Region', 'Value']].copy()
    
    #Unpivot values
    df_value_cn = df_value_cn.melt(list(df_value_cn.columns)[0], var_name='Year', value_name='Value')
    
    #Concatenate value/percentage
    df_value_cn['Indicator'] = datasets_dict[dataset]
    df_value_cn['Unit'] = units_cn[dataset]
    df_per_cn['Indicator'] = datasets_dict[dataset]+'比重'
    df_per_cn['Year'] = df_value_cn['Year'].unique().max()
    df_per_cn['Unit'] = '%'
    df_cn = pd.concat([df_value_cn, df_per_cn])
    
    #Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '国际比较'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = datasets_dict[dataset]
    df_cn['Category 1'] = ''
    df_cn['Category 2'] = ''
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Region Type'] = '国家和地区'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_totals_cn= pd.concat([df_totals_cn, df_cn])

Total Electricity Generation
Total Final Consumption of Energy
Total Production of Energy


In [291]:
df_totals_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,International Comparison,9Total Electricity Generation by country.xls,Total Electricity Generation,Total Electricity Generation,,,,,World,Countries and regions,1973,6131143.0,GWh


In [294]:
df_totals_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,国际比较,9Total Electricity Generation by country.xls,总发电量,总发电量,,,,,世界总计,国家和地区,1973,6131143.0,百万千瓦小时


#### **Concatenate all datasets**

In [295]:
df_internantional_comparison_en = pd.concat([df_alt_en, df_coal_en, df_steel_en, df_commodity_en, df_fossil_en, df_net_en, df_source_en, df_supply_en, df_production_en, df_rations_en, df_totals_en])
df_internantional_comparison_cn = pd.concat([df_alt_cn, df_coal_cn, df_steel_cn, df_commodity_cn, df_fossil_cn, df_net_cn, df_source_cn, df_supply_cn, df_production_cn, df_rations_cn, df_totals_cn])

In [299]:
df_internantional_comparison_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,International Comparison,9Alternating Current Power Consumption for Ele...,Alternating Current Power Consumption for Elec...,Power Consumption,,,,,China,Country,1990,17100,kW·h/t


In [296]:
df_internantional_comparison_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,国际比较,9Alternating Current Power Consumption for Ele...,电解铝交流电耗,能量消耗,,,,,中国,国家,1990,17100,千瓦时/吨


**Save tables**

In [300]:
df_internantional_comparison_en.to_csv('../data/Clean_data/Energy/9_Internantional_Comparison/Internantional_Comparison_en.csv')
df_internantional_comparison_cn.to_csv('../data/Clean_data/Energy/9_Internantional_Comparison/Internantional_Comparison_cn.csv')

### 1.9 Emissions, Natural Disaters, and Air Quality
**Datasets:**
- Main Pollutant Emission in Waste Gas by Region  
    - Main Pollutant Emission in Waste Gas by Region
- Forest fires  
    - Forest fires
- Geological Diasters and Prevention and Cure by year and by region
    - Geological Diasters and Prevention and Cure by year and by region
- Loss Caused by Natural Disasters by region
    - Loss Caused by Natural Disasters by region
    
#### 1.9.1 Main Pollutant Emission in Waste Gas by Region
**Rearange tables**

In [510]:
file_name = "10Main Pollutant Emission in Waste Gas by Region (2017).xls"

df = pd.read_excel(f'../data/Energy data/10Emissions&Natural Disaters&Air Quality≈≈∑≈◊‘»ª‘÷∫¶“‘º∞ø’∆¯÷ ¡øµ»/{file_name}', skiprows=4)

### English
df_emission_en = df.drop(columns=['地区']).copy()
df_emission_en.columns = list(df_emission_en.iloc[0])
df_emission_en.drop(index=[0], inplace=True)
df_emission_en.reset_index(inplace=True)
df_emission_en.drop(columns='index', inplace=True)
df_emission_en.loc[0][0] = 'China'
df_emission_en['Region Type'] = ['Country']+['Province']*(len(df_emission_en)-1)
df_emission_en = df_emission_en[[list(df_emission_en.columns)[-1]]+list(df_emission_en.columns)[:-1]]

#Unpivot values
df_emission_en = df_emission_en.melt(list(df_emission_en.columns)[:2], var_name='Category 1', value_name='Value')

#Add additional columns
df_emission_en['Group'] = 'Energy'
df_emission_en['Sub-group'] = 'Emissions, Natural Disasters, and Air Quality'
df_emission_en['Source'] = file_name
df_emission_en['Dataset'] = 'Main Pollutant Emission in Waste Gas by Region (2017)'
df_emission_en['Indicator'] = 'Emissions'
df_emission_en['Category 2'] = ''
df_emission_en['Category 3'] = ''
df_emission_en['Category 4'] = ''
df_emission_en['Year'] = 2017
df_emission_en['Unit'] = '10e+4 tons'
df_emission_en = df_emission_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

### Chinese
df_emission_cn = df.drop(columns=['Unnamed: 1']).copy()
df_emission_cn.drop(index=[0], inplace=True)
df_emission_cn.reset_index(inplace=True)
df_emission_cn.drop(columns='index', inplace=True)
df_emission_cn.rename(columns={'地区': 'Region'}, inplace=True)
df_emission_cn.loc[0][0] = '中国'
df_emission_cn['Region Type'] = ['国家']+['省份']*(len(df_emission_cn)-1)
df_emission_cn = df_emission_cn[[list(df_emission_cn.columns)[-1]]+list(df_emission_cn.columns)[:-1]]

#Unpivot values
df_emission_cn = df_emission_cn.melt(list(df_emission_cn.columns)[:2], var_name='Category 1', value_name='Value')

#Add additional columns
df_emission_cn['Group'] = '能源'
df_emission_cn['Sub-group'] = '排放，自然灾害和空气质量'
df_emission_cn['Source'] = file_name
df_emission_cn['Dataset'] = '分地区废气中主要污染物排放情况 (2017年)'
df_emission_cn['Indicator'] = '排放物'
df_emission_cn['Category 2'] = ''
df_emission_cn['Category 3'] = ''
df_emission_cn['Category 4'] = ''
df_emission_cn['Year'] = 2017
df_emission_cn['Unit'] = '万吨'
df_emission_cn = df_emission_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [513]:
df_emission_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,"Emissions, Natural Disasters, and Air Quality",10Main Pollutant Emission in Waste Gas by Regi...,Main Pollutant Emission in Waste Gas by Region...,Emissions,Sulphur Dioxide,,,,China,Country,2017,875.398,10e+4 tons


In [514]:
df_emission_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,排放，自然灾害和空气质量,10Main Pollutant Emission in Waste Gas by Regi...,分地区废气中主要污染物排放情况 (2017年),排放物,二氧化硫,,,,中国,国家,2017,875.398,万吨


#### 1.9.2 Forest fires
**Rearange tables**

In [544]:
file_name = "Forest fires.xls"

df = pd.read_excel(f'../data/Energy data/10Emissions&Natural Disaters&Air Quality≈≈∑≈◊‘»ª‘÷∫¶“‘º∞ø’∆¯÷ ¡øµ»/{file_name}', skiprows=2)

### English
df_forest_en = df.drop(columns=['地区']).copy()
df_forest_en['Region'] = df_forest_en['Region'].fillna('')
df_forest_en.iloc[1] = df_forest_en.iloc[1].fillna('')

#Split dataset
df_fires_en = df_forest_en.drop(columns=['森林火灾次数', '火场总面积', '受害森林面积', '伤亡人数', '其他损失折款']).copy()
df_forest_en = df_forest_en[['Region', '森林火灾次数', '火场总面积', '受害森林面积', '伤亡人数', '其他损失折款']].copy()

df_fires_en.columns = [x+ '' + y for x, y in zip(list(df_fires_en.iloc[1]), list(df_fires_en.iloc[2]))]
df_fires_en.rename(columns={'': 'Region'}, inplace=True)
df_fires_en.drop(index=[0,1,2], inplace=True)
df_fires_en.reset_index(inplace=True)
df_fires_en.drop(columns='index', inplace=True)
df_fires_en.loc[0][0] = 'China'
df_fires_en['Region Type'] = ['Country']+['Province']*(len(df_fires_en)-1)
df_fires_en = df_fires_en[[list(df_fires_en.columns)[-1]]+list(df_fires_en.columns)[:-1]]

df_forest_en.columns = [x+ '' + y for x, y in zip(list(df_forest_en.iloc[1]), list(df_forest_en.iloc[2]))]
df_forest_en.rename(columns={'': 'Region'}, inplace=True)
df_forest_en.drop(index=[0,1,2], inplace=True)
df_forest_en.reset_index(inplace=True)
df_forest_en.drop(columns='index', inplace=True)
df_forest_en.loc[0][0] = 'China'
df_forest_en['Region Type'] = ['Country']+['Province']*(len(df_forest_en)-1)
df_forest_en = df_forest_en[[list(df_forest_en.columns)[-1]]+list(df_forest_en.columns)[:-1]]

#Unpivot values
df_fires_en = df_fires_en.melt(list(df_fires_en.columns)[:2], var_name='Category 1', value_name='Value')
df_fires_en['Indicator'] = 'Forest fires'
df_fires_en['Unit'] = 'times'

df_forest_en = df_forest_en.melt(list(df_forest_en.columns)[:2], var_name='Indicator', value_name='Value')
df_forest_en['Unit'] = df_forest_en['Indicator'].apply(lambda x: x.split('(')[1].split(')')[0])
df_forest_en['Indicator'] = df_forest_en['Indicator'].apply(lambda x: x.split('(')[0])
df_forest_en['Category 1'] = ''

#Concatenate datasets
df_forest_en = pd.concat([df_forest_en[df_forest_en['Indicator'] == 'Forest fires'], 
                          df_fires_en, 
                          df_forest_en[df_forest_en['Indicator'] != 'Forest fires']])
df_forest_en.reset_index(inplace=True)
df_forest_en.drop(columns='index', inplace=True)

#Add additional columns
df_forest_en['Group'] = 'Energy'
df_forest_en['Sub-group'] = 'Emissions, Natural Disasters, and Air Quality'
df_forest_en['Source'] = file_name
df_forest_en['Dataset'] = 'Forest fires in 2019'
df_forest_en['Category 2'] = ''
df_forest_en['Category 3'] = ''
df_forest_en['Category 4'] = ''
df_forest_en['Year'] = 2019
df_forest_en = df_forest_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

### Chinese
df_forest_cn = df.drop(columns=['Region']).copy()
df_forest_cn.rename(columns={'地区': 'Region'}, inplace=True)
df_forest_cn['Region'] = df_forest_cn['Region'].fillna('')
df_forest_cn.iloc[1] = df_forest_cn.iloc[1].fillna('')

#Split dataset
df_fires_cn = df_forest_cn.drop(columns=['森林火灾次数', '火场总面积', '受害森林面积', '伤亡人数', '其他损失折款']).copy()
df_forest_cn = df_forest_cn[['Region', '森林火灾次数', '火场总面积', '受害森林面积', '伤亡人数', '其他损失折款']].copy()

df_fires_cn.columns = df_fires_cn.iloc[0]
df_fires_cn.rename(columns={'': 'Region'}, inplace=True)
df_fires_cn.drop(index=[0,1,2], inplace=True)
df_fires_cn.reset_index(inplace=True)
df_fires_cn.drop(columns='index', inplace=True)
df_fires_cn.loc[0][0] = '中国'
df_fires_cn['Region Type'] = ['国家']+['省份']*(len(df_fires_cn)-1)
df_fires_cn = df_fires_cn[[list(df_fires_cn.columns)[-1]]+list(df_fires_cn.columns)[:-1]]

df_forest_cn.columns = [x+ '' + y for x, y in zip(list(df_forest_cn.columns), list(df_forest_cn.iloc[0]))]
df_forest_cn.drop(index=[0,1,2], inplace=True)
df_forest_cn.reset_index(inplace=True)
df_forest_cn.drop(columns='index', inplace=True)
df_forest_cn.loc[0][0] = '中国'
df_forest_cn['Region Type'] = ['国家']+['省份']*(len(df_forest_cn)-1)
df_forest_cn = df_forest_cn[[list(df_forest_cn.columns)[-1]]+list(df_forest_cn.columns)[:-1]]

#Unpivot values
df_fires_cn = df_fires_cn.melt(list(df_fires_cn.columns)[:2], var_name='Category 1', value_name='Value')
df_fires_cn['Indicator'] = '森林火灾次数'
df_fires_cn['Unit'] = '次'

df_forest_cn = df_forest_cn.melt(list(df_forest_cn.columns)[:2], var_name='Indicator', value_name='Value')
df_forest_cn['Unit'] = df_forest_cn['Indicator'].apply(lambda x: x.split('(')[1].split(')')[0])
df_forest_cn['Indicator'] = df_forest_cn['Indicator'].apply(lambda x: x.split('(')[0])
df_forest_cn['Category 1'] = ''

#Concatenate datasets
df_forest_cn = pd.concat([df_forest_cn[df_forest_cn['Indicator'] == '森林火灾次数'], 
                          df_fires_cn, 
                          df_forest_cn[df_forest_cn['Indicator'] != '森林火灾次数']])
df_forest_cn.reset_index(inplace=True)
df_forest_cn.drop(columns='index', inplace=True)

#Add additional columns
df_forest_cn['Group'] = '能源'
df_forest_cn['Sub-group'] = '排放，自然灾害和空气质量'
df_forest_cn['Source'] = file_name
df_forest_cn['Dataset'] = '森林火灾情况(2019年)'
df_forest_cn['Category 2'] = ''
df_forest_cn['Category 3'] = ''
df_forest_cn['Category 4'] = ''
df_forest_cn['Year'] = 2019
df_forest_cn = df_forest_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [547]:
df_forest_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,"Emissions, Natural Disasters, and Air Quality",Forest fires.xls,Forest fires in 2019,Forest fires,,,,,China,Country,2019,2345,times


In [548]:
df_forest_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,排放，自然灾害和空气质量,Forest fires.xls,森林火灾情况(2019年),森林火灾次数,,,,,中国,国家,2019,2345,次


#### 1.9.3 Geological Diasters and Prevention and Cure by year and by region
**Rearange tables**

In [606]:
file_name = "Geological Diasters and Prevention and Cure by year and by region.xls"

df = pd.read_excel(f'../data/Energy data/10Emissions&Natural Disaters&Air Quality≈≈∑≈◊‘»ª‘÷∫¶“‘º∞ø’∆¯÷ ¡øµ»/{file_name}', skiprows=2)

df_year = df.iloc[:19].copy()
df_region = pd.concat([df.iloc[:3], df.iloc[19:]])

In [660]:
def add_indicator_en(x):
    if x == 'Deths':
        return 'Casualties'
    else:
        return 'Geological Disasters'
    
def add_indicator_cn(x):
    if x == '死亡人数':
        return '人员伤亡'
    else:
        return '发生地质 灾害数量'
    
def add_unit_en(x):
    if x == 'Deths':
        return 'person'
    else:
        return 'unit'
    
def add_unit_cn(x):
    if x == '死亡人数':
        return '人'
    else:
        return '处'

In [723]:
### English
df_year_en = df_year.drop(columns=['年份']).copy()
df_year_en['Year'] = df_year_en['Year'].fillna('')
df_year_en.iloc[1] = df_year_en.iloc[1].fillna('')

#Split dataset
df_year_cat_en = df_year_en.drop(columns=['发生地质 灾害数量', '人员伤亡', '直接经济损失']).copy()
df_year_in_en = df_year_en[['Year', '发生地质 灾害数量', '人员伤亡', '直接经济损失']].copy()

df_year_cat_en.columns = [x+ '' + y for x, y in zip(list(df_year_cat_en.iloc[1]), list(df_year_cat_en.iloc[2]))]
df_year_cat_en.rename(columns={'': 'Year'}, inplace=True)
df_year_cat_en.drop(index=[0,1,2], inplace=True)
df_year_cat_en.reset_index(inplace=True)
df_year_cat_en.drop(columns='index', inplace=True)

df_year_in_en.columns = [x+ '' + y for x, y in zip(list(df_year_in_en.iloc[1]), list(df_year_in_en.iloc[2]))]
df_year_in_en.rename(columns={'': 'Year'}, inplace=True)
df_year_in_en.drop(index=[0,1,2], inplace=True)
df_year_in_en.reset_index(inplace=True)
df_year_in_en.drop(columns='index', inplace=True)

#Unpivot values
df_year_cat_en = df_year_cat_en.melt(list(df_year_cat_en.columns)[0], var_name='Category 1', value_name='Value')
df_year_cat_en['Indicator'] = df_year_cat_en['Category 1'].apply(lambda x: add_indicator_en(x))
df_year_cat_en['Unit'] = df_year_cat_en['Category 1'].apply(lambda x: add_unit_en(x))

df_year_in_en = df_year_in_en.melt(list(df_year_in_en.columns)[0], var_name='Indicator', value_name='Value')
df_year_in_en['Unit'] = df_year_in_en['Indicator'].apply(lambda x: x.split('(')[1].split(')')[0])
df_year_in_en['Indicator'] = df_year_in_en['Indicator'].apply(lambda x: x.split('(')[0])
df_year_in_en['Category 1'] = ''

#Concatenate datasets
df_year_en = pd.concat([df_year_in_en[df_year_in_en['Indicator'] == 'Geological Disasters'],
                        df_year_cat_en[df_year_cat_en['Indicator'] == 'Geological Disasters'],
                        df_year_in_en[df_year_in_en['Indicator'] == 'Casualties'],
                        df_year_cat_en[df_year_cat_en['Indicator'] == 'Casualties'], 
                        df_year_in_en[df_year_in_en['Indicator'] == 'Direct Econnomic Losses']])
df_year_en.reset_index(inplace=True)
df_year_en.drop(columns='index', inplace=True)

#Add additional columns
df_year_en['Group'] = 'Energy'
df_year_en['Sub-group'] = 'Emissions, Natural Disasters, and Air Quality'
df_year_en['Source'] = file_name
df_year_en['Dataset'] = 'Geological Diasters, Prevention and Cure'
df_year_en['Category 2'] = ''
df_year_en['Category 3'] = ''
df_year_en['Category 4'] = ''
df_year_en['Region'] = 'China'
df_year_en['Region Type'] = 'Country'
df_year_en = df_year_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]


### Chinese
df_year_cn = df_year.drop(columns=['年份']).copy()
df_year_cn['Year'] = df_year_cn['Year'].fillna('')
df_year_cn.iloc[1] = df_year_cn.iloc[1].fillna('')

#Split dataset
df_year_cat_cn = df_year_cn.drop(columns=['发生地质 灾害数量', '人员伤亡', '直接经济损失']).copy()
df_year_in_cn = df_year_cn[['Year', '发生地质 灾害数量', '人员伤亡', '直接经济损失']].copy()

df_year_cat_cn.columns = list(df_year_cat_cn.iloc[0])
df_year_cat_cn.rename(columns={'Region': 'Year'}, inplace=True)
df_year_cat_cn.drop(index=[0,1,2], inplace=True)
df_year_cat_cn.reset_index(inplace=True)
df_year_cat_cn.drop(columns='index', inplace=True)

df_year_in_cn.columns = [x+ '' + y for x, y in zip(list(df_year_in_cn.columns), list(df_year_in_cn.iloc[0]))]
df_year_in_cn.rename(columns={'YearRegion': 'Year'}, inplace=True)
df_year_in_cn.drop(index=[0,1,2], inplace=True)
df_year_in_cn.reset_index(inplace=True)
df_year_in_cn.drop(columns='index', inplace=True)

#Unpivot values
df_year_cat_cn = df_year_cat_cn.melt(list(df_year_cat_cn.columns)[0], var_name='Category 1', value_name='Value')
df_year_cat_cn['Indicator'] = df_year_cat_cn['Category 1'].apply(lambda x: add_indicator_cn(x))
df_year_cat_cn['Unit'] = df_year_cat_cn['Category 1'].apply(lambda x: add_unit_cn(x))

df_year_in_cn = df_year_in_cn.melt(list(df_year_in_cn.columns)[0], var_name='Indicator', value_name='Value')
df_year_in_cn['Unit'] = df_year_in_cn['Indicator'].apply(lambda x: x.split('(')[1].split(')')[0])
df_year_in_cn['Indicator'] = df_year_in_cn['Indicator'].apply(lambda x: x.split('(')[0])
df_year_in_cn['Category 1'] = ''

#Concatenate datasets
df_year_cn = pd.concat([df_year_in_cn[df_year_in_cn['Indicator'] == '发生地质 灾害数量'],
                        df_year_cat_cn[df_year_cat_cn['Indicator'] == '发生地质 灾害数量'],
                        df_year_in_cn[df_year_in_cn['Indicator'] == '人员伤亡'],
                        df_year_cat_cn[df_year_cat_cn['Indicator'] == '人员伤亡'], 
                        df_year_in_cn[df_year_in_cn['Indicator'] == '直接经济损失']])
df_year_cn.reset_index(inplace=True)
df_year_cn.drop(columns='index', inplace=True)

#Add additional columns
df_year_cn['Group'] = '能源'
df_year_cn['Sub-group'] = '排放，自然灾害和空气质量'
df_year_cn['Source'] = file_name
df_year_cn['Dataset'] = '地质灾害及防治情况'
df_year_cn['Category 2'] = ''
df_year_cn['Category 3'] = ''
df_year_cn['Category 4'] = ''
df_year_cn['Region'] = '中国'
df_year_cn['Region Type'] = '国家'
df_year_cn = df_year_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]


In [725]:
df_year_cn

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,排放，自然灾害和空气质量,Geological Diasters and Prevention and Cure by...,地质灾害及防治情况,发生地质 灾害数量,,,,,中国,国家,2000,19653,处
1,能源,排放，自然灾害和空气质量,Geological Diasters and Prevention and Cure by...,地质灾害及防治情况,发生地质 灾害数量,,,,,中国,国家,2005,17751,处
2,能源,排放，自然灾害和空气质量,Geological Diasters and Prevention and Cure by...,地质灾害及防治情况,发生地质 灾害数量,,,,,中国,国家,2006,102804,处
3,能源,排放，自然灾害和空气质量,Geological Diasters and Prevention and Cure by...,地质灾害及防治情况,发生地质 灾害数量,,,,,中国,国家,2007,25364,处
4,能源,排放，自然灾害和空气质量,Geological Diasters and Prevention and Cure by...,地质灾害及防治情况,发生地质 灾害数量,,,,,中国,国家,2008,26580,处
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,能源,排放，自然灾害和空气质量,Geological Diasters and Prevention and Cure by...,地质灾害及防治情况,直接经济损失,,,,,中国,国家,2015,250528,万元
124,能源,排放，自然灾害和空气质量,Geological Diasters and Prevention and Cure by...,地质灾害及防治情况,直接经济损失,,,,,中国,国家,2016,354290,万元
125,能源,排放，自然灾害和空气质量,Geological Diasters and Prevention and Cure by...,地质灾害及防治情况,直接经济损失,,,,,中国,国家,2017,359477,万元
126,能源,排放，自然灾害和空气质量,Geological Diasters and Prevention and Cure by...,地质灾害及防治情况,直接经济损失,,,,,中国,国家,2018,147128,万元


In [727]:
### English
df_region_en = df_region.drop(columns=['年份']).copy()
df_region_en['Year'] = df_region_en['Year'].fillna('')
df_region_en.iloc[1] = df_region_en.iloc[1].fillna('')

#Split dataset
df_region_cat_en = df_region_en.drop(columns=['发生地质 灾害数量', '人员伤亡', '直接经济损失']).copy()
df_region_in_en = df_region_en[['Year', '发生地质 灾害数量', '人员伤亡', '直接经济损失']].copy()

df_region_cat_en.columns = [x+ '' + y for x, y in zip(list(df_region_cat_en.iloc[1]), list(df_region_cat_en.iloc[2]))]
df_region_cat_en.rename(columns={'': 'Region'}, inplace=True)
df_region_cat_en.drop(index=[0,1,2], inplace=True)
df_region_cat_en.reset_index(inplace=True)
df_region_cat_en.drop(columns='index', inplace=True)

df_region_in_en.columns = [x+ '' + y for x, y in zip(list(df_region_in_en.iloc[1]), list(df_region_in_en.iloc[2]))]
df_region_in_en.rename(columns={'': 'Region'}, inplace=True)
df_region_in_en.drop(index=[0,1,2], inplace=True)
df_region_in_en.reset_index(inplace=True)
df_region_in_en.drop(columns='index', inplace=True)

#Unpivot values
df_region_cat_en = df_region_cat_en.melt(list(df_region_cat_en.columns)[0], var_name='Category 1', value_name='Value')
df_region_cat_en['Indicator'] = df_region_cat_en['Category 1'].apply(lambda x: add_indicator_en(x))
df_region_cat_en['Unit'] = df_region_cat_en['Category 1'].apply(lambda x: add_unit_en(x))

df_region_in_en = df_region_in_en.melt(list(df_region_in_en.columns)[0], var_name='Indicator', value_name='Value')
df_region_in_en['Unit'] = df_region_in_en['Indicator'].apply(lambda x: x.split('(')[1].split(')')[0])
df_region_in_en['Indicator'] = df_region_in_en['Indicator'].apply(lambda x: x.split('(')[0])
df_region_in_en['Category 1'] = ''

#Concatenate datasets
df_region_en = pd.concat([df_region_in_en[df_region_in_en['Indicator'] == 'Geological Disasters'],
                          df_region_cat_en[df_region_cat_en['Indicator'] == 'Geological Disasters'],
                          df_region_in_en[df_region_in_en['Indicator'] == 'Casualties'],
                          df_region_cat_en[df_region_cat_en['Indicator'] == 'Casualties'], 
                          df_region_in_en[df_region_in_en['Indicator'] == 'Direct Econnomic Losses']])
df_region_en.reset_index(inplace=True)
df_region_en.drop(columns='index', inplace=True)

#Add additional columns
df_region_en['Group'] = 'Energy'
df_region_en['Sub-group'] = 'Emissions, Natural Disasters, and Air Quality'
df_region_en['Source'] = file_name
df_region_en['Dataset'] = 'Geological Diasters, Prevention and Cure'
df_region_en['Category 2'] = ''
df_region_en['Category 3'] = ''
df_region_en['Category 4'] = ''
df_region_en['Year'] = '2019'
df_region_en['Region Type'] = 'Province'
df_region_en = df_region_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

### Chinese
df_region_cn = df_region.drop(columns=['Year']).copy()
df_region_cn['年份'] = df_region_cn['年份'].fillna('')
df_region_cn.iloc[1] = df_region_cn.iloc[1].fillna('')

#Split dataset
df_region_cat_cn = df_region_cn.drop(columns=['发生地质 灾害数量', '人员伤亡', '直接经济损失']).copy()
df_region_in_cn = df_region_cn[['年份', '发生地质 灾害数量', '人员伤亡', '直接经济损失']].copy()

df_region_cat_cn.columns = list(df_region_cat_cn.iloc[0])
df_region_cat_cn.rename(columns={'地区': 'Region'}, inplace=True)
df_region_cat_cn.drop(index=[0,1,2], inplace=True)
df_region_cat_cn.reset_index(inplace=True)
df_region_cat_cn.drop(columns='index', inplace=True)

df_region_in_cn.columns = [x+ '' + y for x, y in zip(list(df_region_in_cn.columns), list(df_region_in_cn.iloc[0]))]
df_region_in_cn.rename(columns={'年份地区': 'Region'}, inplace=True)
df_region_in_cn.drop(index=[0,1,2], inplace=True)
df_region_in_cn.reset_index(inplace=True)
df_region_in_cn.drop(columns='index', inplace=True)

#Unpivot values
df_region_cat_cn = df_region_cat_cn.melt(list(df_region_cat_cn.columns)[0], var_name='Category 1', value_name='Value')
df_region_cat_cn['Indicator'] = df_region_cat_cn['Category 1'].apply(lambda x: add_indicator_cn(x))
df_region_cat_cn['Unit'] = df_region_cat_cn['Category 1'].apply(lambda x: add_unit_cn(x))

df_region_in_cn = df_region_in_cn.melt(list(df_region_in_cn.columns)[0], var_name='Indicator', value_name='Value')
df_region_in_cn['Unit'] = df_region_in_cn['Indicator'].apply(lambda x: x.split('(')[1].split(')')[0])
df_region_in_cn['Indicator'] = df_region_in_cn['Indicator'].apply(lambda x: x.split('(')[0])
df_region_in_cn['Category 1'] = ''

#Concatenate datasets
df_region_cn = pd.concat([df_region_in_cn[df_region_in_cn['Indicator'] == '发生地质 灾害数量'],
                          df_region_cat_cn[df_region_cat_cn['Indicator'] == '发生地质 灾害数量'],
                          df_region_in_cn[df_region_in_cn['Indicator'] == '人员伤亡'],
                          df_region_cat_cn[df_region_cat_cn['Indicator'] == '人员伤亡'], 
                          df_region_in_cn[df_region_in_cn['Indicator'] == '直接经济损失']])
df_region_cn.reset_index(inplace=True)
df_region_cn.drop(columns='index', inplace=True)

#Add additional columns
df_region_cn['Group'] = '能源'
df_region_cn['Sub-group'] = '排放，自然灾害和空气质量'
df_region_cn['Source'] = file_name
df_region_cn['Dataset'] = '地质灾害及防治情况'
df_region_cn['Category 2'] = ''
df_region_cn['Category 3'] = ''
df_region_cn['Category 4'] = ''
df_region_cn['Year'] = 2019
df_region_cn['Region Type'] = '省份'
df_region_cn = df_region_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [732]:
df_geological_en = pd.concat([df_year_en, df_region_en])
df_geological_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,"Emissions, Natural Disasters, and Air Quality",Geological Diasters and Prevention and Cure by...,"Geological Diasters, Prevention and Cure",Geological Disasters,,,,,China,Country,2000,19653,unit


In [733]:
df_geological_cn = pd.concat([df_year_cn, df_region_cn])
df_geological_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,排放，自然灾害和空气质量,Geological Diasters and Prevention and Cure by...,地质灾害及防治情况,发生地质 灾害数量,,,,,中国,国家,2000,19653,处


#### 1.9.4 Loss Caused by Natural Disasters by region
**Rearange tables**

In [1175]:
file_name = "Loss Caused by Natural Disasters by region.xls"

df = pd.read_excel(f'../data/Energy data/10Emissions&Natural Disaters&Air Quality≈≈∑≈◊‘»ª‘÷∫¶“‘º∞ø’∆¯÷ ¡øµ»/{file_name}', skiprows=3)

df_area = df[['地区', 'Region', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5',
              'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11']].copy()

df_population = df[['地区', 'Region', 'Unnamed: 12', 'Unnamed: 13']].copy()

df_economic = df[['地区', 'Region', 'Unnamed: 14']].copy()

In [1176]:
indicators_area = {'Total Areas Affected of Farm Crops': ['Unnamed: 2', 'Unnamed: 3'], 
                   'Drought': ['Unnamed: 4', 'Unnamed: 5'], 
                   'Flood, Water Logging, Landslides and Debris Flow, Typhoon': ['Unnamed: 6', 'Unnamed: 7'],
                   'Wind and Hail': ['Unnamed: 8', 'Unnamed: 9'], 
                   'Low Temperature, Freezing and Snow Disaster': ['Unnamed: 10', 'Unnamed: 11']}

indicators_area_dict = {'Total Areas Affected of Farm Crops': '农作物受灾面积合计', 
                        'Drought': '旱灾', 
                        'Flood, Water Logging, Landslides and Debris Flow, Typhoon': '洪涝、地质灾害和台风',
                        'Wind and Hail': '风雹灾害', 
                        'Low Temperature, Freezing and Snow Disaster': '低温冷冻和雪灾'}

In [1177]:
df_area_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_area_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for indicator_en in indicators_area.keys():
    ### English
    df_en = df_area.drop(columns=['地区']).copy()
    df_en['Region'] = df_en['Region'].fillna('')
    df_en = df_en[['Region']+indicators_area[indicator_en]]
    df_en.columns = list(df_en.iloc[3])
    df_en.rename(columns={'': 'Region'}, inplace=True)
    df_en.drop(index=[0,1,2,3], inplace=True)
    df_en.iloc[0][0] = 'China'
    df_en['Region Type'] = ['Country']+['Province']*(len(df_en)-1)
    df_en = df_en[[list(df_en.columns)[-1]]+list(df_en.columns)[:-1]]
    
    #Unpivot values
    df_en = df_en.melt(list(df_en.columns)[:2], var_name='Category 1', value_name='Value')
    
    #Add additional columns
    df_en['Group'] = 'Energy'
    df_en['Sub-group'] = 'Emissions, Natural Disasters, and Air Quality'
    df_en['Source'] = file_name
    df_en['Dataset'] = 'Loss Caused by Natural Disasters by region'
    df_en['Indicator'] = indicator_en
    df_en['Category 2'] = ''
    df_en['Category 3'] = ''
    df_en['Category 4'] = ''
    df_en['Year'] = 2019
    df_en['Unit'] = '1000 hectares'
    df_en = df_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

    df_area_en = pd.concat([df_area_en, df_en])
    
    ### Chinese
    df_cn = df_area.drop(columns=['Region']).copy()
    df_cn['地区'] = df_cn['地区'].fillna('')
    df_cn = df_cn[['地区']+indicators_area[indicator_en]]
    df_cn.columns = list(df_cn.iloc[2])
    df_cn.rename(columns={'': 'Region'}, inplace=True)
    df_cn.drop(index=[0,1,2,3], inplace=True)
    df_cn.iloc[0][0] = '中国'
    df_cn['Region Type'] = ['国家']+['省份']*(len(df_cn)-1)
    df_cn = df_cn[[list(df_cn.columns)[-1]]+list(df_cn.columns)[:-1]]
    
    
    #Unpivot values
    df_cn = df_cn.melt(list(df_cn.columns)[:2], var_name='Category 1', value_name='Value')
    
    ##Add additional columns
    df_cn['Group'] = '能源'
    df_cn['Sub-group'] = '排放，自然灾害和空气质量'
    df_cn['Source'] = file_name
    df_cn['Dataset'] = '分地区自然灾害损失情况(2019年)'
    df_cn['Indicator'] = indicators_area_dict[indicator_en]
    df_cn['Category 2'] = ''
    df_cn['Category 3'] = ''
    df_cn['Category 4'] = ''
    df_cn['Year'] = 2019
    df_cn['Unit'] = '千公顷'
    df_cn = df_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]
    
    df_area_cn = pd.concat([df_area_cn, df_cn])

In [1178]:
df_area_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,"Emissions, Natural Disasters, and Air Quality",Loss Caused by Natural Disasters by region.xls,Loss Caused by Natural Disasters by region,Total Areas Affected of Farm Crops,Area Affected,,,,China,Country,2019,19256.9,1000 hectares


In [1179]:
df_area_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,排放，自然灾害和空气质量,Loss Caused by Natural Disasters by region.xls,分地区自然灾害损失情况(2019年),农作物受灾面积合计,受灾,,,,中国,国家,2019,19256.9,千公顷


In [1180]:
### English
df_population_en = df_population.drop(columns=['地区']).copy()
df_population_en['Region'] = df_population_en['Region'].fillna('')
df_population_en.columns = list(df_population_en.iloc[3])
df_population_en.rename(columns={'': 'Region'}, inplace=True)
df_population_en.drop(index=[0,1,2,3], inplace=True)
df_population_en.iloc[0][0] = 'China'
df_population_en['Region Type'] = ['Country']+['Province']*(len(df_population_en)-1)
df_population_en = df_population_en[[list(df_population_en.columns)[-1]]+list(df_population_en.columns)[:-1]]

#Unpivot values
df_population_en = df_population_en.melt(list(df_population_en.columns)[:2], var_name='Category 1', value_name='Value')
df_population_en['Unit'] = df_population_en['Category 1'].apply(lambda x: x.split('(')[1].split(')')[0])
df_population_en['Category 1'] = df_population_en['Category 1'].apply(lambda x: x.split('(')[0])

#Add additional columns
df_population_en['Group'] = 'Energy'
df_population_en['Sub-group'] = 'Emissions, Natural Disasters, and Air Quality'
df_population_en['Source'] = file_name
df_population_en['Dataset'] = 'Loss Caused by Natural Disasters by region'
df_population_en['Indicator'] = 'Population'
df_population_en['Category 2'] = ''
df_population_en['Category 3'] = ''
df_population_en['Category 4'] = ''
df_population_en['Year'] = 2019
df_population_en['Unit'] = '1000 hectares'
df_population_en = df_population_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

### Chinese
df_population_cn = df_population.drop(columns=['Region']).copy()
df_population_cn['地区'] = df_population_cn['地区'].fillna('')
df_population_cn.columns = list(df_population_cn.iloc[2])
df_population_cn.rename(columns={'': 'Region'}, inplace=True)
df_population_cn.drop(index=[0,1,2,3], inplace=True)
df_population_cn.iloc[0][0] = '中国'
df_population_cn['Region Type'] = ['国家']+['省份']*(len(df_population_cn)-1)
df_population_cn = df_population_cn[[list(df_population_cn.columns)[-1]]+list(df_population_cn.columns)[:-1]]


#Unpivot values
df_population_cn = df_population_cn.melt(list(df_population_cn.columns)[:2], var_name='Category 1', value_name='Value')
df_population_cn['Unit'] = df_population_cn['Category 1'].apply(lambda x: x.split('(')[1].split(')')[0])
df_population_cn['Category 1'] = df_population_cn['Category 1'].apply(lambda x: x.split('(')[0])

#Add additional columns
df_population_cn['Group'] = '能源'
df_population_cn['Sub-group'] = '排放，自然灾害和空气质量'
df_population_cn['Source'] = file_name
df_population_cn['Dataset'] = '分地区自然灾害损失情况(2019年)'
df_population_cn['Indicator'] = '人口受灾'
df_population_cn['Category 2'] = ''
df_population_cn['Category 3'] = ''
df_population_cn['Category 4'] = ''
df_population_cn['Year'] = 2019
df_population_cn['Unit'] = '千公顷'
df_population_cn = df_population_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [1181]:
### English
df_economic_en = df_economic.drop(columns=['地区']).copy()
df_economic_en['Region'] = df_economic_en['Region'].fillna('')
df_economic_en.columns = list(df_economic_en.iloc[1])
df_economic_en.rename(columns={'': 'Region', 'Direct economic loss (100 million yuan)': 'Value'}, inplace=True)
df_economic_en.drop(index=[0,1,2,3], inplace=True)
df_economic_en.iloc[0][0] = 'China'
df_economic_en['Region Type'] = ['Country']+['Province']*(len(df_economic_en)-1)
df_economic_en = df_economic_en[[list(df_economic_en.columns)[-1]]+list(df_economic_en.columns)[:-1]]

#Add additional columns
df_economic_en['Group'] = 'Energy'
df_economic_en['Sub-group'] = 'Emissions, Natural Disasters, and Air Quality'
df_economic_en['Source'] = file_name
df_economic_en['Dataset'] = 'Loss Caused by Natural Disasters by region'
df_economic_en['Indicator'] = 'Direct economic loss'
df_economic_en['Category 1'] = ''
df_economic_en['Category 2'] = ''
df_economic_en['Category 3'] = ''
df_economic_en['Category 4'] = ''
df_economic_en['Year'] = 2019
df_economic_en['Unit'] = '100 million yuan'
df_economic_en = df_economic_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

### Chinese
df_economic_cn = df_economic.drop(columns=['Region']).copy()
df_economic_cn['地区'] = df_economic_cn['地区'].fillna('')
df_economic_cn.columns = list(df_economic_cn.iloc[1])
df_economic_cn.rename(columns={'': 'Region', 'Direct economic loss (100 million yuan)': 'Value'}, inplace=True)
df_economic_cn.drop(index=[0,1,2,3], inplace=True)
df_economic_cn.iloc[0][0] = '中国'
df_economic_cn['Region Type'] = ['国家']+['省份']*(len(df_economic_cn)-1)
df_economic_cn = df_economic_cn[[list(df_economic_cn.columns)[-1]]+list(df_economic_cn.columns)[:-1]]

#Add additional columns
df_economic_cn['Group'] = '能源'
df_economic_cn['Sub-group'] = '排放，自然灾害和空气质量'
df_economic_cn['Source'] = file_name
df_economic_cn['Dataset'] = '分地区自然灾害损失情况(2019年)'
df_economic_cn['Indicator'] = '直接经济损失'
df_economic_cn['Category 1'] = ''
df_economic_cn['Category 2'] = ''
df_economic_cn['Category 3'] = ''
df_economic_cn['Category 4'] = ''
df_economic_cn['Year'] = 2019
df_economic_cn['Unit'] = '亿元'
df_economic_cn = df_economic_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [1182]:
df_natural_en = pd.concat([df_area_en, df_population_en, df_economic_en])
df_natural_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,"Emissions, Natural Disasters, and Air Quality",Loss Caused by Natural Disasters by region.xls,Loss Caused by Natural Disasters by region,Total Areas Affected of Farm Crops,Area Affected,,,,China,Country,2019,19256.9,1000 hectares


In [1183]:
df_natural_cn = pd.concat([df_area_cn, df_population_cn, df_economic_cn])
df_natural_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,排放，自然灾害和空气质量,Loss Caused by Natural Disasters by region.xls,分地区自然灾害损失情况(2019年),农作物受灾面积合计,受灾,,,,中国,国家,2019,19256.9,千公顷


#### **Concatenate all datasets**

In [1184]:
df_emissions_disaters_en = pd.concat([df_emission_en, df_forest_en, df_geological_en, df_natural_en])
df_emissions_disaters_cn = pd.concat([df_emission_cn, df_forest_cn, df_geological_cn, df_natural_cn])

In [1185]:
df_emissions_disaters_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,"Emissions, Natural Disasters, and Air Quality",10Main Pollutant Emission in Waste Gas by Regi...,Main Pollutant Emission in Waste Gas by Region...,Emissions,Sulphur Dioxide,,,,China,Country,2017,875.398,10e+4 tons


In [1186]:
df_emissions_disaters_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,排放，自然灾害和空气质量,10Main Pollutant Emission in Waste Gas by Regi...,分地区废气中主要污染物排放情况 (2017年),排放物,二氧化硫,,,,中国,国家,2017,875.398,万吨


**Save tables**

In [1187]:
df_emissions_disaters_en.to_csv('../data/Clean_data/Energy/10_Emissions_NaturalDisaters_AirQuality/Emissions_NaturalDisaters_AirQuality_en.csv')
df_emissions_disaters_cn.to_csv('../data/Clean_data/Energy/10_Emissions_NaturalDisaters_AirQuality/Emissions_NaturalDisaters_AirQuality_cn.csv')

### 1.10 Energy Resource Reserves
**Datasets:**
- Identified Reserves of Major Energy Resources 
    - Identified Reserves of Major Energy Resources
    
#### 1.10.1 Identified Reserves of Major Energy Resources
**Rearange tables**

In [959]:
file_name = "Identified Reserves of Major Energy Resources.xls"

df = pd.read_excel(f'../data/Energy data/14Energy Resources Reservesƒ‹‘¥◊ ‘¥¡ø/{file_name}', skiprows=2)

### English
df_reserves_en = df.drop(columns=['矿产']).copy()
df_reserves_en.rename(columns={'Energy Resources': 'Category 1'}, inplace=True)

#Unpivot values
df_reserves_en = df_reserves_en.melt(list(df_reserves_en.columns)[0], var_name='Year', value_name='Value')
df_reserves_en['Unit'] = df_reserves_en['Category 1'].apply(lambda x: x.split('(')[1].split(')')[0])
df_reserves_en['Category 1'] = df_reserves_en['Category 1'].apply(lambda x: x.split('(')[0])

#Add additional columns
df_reserves_en['Group'] = 'Energy'
df_reserves_en['Sub-group'] = 'Energy Resource Reserves'
df_reserves_en['Source'] = file_name
df_reserves_en['Dataset'] = 'Identified Reserves of Major Energy Resources'
df_reserves_en['Indicator'] = 'Energy Resources'
df_reserves_en['Category 2'] = ''
df_reserves_en['Category 3'] = ''
df_reserves_en['Category 4'] = ''
df_reserves_en['Region'] = 'China'
df_reserves_en['Region Type'] = 'Country'
df_reserves_en = df_reserves_en[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

### Chinese
df_reserves_cn = df.drop(columns=['Energy Resources']).copy()
df_reserves_cn.rename(columns={'矿产': 'Category 1'}, inplace=True)

#Unpivot values
df_reserves_cn = df_reserves_cn.melt(list(df_reserves_cn.columns)[0], var_name='Year', value_name='Value')
df_reserves_cn['Unit'] = df_reserves_cn['Category 1'].apply(lambda x: x.split('(')[1].split(')')[0])
df_reserves_cn['Category 1'] = df_reserves_cn['Category 1'].apply(lambda x: x.split('(')[0])

#Add additional columns
df_reserves_cn['Group'] = '能源'
df_reserves_cn['Sub-group'] = '能源储备'
df_reserves_cn['Source'] = file_name
df_reserves_cn['Dataset'] = '主要矿产查明资源储量'
df_reserves_cn['Indicator'] = '矿物质'
df_reserves_cn['Category 2'] = ''
df_reserves_cn['Category 3'] = ''
df_reserves_cn['Category 4'] = ''
df_reserves_cn['Region'] = '中国'
df_reserves_cn['Region Type'] = '国家'
df_reserves_cn = df_reserves_cn[['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit']]

In [960]:
df_reserves_en.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Resource Reserves,Identified Reserves of Major Energy Resources.xls,Identified Reserves of Major Energy Resources,Energy Resources,Coal,,,,China,Country,2006,11597.8,billion tons


In [961]:
df_reserves_cn.head(1)

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源储备,Identified Reserves of Major Energy Resources.xls,主要矿产查明资源储量,矿物质,煤炭,,,,中国,国家,2006,11597.8,亿吨


**Save tables**

In [962]:
df_reserves_en.to_csv('../data/Clean_data/Energy/11_Energy_Resource_Reserves/Energy_Resource_Reserves_en.csv')
df_reserves_cn.to_csv('../data/Clean_data/Energy/11_Energy_Resource_Reserves/Energy_Resource_Reserves_cn.csv')

### Concatenate all Energy datasets

In [1249]:
groups = {'0_Energy_Balance': 'Energy_Balance', 
 '1_Energy_Consumption': 'Energy_Consumption', 
 '2_Energy_Supply': 'Energy_Supply', 
 '3_Energy_Efficiency': 'Energy_Efficiency', 
 '4_Energy_Trade': 'Energy_Trade', 
 '5_Energy_Investment': 'Energy_Investment',
 '6_Electricity': 'Electricity', 
 '9_Internantional_Comparison': 'Internantional_Comparison', 
 '10_Emissions_NaturalDisaters_AirQuality': 'Emissions_NaturalDisaters_AirQuality', 
 '11_Energy_Resource_Reserves': 'Energy_Resource_Reserves'}

df_all_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])
df_all_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Source', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Value', 'Unit'])

for group in groups.keys():
    df_en = pd.read_csv(f'../data/Clean_data/Energy/{group}/{groups[group]}_en.csv').drop(columns='Unnamed: 0')
    df_cn = pd.read_csv(f'../data/Clean_data/Energy/{group}/{groups[group]}_cn.csv').drop(columns='Unnamed: 0')
    
    df_all_en = pd.concat([df_all_en, df_en])
    df_all_cn = pd.concat([df_all_cn, df_cn])
    
df_all_en.reset_index(inplace=True)
df_all_en.drop(columns='index', inplace=True)

df_all_cn.reset_index(inplace=True)
df_all_cn.drop(columns='index', inplace=True)

In [1250]:
df_all_en.head()

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,Energy,Energy Balance,Energy Balance by region and by energy type-en...,Energy Balance of China (2017),Input(-) & Output(+) of Trans formation,Physical Quantity,Other Washed Coal,,,China,Country,2017,14563.692962,10e+4 tons
1,Energy,Energy Balance,Energy Balance by region and by energy type-en...,Energy Balance of China (2017),Input(-) & Output(+) of Trans formation,Physical Quantity,Other Coking Products,,,China,Country,2017,1077.40584046696,10e+4 tons
2,Energy,Energy Balance,Energy Balance by region and by energy type-en...,Energy Balance of China (2017),Input(-) & Output(+) of Trans formation,Physical Quantity,Other Gas,,,China,Country,2017,165.197243,10e+8 cu.m
3,Energy,Energy Balance,Energy Balance by region and by energy type-en...,Energy Balance of China (2017),Input(-) & Output(+) of Trans formation,Physical Quantity,Other Petroleum Products,,,China,Country,2017,3816.85252,10e+4 tons
4,Energy,Energy Balance,Energy Balance by region and by energy type-en...,Energy Balance of China (2017),Input(-) & Output(+) of Trans formation,Physical Quantity,Other Energy,,,China,Country,2017,-764.350026722366,10e+4 tce


In [1251]:
df_all_cn.head()

Unnamed: 0,Group,Sub-group,Source,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Value,Unit
0,能源,能源平衡,Energy Balance by region and by energy type-cn...,中国的能源平衡 (2017),加工转换投入(-)产出(+)量,实物量,其他洗煤,,,中国,国家,2017,14563.692962,万吨
1,能源,能源平衡,Energy Balance by region and by energy type-cn...,中国的能源平衡 (2017),加工转换投入(-)产出(+)量,实物量,其他焦化产品,,,中国,国家,2017,1077.40584046696,万吨
2,能源,能源平衡,Energy Balance by region and by energy type-cn...,中国的能源平衡 (2017),加工转换投入(-)产出(+)量,实物量,其他煤气,,,中国,国家,2017,165.197243,亿立方米
3,能源,能源平衡,Energy Balance by region and by energy type-cn...,中国的能源平衡 (2017),加工转换投入(-)产出(+)量,实物量,其他石油制品,,,中国,国家,2017,3816.85252,万吨
4,能源,能源平衡,Energy Balance by region and by energy type-cn...,中国的能源平衡 (2017),加工转换投入(-)产出(+)量,实物量,其他能源,,,中国,国家,2017,-764.350026722366,万吨标煤


**Save tables**

In [1252]:
df_all_en.to_csv('../data/Clean_data/Energy/Energy_en.csv')
df_all_cn.to_csv('../data/Clean_data/Energy/Energy_cn.csv')

**Indicators table**

In [1253]:
df_indicators_en = df_all_en[['Group', 'Sub-group', 'Dataset', 'Indicator']].groupby(['Sub-group', 'Dataset', 'Indicator'], sort=False).first().reset_index()[['Group', 'Sub-group', 'Dataset', 'Indicator']]
df_indicators_en

Unnamed: 0,Group,Sub-group,Dataset,Indicator
0,Energy,Energy Balance,Energy Balance of China (2017),Input(-) & Output(+) of Trans formation
1,Energy,Energy Balance,Energy Balance of China (2017),Total Primary Energy Supply
2,Energy,Energy Balance,Energy Balance of China (2017),Statistical Difference
3,Energy,Energy Balance,Energy Balance of China (2017),Loss
4,Energy,Energy Balance,Energy Balance of China (2017),Total Energy Consumption
...,...,...,...,...
80,Energy,"Emissions, Natural Disasters, and Air Quality",Loss Caused by Natural Disasters by region,Wind and Hail
81,Energy,"Emissions, Natural Disasters, and Air Quality",Loss Caused by Natural Disasters by region,"Low Temperature, Freezing and Snow Disaster"
82,Energy,"Emissions, Natural Disasters, and Air Quality",Loss Caused by Natural Disasters by region,Population
83,Energy,"Emissions, Natural Disasters, and Air Quality",Loss Caused by Natural Disasters by region,Direct economic loss


In [1254]:
df_indicators_cn = df_all_cn[['Group', 'Sub-group', 'Dataset', 'Indicator']].groupby(['Sub-group', 'Dataset', 'Indicator'], sort=False).first().reset_index()[['Group', 'Sub-group', 'Dataset', 'Indicator']]
df_indicators_cn

Unnamed: 0,Group,Sub-group,Dataset,Indicator
0,能源,能源平衡,中国的能源平衡 (2017),加工转换投入(-)产出(+)量
1,能源,能源平衡,中国的能源平衡 (2017),可供本地区消费的能源量
2,能源,能源平衡,中国的能源平衡 (2017),平衡差额
3,能源,能源平衡,中国的能源平衡 (2017),损失量
4,能源,能源平衡,中国的能源平衡 (2017),消费量合计
...,...,...,...,...
80,能源,排放，自然灾害和空气质量,分地区自然灾害损失情况(2019年),风雹灾害
81,能源,排放，自然灾害和空气质量,分地区自然灾害损失情况(2019年),低温冷冻和雪灾
82,能源,排放，自然灾害和空气质量,分地区自然灾害损失情况(2019年),人口受灾
83,能源,排放，自然灾害和空气质量,分地区自然灾害损失情况(2019年),直接经济损失


**Save tables**

In [1228]:
df_indicators_en.to_csv('../data/Clean_data/Energy/Energy_indicators_en.csv')
df_indicators_cn.to_csv('../data/Clean_data/Energy/Energy_indicators_cn.csv')

**Structure table**

In [1238]:
group = 'Energy'
df_structure_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Unit'])
df_tmp_en = pd.DataFrame(columns=['Group', 'Sub-group', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Unit'])
    
for sub_group in list(df_all_en['Sub-group'].unique()):
    for dataset in list(df_all_en[df_all_en['Sub-group'] == sub_group]['Dataset'].unique()):
        for indicator in list(df_all_en[(df_all_en['Sub-group'] == sub_group) & (df_all_en['Dataset'] == dataset)]['Indicator'].unique()):
                    
            df = df_all_en[(df_all_en['Sub-group'] == sub_group) & 
                      (df_all_en['Dataset'] == dataset) &
                      (df_all_en['Indicator'] == indicator)].copy()
            
            df_tmp_en['Group'] = [group]
            df_tmp_en['Sub-group'] = [sub_group]
            df_tmp_en['Dataset'] = [dataset]
            df_tmp_en['Indicator'] = [indicator]
            
            df_tmp_en['Category 1'] = [list(df['Category 1'].unique())]
            df_tmp_en['Category 2'] = [list(df['Category 2'].unique())]
            df_tmp_en['Category 3'] = [list(df['Category 3'].unique())]
            df_tmp_en['Category 4'] = [list(df['Category 4'].unique())]
            df_tmp_en['Region'] = [list(df['Region'].unique())]
            df_tmp_en['Region Type'] = [list(df['Region Type'].unique())]
            df_tmp_en['Year'] = [list(df['Year'].unique())]
            df_tmp_en['Unit'] = [list(df['Unit'].unique())]
            
            df_structure_en = pd.concat([df_structure_en, df_tmp_en])
            
df_structure_en.reset_index(inplace=True)
df_structure_en.drop(columns='index', inplace=True)

In [1239]:
df_structure_en

Unnamed: 0,Group,Sub-group,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Unit
0,Energy,Energy Balance,Energy Balance of China (2017),Input(-) & Output(+) of Trans formation,"[Physical Quantity, Standard Quantity]","[Other Washed Coal, Other Coking Products, Oth...","[nan, Other Washed Coal, Other Gas, Other Pet...","[nan, Coke, Other Petroleum Products, Diesel ...",[China],[Country],[2017],"[10e+4 tons, 10e+8 cu.m, 10e+4 tce, 10e+10 kJ,..."
1,Energy,Energy Balance,Energy Balance of China (2017),Total Primary Energy Supply,"[Physical Quantity, Standard Quantity]","[Other Washed Coal, Other Petroleum Products, ...","[nan, Other Energy, Crude Oil, Raw Coal, Nat...","[nan, Electricity, Energy Total coal equivale...",[China],[Country],[2017],"[10e+4 tons, 10e+4 tce, 10e+8 cu.m, 10e+8 kW•h..."
2,Energy,Energy Balance,Energy Balance of China (2017),Statistical Difference,"[Physical Quantity, Standard Quantity]","[Other Washed Coal, Other Coking Products, Oth...",[nan],[nan],[China],[Country],[2017],"[10e+4 tons, 10e+8 cu.m, 10e+4 tce, 10e+10 kJ,..."
3,Energy,Energy Balance,Energy Balance of China (2017),Loss,"[Physical Quantity, Standard Quantity]","[Crude Oil, Natural Gas, Petroleum Products ...",[nan],[nan],[China],[Country],[2017],"[10e+4 tons, 10e+8 cu.m, 10e+10 kJ, 10e+8 kW•h..."
4,Energy,Energy Balance,Energy Balance of China (2017),Total Energy Consumption,"[Physical Quantity, Standard Quantity]","[Other Washed Coal, Other Coking Products, Oth...",[nan],[nan],[China],[Country],[2017],"[10e+4 tons, 10e+8 cu.m, 10e+4 tce, 10e+10 kJ,..."
...,...,...,...,...,...,...,...,...,...,...,...,...
80,Energy,"Emissions, Natural Disasters, and Air Quality",Loss Caused by Natural Disasters by region,Wind and Hail,"[Area Affected, Total Crop Failure]",[nan],[nan],[nan],"[China, Beijing, Tianjin, Hebei, Shanxi, Inner...","[Country, Province]",[2019],[1000 hectares]
81,Energy,"Emissions, Natural Disasters, and Air Quality",Loss Caused by Natural Disasters by region,"Low Temperature, Freezing and Snow Disaster","[Area Affected, Total Crop Failure]",[nan],[nan],[nan],"[China, Beijing, Tianjin, Hebei, Shanxi, Inner...","[Country, Province]",[2019],[1000 hectares]
82,Energy,"Emissions, Natural Disasters, and Air Quality",Loss Caused by Natural Disasters by region,Population,"[Population affected , Deaths ]",[nan],[nan],[nan],"[China, Beijing, Tianjin, Hebei, Shanxi, Inner...","[Country, Province]",[2019],[1000 hectares]
83,Energy,"Emissions, Natural Disasters, and Air Quality",Loss Caused by Natural Disasters by region,Direct economic loss,[nan],[nan],[nan],[nan],"[China, Beijing, Tianjin, Hebei, Shanxi, Inner...","[Country, Province]",[2019],[100 million yuan]


In [1240]:
group = 'Energy'
df_structure_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Unit'])
df_tmp_cn = pd.DataFrame(columns=['Group', 'Sub-group', 'Dataset', 'Indicator', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Region', 'Region Type', 'Year', 'Unit'])
    
for sub_group in list(df_all_cn['Sub-group'].unique()):
    for dataset in list(df_all_cn[df_all_cn['Sub-group'] == sub_group]['Dataset'].unique()):
        for indicator in list(df_all_cn[(df_all_cn['Sub-group'] == sub_group) & (df_all_cn['Dataset'] == dataset)]['Indicator'].unique()):
                    
            df = df_all_cn[(df_all_cn['Sub-group'] == sub_group) & 
                      (df_all_cn['Dataset'] == dataset) &
                      (df_all_cn['Indicator'] == indicator)].copy()
            
            df_tmp_cn['Group'] = [group]
            df_tmp_cn['Sub-group'] = [sub_group]
            df_tmp_cn['Dataset'] = [dataset]
            df_tmp_cn['Indicator'] = [indicator]
        
            df_tmp_cn['Category 1'] = [list(df['Category 1'].unique())]
            df_tmp_cn['Category 2'] = [list(df['Category 2'].unique())]
            df_tmp_cn['Category 3'] = [list(df['Category 3'].unique())]
            df_tmp_cn['Category 4'] = [list(df['Category 4'].unique())]
            df_tmp_cn['Region'] = [list(df['Region'].unique())]
            df_tmp_cn['Region Type'] = [list(df['Region Type'].unique())]
            df_tmp_cn['Year'] = [list(df['Year'].unique())]
            df_tmp_cn['Unit'] = [list(df['Unit'].unique())]
            
            df_structure_cn = pd.concat([df_structure_cn, df_tmp_cn])
            
df_structure_cn.reset_index(inplace=True)
df_structure_cn.drop(columns='index', inplace=True)

In [1241]:
df_structure_cn

Unnamed: 0,Group,Sub-group,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Unit
0,Energy,能源平衡,中国的能源平衡 (2017),加工转换投入(-)产出(+)量,"[实物量, 标准量]","[其他洗煤, 其他焦化产品, 其他煤气, 其他石油制品, 其他能源, 原油, 原煤, 型煤,...","[nan, 其他洗煤, 其他煤气, 其他石油制品, 其他能源, 原煤, 天然气, 柴油, 汽...","[nan, 焦炭, 其他石油制品, 柴油, 汽油, 油品合计, 润滑油, 液化石油气, 炼厂...",[中国],[国家],[2017],"[万吨, 亿立方米, 万吨标煤, 万百万千焦, 亿千瓦小时, 万吨标准煤]"
1,Energy,能源平衡,中国的能源平衡 (2017),可供本地区消费的能源量,"[实物量, 标准量]","[其他洗煤, 其他石油制品, 其他能源, 原油, 原煤, 型煤, 天然气, 柴油, 汽油, ...","[nan, 其他能源, 原油, 原煤, 天然气, 油品合计, 煤合计, 电力, 核电, 水电...","[nan, 电力, 发电煤耗计算法, 电热当量计算法Energy Total calorif...",[中国],[国家],[2017],"[万吨, 万吨标煤, 亿立方米, 亿千瓦小时, 万吨标准煤]"
2,Energy,能源平衡,中国的能源平衡 (2017),平衡差额,"[实物量, 标准量]","[其他洗煤, 其他焦化产品, 其他煤气, 其他石油制品, 其他能源, 原油, 原煤, 型煤,...","[nan, 发电煤耗计算法, 电热当量计算法Energy Total calorific v...",[nan],[中国],[国家],[2017],"[万吨, 亿立方米, 万吨标煤, 万百万千焦, 万吨标准煤]"
3,Energy,能源平衡,中国的能源平衡 (2017),损失量,"[实物量, 标准量]","[原油, 天然气, 油品合计, 液化天然气, 液化石油气, 热力, 电力, 能源合计]","[nan, 发电煤耗计算法, 电热当量计算法Energy Total calorific v...",[nan],[中国],[国家],[2017],"[万吨, 亿立方米, 万百万千焦, 亿千瓦小时, 万吨标准煤]"
4,Energy,能源平衡,中国的能源平衡 (2017),消费量合计,"[实物量, 标准量]","[其他洗煤, 其他焦化产品, 其他煤气, 其他石油制品, 其他能源, 原油, 原煤, 型煤,...","[nan, 发电煤耗计算法, 电热当量计算法Energy Total calorific v...",[nan],[中国],[国家],[2017],"[万吨, 亿立方米, 万吨标煤, 万百万千焦, 亿千瓦小时, 万吨标准煤]"
...,...,...,...,...,...,...,...,...,...,...,...,...
80,Energy,排放，自然灾害和空气质量,分地区自然灾害损失情况(2019年),风雹灾害,"[受灾, 绝收]",[nan],[nan],[nan],"[中国, 北 京, 天 津, 河 北, 山 西, 内蒙古, 辽 宁, ...","[国家, 省份]",[2019],[千公顷]
81,Energy,排放，自然灾害和空气质量,分地区自然灾害损失情况(2019年),低温冷冻和雪灾,"[受灾, 绝收]",[nan],[nan],[nan],"[中国, 北 京, 天 津, 河 北, 山 西, 内蒙古, 辽 宁, ...","[国家, 省份]",[2019],[千公顷]
82,Energy,排放，自然灾害和空气质量,分地区自然灾害损失情况(2019年),人口受灾,"[受灾人口 , 死亡人口 ]",[nan],[nan],[nan],"[中国, 北 京, 天 津, 河 北, 山 西, 内蒙古, 辽 宁, ...","[国家, 省份]",[2019],[千公顷]
83,Energy,排放，自然灾害和空气质量,分地区自然灾害损失情况(2019年),直接经济损失,[nan],[nan],[nan],[nan],"[中国, 北 京, 天 津, 河 北, 山 西, 内蒙古, 辽 宁, ...","[国家, 省份]",[2019],[亿元]


In [1245]:
df_structure_en.iloc[36:40]

Unnamed: 0,Group,Sub-group,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Unit
36,Energy,Energy Efficiency,Energy Intensity by GDP,Energy Intensity by GDP,"[GDP is calculated at 1980 constant prices, GD...","[Total Energy, Coal, Coke, Petroleum, Crude Oi...",[nan],[nan],[China],[Country],"[1980, 1981, 1982, 1983, 1984, 1985, 1986, 198...",[%]
37,Energy,Energy Trade,Imports and Exports of Major Energy Products,Import volume,"[Coal, Coke, Crude Oil, Gasoline, Kerosene, Di...",[nan],[nan],[nan],[China],[Country],"[2000, 2005, 2010, 2011, 2012, 2013, 2014, 201...","[10e+4 tons, 10e+8 cu.m, 10e+8 kW•h]"
38,Energy,Energy Trade,Imports and Exports of Major Energy Products,Export volume,"[Coal, Coke and Semi-coke, Crude Oil, Gasoline...",[nan],[nan],[nan],[China],[Country],"[2000, 2005, 2010, 2011, 2012, 2013, 2014, 201...","[10e+4 tons, 10e+8 cu.m, 10e+8 kW•h]"
39,Energy,Energy Investment,Investment in Energy Industry,Investment,"[Energy Industry, Coal Mining and Processing, ...",[nan],[nan],[nan],[China],[Country],"[1995, 2000, 2005, 2010, 2011, 2012, 2013, 201...",[100 million yuan]


In [1246]:
df_structure_cn.iloc[36:40]

Unnamed: 0,Group,Sub-group,Dataset,Indicator,Category 1,Category 2,Category 3,Category 4,Region,Region Type,Year,Unit
36,Energy,能源效率,平均每万元国内生产总值能源消费量,平均每万元国内生产总值能源消费量,"[国内生产总值按1980年可比价格计算, 国内生产总值按1990年可比价格计算, 国内生产总...","[总能量, 煤炭, 焦炭, 石油, 原油, 燃料油, 电力]",[nan],[nan],[中国],[国家],"[1980, 1981, 1982, 1983, 1984, 1985, 1986, 198...",[%]
37,Energy,Energy Trade,Imports and Exports of Major Energy Products,Import volume,"[Coal, Coke, Crude Oil, Gasoline, Kerosene, Di...",[nan],[nan],[nan],[China],[Country],"[2000, 2005, 2010, 2011, 2012, 2013, 2014, 201...","[10e+4 tons, 10e+8 cu.m, 10e+8 kW•h]"
38,Energy,Energy Trade,Imports and Exports of Major Energy Products,Export volume,"[Coal, Coke and Semi-coke, Crude Oil, Gasoline...",[nan],[nan],[nan],[China],[Country],"[2000, 2005, 2010, 2011, 2012, 2013, 2014, 201...","[10e+4 tons, 10e+8 cu.m, 10e+8 kW•h]"
39,Energy,能源投资,能源工业分行业投资,投资,"[能源工业, 煤炭采选业, 石油和天然气开采业, 电力、蒸汽、热水生产和供应业, 石油加工及...",[nan],[nan],[nan],[中国],[国家],"[1995, 2000, 2005, 2010, 2011, 2012, 2013, 201...",[亿元]


In [1247]:
df_indicators_cn.iloc[36:40]

Unnamed: 0,Group,Sub-group,Dataset,Indicator
36,能源,能源效率,平均每万元国内生产总值能源消费量,平均每万元国内生产总值能源消费量
37,Energy,Energy Trade,Imports and Exports of Major Energy Products,Import volume
38,Energy,Energy Trade,Imports and Exports of Major Energy Products,Export volume
39,能源,能源投资,能源工业分行业投资,投资


**Save tables**

In [1203]:
df_structure_en.to_csv('../data/Clean_data/Energy/Energy_structure_en.csv')
df_structure_cn.to_csv('../data/Clean_data/Energy/Energy_structure_cn.csv')