# Home task: pandas 

In [1]:
import pandas as pd
import numpy as np

### Question 1

- Load the energy data from the file [Energy Indicators.xls](http://unstats.un.org/unsd/environment/excel_file_tables/2013/Energy%20Indicators.xls).
It is a list of indicators of energy supply and renewable electricity production from the United Nations for the year 2013.


- It should be put into a DataFrame with the variable name of "energy"


In [2]:
# I took data from the repository
raw_energy = pd.read_excel('Energy Indicators.xls')
raw_energy.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,Environmental Indicators: Energy,,,
1,,,,,,
2,,,Energy Supply and Renewable Electricity Produc...,,,
3,,,,,,
4,,,,,,Last update: December 2015


- Make sure to exclude the footer and header information from the datafile.

In [3]:
# header
raw_energy.head(20)

# header ends at the 17th row

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,Environmental Indicators: Energy,,,
1,,,,,,
2,,,Energy Supply and Renewable Electricity Produc...,,,
3,,,,,,
4,,,,,,Last update: December 2015
5,,,,,,
6,,,Choose a country from the following drop-down ...,,,Afghanistan
7,,,,,,
8,,,Country,Energy Supply,Energy Supply per capita,Renewable Electricity Production
9,,,,Petajoules,Gigajoules,%


In [4]:
# footer
raw_energy.tail(50)
# footer starts at the 245th row

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
232,,United Republic of Tanzania,United Republic of Tanzania,994,20,31.07284
233,,United States of America,United States of America20,90838,286,11.57098
234,,United States Virgin Islands,United States Virgin Islands,...,...,0.0
235,,Uruguay,Uruguay,196,58,71.60504
236,,Uzbekistan,Uzbekistan,1798,62,21.32841
237,,Vanuatu,Vanuatu,3,10,14.70588
238,,Venezuela (Bolivarian Republic of),Venezuela (Bolivarian Republic of),2871,95,67.83452
239,,Viet Nam,Viet Nam,2554,28,45.32152
240,,Wallis and Futuna Islands,Wallis and Futuna Islands,0,26,0.0
241,,Yemen,Yemen,344,13,0.0


- The first two columns are unnecessary, so you should get rid of them, and you should change the column labels so that the columns are:<br>
`['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']`


In [11]:
# Keep only the data rows (positions 17..243) and drop the first two columns.
# .copy() makes `energy` an independent frame, so the column assignments done
# on it afterwards write to `energy` itself rather than to a slice of
# `raw_energy` (the pattern pandas flags with SettingWithCopyWarning).
energy = raw_energy.iloc[17:244, 2:].copy()
energy.head()

Unnamed: 0,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
17,Afghanistan,321,10,78.66928
18,Albania,102,35,100.0
19,Algeria,1959,51,0.55101
20,American Samoa,...,...,0.641026
21,Andorra,9,121,88.69565


In [12]:
# change column names
energy.columns = ['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']
energy.head()


Unnamed: 0,Country,Energy Supply,Energy Supply per Capita,% Renewable
17,Afghanistan,321,10,78.66928
18,Albania,102,35,100.0
19,Algeria,1959,51,0.55101
20,American Samoa,...,...,0.641026
21,Andorra,9,121,88.69565


In [None]:
# energy_copy = energy.copy()
# # energy = energy_copy.copy()


- For all countries which have missing data (e.g. data with `...`) make sure this is reflected as `np.NaN` values.

<font color = 'red'>
!!! Note: the index of `energy` does not start at 0 (the rows keep their labels from the raw sheet)
</font>

In [8]:
# Check 'Country'
energy[energy['Country'] == '...'].head()

Unnamed: 0,Country,Energy Supply,Energy Supply per Capita,% Renewable


In [13]:
# Every column after 'Country' should be numeric; cells that cannot be parsed
# (such as the '...' placeholders) are turned into NaN by errors='coerce'.
numeric_cols = list(energy.columns[1:])
energy[numeric_cols] = energy[numeric_cols].apply(pd.to_numeric, errors='coerce')
energy.head()

Unnamed: 0,Country,Energy Supply,Energy Supply per Capita,% Renewable
17,Afghanistan,321.0,10.0,78.66928
18,Albania,102.0,35.0,100.0
19,Algeria,1959.0,51.0,0.55101
20,American Samoa,,,0.641026
21,Andorra,9.0,121.0,88.69565


In [10]:
energy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227 entries, 17 to 243
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Country                   227 non-null    object 
 1   Energy Supply             222 non-null    float64
 2   Energy Supply per Capita  222 non-null    float64
 3   % Renewable               227 non-null    float64
dtypes: float64(3), object(1)
memory usage: 7.2+ KB


In [None]:
# energy_num = energy.copy()
# energy = energy_num.copy()

- Convert `Energy Supply` to gigajoules (there are 1,000,000 gigajoules in a petajoule).

In [14]:
energy['Energy Supply'] = energy['Energy Supply']*1000000
energy['Energy Supply'].head()

17    3.210000e+08
18    1.020000e+08
19    1.959000e+09
20             NaN
21    9.000000e+06
Name: Energy Supply, dtype: float64

In [None]:
# energy_gigsup = energy.copy()
# energy = energy_gigsup.copy()

- There are also several countries with numbers and/or parentheses in their name. Be sure to remove these, e.g.:
    - `Bolivia (Plurinational State of)` should be `Bolivia`,
    - `Switzerland17` should be `Switzerland`.

In [28]:
num_con = energy.loc[energy['Country'].str.contains('\\d'), 'Country']
num_con

28                                            Australia1
59                                                China2
60       China, Hong Kong Special Administrative Region3
61           China, Macao Special Administrative Region4
75                                              Denmark5
90                                               France6
100                                           Greenland7
114                                           Indonesia8
120                                               Italy9
122                                              Japan10
128                                             Kuwait11
160                                        Netherlands12
178                                           Portugal13
194                                       Saudi Arabia14
196                                             Serbia15
207                                              Spain16
214                                        Switzerland17
229                            

In [29]:
print(energy['Country'].isna().sum())
energy['Country'] = energy['Country'].str.replace('\\d','', regex = True)
print(energy['Country'].isna().sum())
energy.loc[energy['Country'].str.contains('\\d')]

0
0


Unnamed: 0,Country,Energy Supply,Energy Supply per Capita,% Renewable


In [30]:
par_con = energy.loc[energy['Country'].str.contains('(', regex = False), 'Country']
# par_con.str.replace(' (.*', '', regex = True)   
par_con #.str.replace(r' \(.*', '', regex=True)

41       Bolivia (Plurinational State of)
87            Falkland Islands (Malvinas)
115            Iran (Islamic Republic of)
150      Micronesia (Federated States of)
200             Sint Maarten (Dutch part)
238    Venezuela (Bolivarian Republic of)
Name: Country, dtype: object

In [22]:
r = pd.Series(['text (ssd)'])


In [33]:
r.str.replace('[(].*','r',regex=True)

0    text r
dtype: object

In [31]:
par_con.str.replace(' [(].*', '', regex=True).iloc[0]

'Bolivia'

In [32]:
energy['Country'] = energy['Country'].str.replace(' [(].*', '', regex=True)
energy.loc[energy['Country'].str.contains('(', regex = False), 'Country']

Series([], Name: Country, dtype: object)

In [33]:
print(energy['Country'].isna().sum())

0


In [None]:
# clean_con = energy.copy()

- Rename the following list of countries (for use in later questions):
    - `Republic of Korea`: `South Korea`,
    - `United States of America`: `United States`,
    - `United Kingdom of Great Britain and Northern Ireland`: `United Kingdom`,
    - `China, Hong Kong Special Administrative Region`: `Hong Kong`

In [7]:
countries_mapping = {
    'Republic of Korea': 'South Korea',
    'United States of America': 'United States',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'China, Hong Kong Special Administrative Region': 'Hong Kong'
}

In [None]:
# pd.set_option('display.max_colwidth', None)
# pd.reset_option('display.max_colwidth')

In [40]:
energy.loc[energy['Country'].str.contains('United Kingdom'), 'Country']

231    United Kingdom
Name: Country, dtype: object

In [39]:
# Rename by exact match on the whole country name. With regex=True the keys of
# the mapping are treated as patterns and replaced wherever they occur inside a
# value, so e.g. 'Republic of Korea' would also rewrite the tail of any longer
# name that contains it.
energy['Country'] = energy['Country'].replace(countries_mapping)

In [41]:
energy['Country'].isna().sum()

0

In [None]:
# en_renamed = energy.copy()

- Next, load the GDP data from the file ["world_bank.csv"](http://data.worldbank.org/indicator/NY.GDP.MKTP.CD). 
It is a csv containing countries' GDP from 1960 to 2015 from World Bank. Call this DataFrame "GDP"

In [42]:
GDP = pd.read_csv('API_NY.GDP.MKTP.CD_DS2_en_csv_v2_19294.csv', skiprows = 4)
GDP

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,Unnamed: 69
0,Aruba,ABW,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,2.983635e+09,3.092429e+09,3.276184e+09,3.395799e+09,2.481857e+09,2.929447e+09,3.279344e+09,3.648573e+09,,
1,Africa Eastern and Southern,AFE,GDP (current US$),NY.GDP.MKTP.CD,2.421063e+10,2.496398e+10,2.707880e+10,3.177575e+10,3.028579e+10,3.381317e+10,...,8.289428e+11,9.729989e+11,1.012306e+12,1.009721e+12,9.333918e+11,1.085745e+12,1.191423e+12,1.245472e+12,,
2,Afghanistan,AFG,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,1.811657e+10,1.875346e+10,1.805322e+10,1.879944e+10,1.995593e+10,1.426000e+10,1.449724e+10,1.723305e+10,,
3,Africa Western and Central,AFW,GDP (current US$),NY.GDP.MKTP.CD,1.190495e+10,1.270788e+10,1.363076e+10,1.446909e+10,1.580376e+10,1.692109e+10,...,6.943610e+11,6.878492e+11,7.704950e+11,8.264838e+11,7.898017e+11,8.493124e+11,8.839739e+11,7.991060e+11,,
4,Angola,AGO,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,5.276162e+10,7.369015e+10,7.945069e+10,7.089796e+10,4.850156e+10,6.650513e+10,1.043997e+11,8.482465e+10,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,XKX,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,6.682677e+09,7.180765e+09,7.878760e+09,7.899738e+09,7.717145e+09,9.413404e+09,9.354903e+09,1.046822e+10,,
262,"Yemen, Rep.",YEM,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,3.131782e+10,2.684223e+10,2.160616e+10,,,,,,,
263,South Africa,ZAF,GDP (current US$),NY.GDP.MKTP.CD,8.748597e+09,9.225996e+09,9.813996e+09,1.085420e+10,1.195600e+10,1.306899e+10,...,3.235855e+11,3.814488e+11,4.052607e+11,3.893300e+11,3.379747e+11,4.208869e+11,4.069200e+11,3.806993e+11,,
264,Zambia,ZMB,GDP (current US$),NY.GDP.MKTP.CD,6.987397e+08,6.823597e+08,6.792797e+08,7.043397e+08,8.226397e+08,1.061200e+09,...,2.095841e+10,2.587360e+10,2.631151e+10,2.330867e+10,1.813776e+10,2.209642e+10,2.916378e+10,2.757796e+10,,


In [None]:
# GDP_copy = GDP.copy()

I want to drop last column

In [54]:
GDP.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 70 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country Name    266 non-null    object 
 1   Country Code    266 non-null    object 
 2   Indicator Name  266 non-null    object 
 3   Indicator Code  266 non-null    object 
 4   1960            151 non-null    float64
 5   1961            154 non-null    float64
 6   1962            157 non-null    float64
 7   1963            157 non-null    float64
 8   1964            157 non-null    float64
 9   1965            163 non-null    float64
 10  1966            164 non-null    float64
 11  1967            167 non-null    float64
 12  1968            168 non-null    float64
 13  1969            168 non-null    float64
 14  1970            190 non-null    float64
 15  1971            191 non-null    float64
 16  1972            191 non-null    float64
 17  1973            191 non-null    flo

- Make sure to skip the header, and rename the following list of countries:
    - `Korea, Rep.`: `South Korea`,
    - `Iran, Islamic Rep.`: `Iran`,
    - `Hong Kong SAR, China`: `Hong Kong`

In [43]:
countries_mappinggdp = {    
    'Korea, Rep.': 'South Korea',
     'Iran, Islamic Rep.': 'Iran',
     'Hong Kong SAR, China': 'Hong Kong'}

In [46]:
GDP[GDP['Country Name'] == 'Hong Kong']

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,Unnamed: 69
96,Hong Kong,HKG,GDP (current US$),NY.GDP.MKTP.CD,1320797000.0,1383682000.0,1612346000.0,1935298000.0,2206466000.0,2435079000.0,...,320860300000.0,341273300000.0,361731100000.0,363074500000.0,344943100000.0,368954200000.0,358681100000.0,380812200000.0,,


In [None]:
GDP[GDP['Country Name'] == 'Korea, Rep.']

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,Unnamed: 69
126,"Korea, Rep.",KOR,GDP (current US$),NY.GDP.MKTP.CD,3958812000.0,2417629000.0,2814615000.0,3988462000.0,3459020000.0,3120861000.0,...,1499680000000.0,1623074000000.0,1725373000000.0,1651423000000.0,1644313000000.0,1818432000000.0,1673917000000.0,1712793000000.0,,


In [45]:
GDP['Country Name'] = GDP['Country Name'].replace(countries_mappinggdp)

In [60]:
GDP[GDP['Country Name'] == 'Korea, Rep.']

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,Unnamed: 69


In [61]:
GDP[GDP['Country Name'] == 'South Korea']

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,Unnamed: 69
126,South Korea,KOR,GDP (current US$),NY.GDP.MKTP.CD,3958812000.0,2417629000.0,2814615000.0,3988462000.0,3459020000.0,3120861000.0,...,1499680000000.0,1623074000000.0,1725373000000.0,1651423000000.0,1644313000000.0,1818432000000.0,1673917000000.0,1712793000000.0,,


- Finally, load the Scimago Journal and Country Rank data for [Energy Engineering and Power Technology](http://www.scimagojr.com/countryrank.php?category=2102). It ranks countries based on their journal contributions in the aforementioned area. Call this DataFrame "ScimEn".


In [47]:
# taken from the repository
ScimEn = pd.read_excel('scimagojr.xlsx')
ScimEn.head()

Unnamed: 0,Rank,Country,Region,Documents,Citable documents,Citations,Self-citations,Citations per document,H index
0,1,China,Asiatic Region,273437,272374,2336764,1615239,8.55,245
1,2,United States,Northern America,175891,172431,2230544,724472,12.68,363
2,3,India,Asiatic Region,55082,53775,463165,162944,8.41,181
3,4,Japan,Asiatic Region,50523,50065,488062,119930,9.66,193
4,5,United Kingdom,Western Europe,43389,42284,615670,111290,14.19,226


In [None]:
# ScimEn_copy = ScimEn.copy()

- Join the three datasets: Energy, GDP, and ScimEn into a new dataset (using the intersection of country names). Use only the 10 years (2006-2015) of GDP data and only the top 15 countries by Scimagojr 'Rank' (Rank 1 through 15).

In [48]:
# Keep 'Country Name' plus the year columns 2006-2015; every other column
# (other years, codes, indicator fields, unnamed columns) is discarded.
kept_columns = [
    name for name in GDP.columns
    if name == 'Country Name' or (name.isdigit() and 2006 <= int(name) <= 2015)
]
GDP = GDP[kept_columns]

GDP.columns

Index(['Country Name', '2006', '2007', '2008', '2009', '2010', '2011', '2012',
       '2013', '2014', '2015'],
      dtype='object')

In [None]:
# GDP_0615 = GDP.copy()

In [None]:
# GDP_noUnnamed = GDP.copy()

In [49]:
ScimEn.shape

(208, 9)

In [50]:
ScimEn = ScimEn.iloc[:15]
ScimEn.shape

(15, 9)

In [51]:
energy.shape, GDP.shape

((227, 4), (266, 11))

- The index of this DataFrame should be the name of the country, and the columns should be<br>
`['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations', 'Citations per document', 'H index', 'Energy Supply', 'Energy Supply per Capita', '% Renewable', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']`

Function "answer_one" should return the resulting DataFrame (20 columns and 15 entries)

In [54]:
ScimEn = ScimEn.drop(columns = 'Region')
ScimEn.columns

Index(['Rank', 'Country', 'Documents', 'Citable documents', 'Citations',
       'Self-citations', 'Citations per document', 'H index'],
      dtype='object')

In [55]:
energy.columns

Index(['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable'], dtype='object')

In [56]:
sci_en = pd.merge(ScimEn, energy, on = 'Country')
sci_en['Country'].isna().sum()

0

In [57]:
sci_en.shape

(15, 11)

In [58]:
sci_en.head()

Unnamed: 0,Rank,Country,Documents,Citable documents,Citations,Self-citations,Citations per document,H index,Energy Supply,Energy Supply per Capita,% Renewable
0,1,China,273437,272374,2336764,1615239,8.55,245,127191000000.0,93.0,19.75491
1,2,United States,175891,172431,2230544,724472,12.68,363,90838000000.0,286.0,11.57098
2,3,India,55082,53775,463165,162944,8.41,181,33195000000.0,26.0,14.96908
3,4,Japan,50523,50065,488062,119930,9.66,193,18984000000.0,149.0,10.23282
4,5,United Kingdom,43389,42284,615670,111290,14.19,226,7920000000.0,124.0,10.60047


In [61]:
GDP = GDP.rename(columns={'Country Name': 'Country'})
GDP.columns

Index(['Country', '2006', '2007', '2008', '2009', '2010', '2011', '2012',
       '2013', '2014', '2015'],
      dtype='object')

In [62]:
full_merge = pd.merge(sci_en, GDP, on = 'Country')
full_merge.shape

(15, 21)

In [63]:
full_merge.columns

Index(['Rank', 'Country', 'Documents', 'Citable documents', 'Citations',
       'Self-citations', 'Citations per document', 'H index', 'Energy Supply',
       'Energy Supply per Capita', '% Renewable', '2006', '2007', '2008',
       '2009', '2010', '2011', '2012', '2013', '2014', '2015'],
      dtype='object')

In [None]:
# full_merge_copy = full_merge.copy()

In [66]:
full_merge = full_merge.set_index(['Country'])
full_merge

Unnamed: 0_level_0,Rank,Documents,Citable documents,Citations,Self-citations,Citations per document,H index,Energy Supply,Energy Supply per Capita,% Renewable,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
China,1,273437,272374,2336764,1615239,8.55,245,127191000000.0,93.0,19.75491,2752119000000.0,3550328000000.0,4594337000000.0,5101691000000.0,6087192000000.0,7551546000000.0,8532185000000.0,9570471000000.0,10475620000000.0,11061570000000.0
United States,2,175891,172431,2230544,724472,12.68,363,90838000000.0,286.0,11.57098,13815580000000.0,14474230000000.0,14769860000000.0,14478070000000.0,15048970000000.0,15599730000000.0,16253970000000.0,16880680000000.0,17608140000000.0,18295020000000.0
India,3,55082,53775,463165,162944,8.41,181,33195000000.0,26.0,14.96908,940259900000.0,1216736000000.0,1198895000000.0,1341888000000.0,1675616000000.0,1823052000000.0,1827638000000.0,1856722000000.0,2039126000000.0,2103588000000.0
Japan,4,50523,50065,488062,119930,9.66,193,18984000000.0,149.0,10.23282,4601663000000.0,4579751000000.0,5106679000000.0,5289493000000.0,5759072000000.0,6233147000000.0,6272363000000.0,5212328000000.0,4896994000000.0,4444931000000.0
United Kingdom,5,43389,42284,615670,111290,14.19,226,7920000000.0,124.0,10.60047,2708442000000.0,3090510000000.0,2929412000000.0,2412840000000.0,2485483000000.0,2663806000000.0,2707090000000.0,2784854000000.0,3064708000000.0,2927911000000.0
Germany,6,38739,38013,433148,95145,11.18,196,13261000000.0,165.0,17.90153,3046309000000.0,3484057000000.0,3808786000000.0,3479801000000.0,3468154000000.0,3824829000000.0,3597897000000.0,3808086000000.0,3965801000000.0,3423568000000.0
Russian Federation,7,36735,36560,115938,54993,3.16,90,30709000000.0,214.0,17.28868,989932100000.0,1299703000000.0,1660848000000.0,1222646000000.0,1524917000000.0,2045923000000.0,2208294000000.0,2292470000000.0,2059242000000.0,1363482000000.0
Canada,8,33472,32863,568080,100953,16.97,227,10431000000.0,296.0,61.94543,1319265000000.0,1468820000000.0,1552990000000.0,1374625000000.0,1617343000000.0,1793327000000.0,1828366000000.0,1846597000000.0,1805750000000.0,1556509000000.0
Italy,9,27983,26940,352993,87828,12.61,166,6530000000.0,109.0,33.66723,1958564000000.0,2222524000000.0,2417508000000.0,2209484000000.0,2144936000000.0,2306974000000.0,2097929000000.0,2153226000000.0,2173256000000.0,1845428000000.0
South Korea,10,27655,27445,328488,61531,11.88,155,11007000000.0,221.0,2.279353,1053217000000.0,1172614000000.0,1047339000000.0,943941900000.0,1143672000000.0,1253290000000.0,1278047000000.0,1370633000000.0,1484489000000.0,1466039000000.0


In [67]:
full_merge.shape

(15, 20)

### Function

In [64]:
def answer_one():
    """Build the Question 1 table: top-ranked Scimago countries joined with energy and GDP data.

    Reads 'Energy Indicators.xls', 'API_NY.GDP.MKTP.CD_DS2_en_csv_v2_19294.csv'
    and 'scimagojr.xlsx' from the working directory, cleans each source and
    inner-joins them on the country name.

    Returns:
        pandas.DataFrame indexed by 'Country'. Columns are the Scimago columns
        (without 'Region'), then 'Energy Supply' (in gigajoules),
        'Energy Supply per Capita' and '% Renewable', then the GDP columns
        '2006' .. '2015'. Because the joins are inner joins, a country is
        present only if its (cleaned) name occurs in all three sources.
    """
    # --- Energy indicators -------------------------------------------------
    # Skip the 17 leading and 38 trailing rows of the sheet (header and footer).
    energy = pd.read_excel('Energy Indicators.xls', skiprows=17, skipfooter=38)

    # Drop the first two columns. The explicit copy makes `energy` an
    # independent frame, so the column assignments below do not operate on a
    # slice of the frame returned by read_excel (SettingWithCopyWarning).
    energy = energy.iloc[:, 2:].copy()
    energy.columns = ['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']

    # Missing data ('...') only occurs in the numeric columns, so coercing
    # them to numbers turns every such placeholder into NaN.
    numeric_columns = list(energy.columns[1:])
    energy[numeric_columns] = energy[numeric_columns].apply(pd.to_numeric, errors='coerce')

    # Petajoules -> gigajoules (1 PJ = 1,000,000 GJ).
    energy['Energy Supply'] = energy['Energy Supply'] * 1_000_000

    # Strip digits (e.g. 'Switzerland17') and a trailing parenthesised
    # qualifier (e.g. 'Bolivia (Plurinational State of)') from country names.
    energy['Country'] = (
        energy['Country']
        .str.replace(r'\d', '', regex=True)
        .str.replace(' [(].*', '', regex=True)
    )

    # Exact-match renames so the names line up with the other two sources.
    countries_mapping = {
        'Republic of Korea': 'South Korea',
        'United States of America': 'United States',
        'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
        'China, Hong Kong Special Administrative Region': 'Hong Kong',
    }
    energy['Country'] = energy['Country'].replace(countries_mapping)

    # --- GDP ---------------------------------------------------------------
    # The first 4 lines of the CSV are skipped before the header row.
    gdp = pd.read_csv('API_NY.GDP.MKTP.CD_DS2_en_csv_v2_19294.csv', skiprows=4)

    countries_mapping_gdp = {
        'Korea, Rep.': 'South Korea',
        'Iran, Islamic Rep.': 'Iran',
        'Hong Kong SAR, China': 'Hong Kong',
    }
    gdp['Country Name'] = gdp['Country Name'].replace(countries_mapping_gdp)

    # Keep only the country name and the ten years 2006-2015.
    gdp_years = [str(year) for year in range(2006, 2016)]
    gdp = gdp[['Country Name'] + gdp_years].rename(columns={'Country Name': 'Country'})

    # --- Scimago ranking ---------------------------------------------------
    scimago = pd.read_excel('scimagojr.xlsx')

    # Select ranks 1-15 by the 'Rank' value itself instead of by row position,
    # so the result does not depend on the file being sorted by rank.
    top_ranked = scimago[scimago['Rank'].between(1, 15)]

    # --- Join on country name (intersection of the three sources) ----------
    merged = top_ranked.merge(energy, on='Country').merge(gdp, on='Country')

    # 'Region' is not part of the expected column list.
    merged = merged.drop(columns='Region')

    return merged.set_index('Country')

In [66]:
df = answer_one()
print('Shape of the result DataFrame: ',df.shape)
df.head()

Shape of the result DataFrame:  (15, 20)


Unnamed: 0_level_0,Rank,Documents,Citable documents,Citations,Self-citations,Citations per document,H index,Energy Supply,Energy Supply per Capita,% Renewable,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
China,1,273437,272374,2336764,1615239,8.55,245,127191000000.0,93.0,19.75491,2752119000000.0,3550328000000.0,4594337000000.0,5101691000000.0,6087192000000.0,7551546000000.0,8532185000000.0,9570471000000.0,10475620000000.0,11061570000000.0
United States,2,175891,172431,2230544,724472,12.68,363,90838000000.0,286.0,11.57098,13815580000000.0,14474230000000.0,14769860000000.0,14478070000000.0,15048970000000.0,15599730000000.0,16253970000000.0,16880680000000.0,17608140000000.0,18295020000000.0
India,3,55082,53775,463165,162944,8.41,181,33195000000.0,26.0,14.96908,940259900000.0,1216736000000.0,1198895000000.0,1341888000000.0,1675616000000.0,1823052000000.0,1827638000000.0,1856722000000.0,2039126000000.0,2103588000000.0
Japan,4,50523,50065,488062,119930,9.66,193,18984000000.0,149.0,10.23282,4601663000000.0,4579751000000.0,5106679000000.0,5289493000000.0,5759072000000.0,6233147000000.0,6272363000000.0,5212328000000.0,4896994000000.0,4444931000000.0
United Kingdom,5,43389,42284,615670,111290,14.19,226,7920000000.0,124.0,10.60047,2708442000000.0,3090510000000.0,2929412000000.0,2412840000000.0,2485483000000.0,2663806000000.0,2707090000000.0,2784854000000.0,3064708000000.0,2927911000000.0


### Question 2
What is the average GDP over the last 10 years for each country? (exclude missing values from this calculation.)

*This function should return a Series named `avgGDP` with 15 countries and their average GDP sorted in descending order.*

In [None]:
def answer_two():
    """Average GDP of each country over the years 2006-2015.

    Returns:
        pandas.Series named 'avgGDP', indexed by country and sorted in
        descending order. Missing yearly values are excluded from each
        country's average (mean() skips NaN by default).
    """
    Top15 = answer_one()

    # Name the ten GDP columns explicitly: an open-ended slice ('2006':) would
    # also average any column that happens to follow '2015'.
    gdp_years = [str(year) for year in range(2006, 2016)]

    avgGDP = (
        Top15[gdp_years]
        .mean(axis=1)
        .sort_values(ascending=False)
        .rename('avgGDP')
    )
    return avgGDP

In [91]:
answer_two()

Country
United States         1.572243e+13
China                 6.927707e+12
Japan                 5.239642e+12
Germany               3.590729e+12
United Kingdom        2.777505e+12
France                2.692000e+12
Italy                 2.152983e+12
Brazil                1.988889e+12
Russian Federation    1.666746e+12
Canada                1.616359e+12
India                 1.602352e+12
Spain                 1.406644e+12
South Korea           1.221328e+12
Australia             1.207997e+12
Iran                  4.567516e+11
dtype: float64

### Question 3
By how much had the GDP changed over the 10 year span for the country with the 6th largest average GDP?

*This function should return a single number.*

In [102]:
def answer_three():
    """GDP change between 2006 and 2015 for the country with the 6th largest average GDP.

    Returns:
        A single number: that country's '2015' value minus its '2006' value.
    """
    Top15 = answer_one()

    # Look the country up once: every answer_two() call rebuilds the whole
    # table via answer_one(), so calling it per operand repeats that work.
    sixth_country = answer_two().index[5]

    return Top15.loc[sixth_country, '2015'] - Top15.loc[sixth_country, '2006']

In [103]:
answer_three()

124621907951.68018

### Question 4

Create a new column that is the ratio of Self-Citations to Total Citations. 
What is the maximum value for this new column, and what country has the highest ratio?

*This function should return a tuple with the name of the country and the ratio.*

In [None]:
def answer_four():
    """Find the country with the highest share of self-citations.

    Returns:
        A tuple (country, ratio), where ratio is 'Self-citations' divided by
        'Citations' and country is the index label with the largest ratio.
    """
    table = answer_one()

    # Ratio per country, largest first.
    ranked = table['Self-citations'].div(table['Citations']).sort_values(ascending=False)

    top_country = ranked.index[0]
    top_ratio = ranked.iloc[0]
    return (top_country, top_ratio)


In [140]:
answer_four()


('China', 0.6912289816173135)

### Question 5

Create a column that estimates the population using Energy Supply and Energy Supply per capita. 
What is the third most populous country according to this estimate?

*This function should return a single string value.*

In [145]:
def answer_five():
    """Name the third most populous country by estimated population.

    Population is estimated as 'Energy Supply' divided by
    'Energy Supply per Capita'.

    Returns:
        The index label (country name) in third place when the estimates are
        sorted in descending order.
    """
    table = answer_one()
    estimated = table['Energy Supply'].div(table['Energy Supply per Capita'])
    by_size = estimated.sort_values(ascending=False)
    return by_size.index[2]

In [146]:
answer_five()

'United States'

### Question 6
Create a column that estimates the number of citable documents per person. 
What is the correlation between the number of citable documents per capita and the energy supply per capita? Use the `.corr()` method, (Pearson's correlation).

*This function should return a single number.*


In [169]:
def answer_six():
    """Correlate citable documents per person with energy supply per capita.

    The population is estimated as 'Energy Supply' / 'Energy Supply per Capita';
    'Citable documents' divided by that estimate is stored in a new column
    'Citable Documents per Capita'.

    Returns:
        A single number: the correlation between 'Citable Documents per Capita'
        and 'Energy Supply per Capita' from DataFrame.corr() (Pearson is the
        default method).
    """
    table = answer_one()

    people = table['Energy Supply'].div(table['Energy Supply per Capita'])
    table['Citable Documents per Capita'] = table['Citable documents'].div(people)

    pair = table.loc[:, ['Citable Documents per Capita', 'Energy Supply per Capita']]
    # Off-diagonal entry of the 2x2 correlation matrix.
    return pair.corr().loc['Citable Documents per Capita', 'Energy Supply per Capita']


In [170]:
answer_six()

0.7434709127726777

### Question 7
Use the following dictionary to group the countries by continent, then create a DataFrame that displays the sample size (the number of countries in each continent bin) and the sum, mean, and standard deviation of the estimated population of the countries in each continent.

```python
ContinentDict  = {'China':'Asia', 
                  'United States':'North America', 
                  'Japan':'Asia', 
                  'United Kingdom':'Europe', 
                  'Russian Federation':'Europe', 
                  'Canada':'North America', 
                  'Germany':'Europe', 
                  'India':'Asia',
                  'France':'Europe', 
                  'South Korea':'Asia', 
                  'Italy':'Europe', 
                  'Spain':'Europe', 
                  'Iran':'Asia',
                  'Australia':'Australia', 
                  'Brazil':'South America'}
```

*This function should return a DataFrame with index named Continent `['Asia', 'Australia', 'Europe', 'North America', 'South America']` and columns `['size', 'sum', 'mean', 'std']`*

In [None]:
def answer_seven():
    """Summarise the estimated population per continent.

    Population is estimated as 'Energy Supply' / 'Energy Supply per Capita',
    and every country is assigned a continent through a fixed lookup table.

    Returns:
        pandas.DataFrame indexed by 'Continent' with the columns
        ['size', 'sum', 'mean', 'std'] computed over the estimated population
        of the countries in each continent. A missing 'std' is reported as 0.
    """
    continent_of = {
        'China': 'Asia',
        'United States': 'North America',
        'Japan': 'Asia',
        'United Kingdom': 'Europe',
        'Russian Federation': 'Europe',
        'Canada': 'North America',
        'Germany': 'Europe',
        'India': 'Asia',
        'France': 'Europe',
        'South Korea': 'Asia',
        'Italy': 'Europe',
        'Spain': 'Europe',
        'Iran': 'Asia',
        'Australia': 'Australia',
        'Brazil': 'South America',
    }

    table = answer_one()
    table['Population'] = table['Energy Supply'].div(table['Energy Supply per Capita'])

    # Translate each country name in the index into its continent.
    table['Continent'] = table.index.to_series().replace(continent_of)

    continent_population = (
        table.groupby('Continent')['Population'].agg(['size', 'sum', 'mean', 'std'])
    )
    # The std of a single-country continent comes back as NaN; show it as 0.
    continent_population['std'] = continent_population['std'].fillna(0)

    return continent_population

In [217]:
answer_seven()

Unnamed: 0_level_0,size,sum,mean,std
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Asia,5,2898666000.0,579733300.0,679097900.0
Australia,1,23316020.0,23316020.0,0.0
Europe,6,457929700.0,76321610.0,34647670.0
North America,2,352855200.0,176427600.0,199669600.0
South America,1,205915300.0,205915300.0,0.0
