In [1]:
# Load Modules
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import ttest_ind
from scipy.stats import f_oneway
from math import pi



# Bokeh Library imports
import holoviews as hv
from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.palettes import d3
from bokeh.transform import factor_cmap
from bokeh.transform import jitter
from bokeh.layouts import column, row
from bokeh.models import CustomJS, Slider

# Configure Bokeh so that show() renders plots inline in the notebook output
output_notebook()

In [2]:
# jupyter nbconvert Analysis.ipynb --to html --no-input

## Guided Question 1 

In [3]:
# Read the cleaned gapminder CSV and preview its leading rows
DATA_PATH = 'gapminder_clean.csv'
clean_data = pd.read_csv(DATA_PATH)
clean_data.head(n=11)

Unnamed: 0.1,Unnamed: 0,Country Name,Year,"Agriculture, value added (% of GDP)",CO2 emissions (metric tons per capita),Domestic credit provided by financial sector (% of GDP),Electric power consumption (kWh per capita),Energy use (kg of oil equivalent per capita),Exports of goods and services (% of GDP),"Fertility rate, total (births per woman)",GDP growth (annual %),Imports of goods and services (% of GDP),"Industry, value added (% of GDP)","Inflation, GDP deflator (annual %)","Life expectancy at birth, total (years)",Population density (people per sq. km of land area),"Services, etc., value added (% of GDP)",pop,continent,gdpPercap
0,0,Afghanistan,1962,,0.073781,21.276422,,,4.878051,7.45,,9.349593,,,33.219902,14.312061,,10267083.0,Asia,853.10071
1,1,Afghanistan,1967,,0.123782,9.917662,,,6.772908,7.45,,14.209827,,,35.389415,15.881812,,11537966.0,Asia,836.197138
2,2,Afghanistan,1972,,0.13082,18.880833,,,14.763231,7.45,,18.10585,,,37.610146,17.947027,,13079460.0,Asia,739.981106
3,3,Afghanistan,1977,,0.183118,13.836822,,,11.662904,7.449,,14.823175,,,40.110146,19.998926,,14880372.0,Asia,786.11336
4,4,Afghanistan,1982,,0.165879,,,,,7.45,,,,,43.230732,19.402324,,12881816.0,Asia,978.011439
5,5,Afghanistan,1987,,0.27556,,,,,7.461,,,,,47.296341,17.366559,,13867957.0,Asia,852.395945
6,6,Afghanistan,1992,,0.101375,,,,,7.502,,,,,51.362927,21.054483,,16317921.0,Asia,649.341395
7,7,Afghanistan,1997,,0.060798,,,,,7.636,,,,,54.017829,27.623273,,22227415.0,Asia,635.341351
8,8,Afghanistan,2002,38.47194,0.041129,,,,32.386719,7.273,,65.287704,23.714097,,55.857195,32.912231,37.813963,25268405.0,Asia,726.734055
9,9,Afghanistan,2007,30.622854,0.087858,0.535181,,,17.823714,6.437,13.740205,58.350047,27.344703,22.382016,57.833829,39.637202,42.032443,31889923.0,Asia,974.580338


## Guided Question 2

In [4]:
# Restrict the dataset to the 1962 observations
data_1962 = clean_data.loc[clean_data['Year'] == 1962]

# Columns exposed to the scatter glyph and to the hover tooltips
source = ColumnDataSource(
    data=dict(
        emissions=data_1962['CO2 emissions (metric tons per capita)'],
        GDPpercap=data_1962['gdpPercap'],
        labels=data_1962['Country Name'],
    )
)

# Log-log figure with fixed axis ranges; title and axis labels are passed
# straight to the constructor instead of being assigned afterwards
plot = figure(
    plot_width=500,
    plot_height=500,
    x_axis_type='log',
    y_axis_type='log',
    x_range=[0.01, 70],
    y_range=[100, 10 ** 5.3],
    title='The Correlation Between CO2 Emissions and GDP per Capita in 1962',
    x_axis_label='Log(CO2 emissions (metric tons per capita))',
    y_axis_label='Log(GDP per Capita)',
)
plot.xgrid.grid_line_alpha = 1
plot.ygrid.grid_line_alpha = 1

# One marker per country
plot.scatter('emissions', 'GDPpercap', size=4, source=source)

# Tooltip showing the country and both plotted values
tooltips = [
    ("country", "@labels"),
    ("CO2 Emissions", "@emissions{0.00}"),
    ("GDP per Capita", "@GDPpercap{00.00}"),
]
hover = HoverTool(tooltips=tooltips)
plot.add_tools(hover)

show(plot)

## Guided Question 3

In [5]:
# Keep only the 1962 rows that have both an emissions value and a GDP per capita value
df = data_1962.dropna(subset=['CO2 emissions (metric tons per capita)', 'gdpPercap'])

# Pearson product-moment correlation (a linear measure, not a rank-based one)
correlation, Pvalue = pearsonr(
    df['CO2 emissions (metric tons per capita)'],
    df['gdpPercap'],
)
message = "Pearson's r: {corr} (p value: {p})".format(corr = correlation, p = Pvalue)
print(message)

Pearson's r: 0.9260816725019472 (p value: 1.1286792210038754e-46)


## Guided Question 4

In [6]:
# Keep only rows where both CO2 emissions and GDP per capita are present
df = clean_data.dropna(subset=['CO2 emissions (metric tons per capita)', 'gdpPercap'])

# Pearson correlation per year. The year label is taken from the groupby key
# itself, so each result is guaranteed to be paired with the year whose rows
# produced it (no reliance on the row order of the unfiltered dataset).
stat = {
    year: tuple(pearsonr(group['CO2 emissions (metric tons per capita)'], group['gdpPercap']))
    for year, group in df.groupby('Year')
}

# One row per year, strongest correlation first
df = pd.DataFrame(stat, index = ['Correlation', 'p-value']).transpose()
df = df.sort_values(by = 'Correlation', ascending = False)
df

Unnamed: 0,Correlation,p-value
1967,0.938792,3.397143e-53
1962,0.926082,1.128679e-46
1972,0.842899,1.824292e-32
1982,0.816638,5.565916e-29
1987,0.809553,3.899627e-28
1992,0.809432,1.610614e-29
1997,0.80814,7.976156e-30
2002,0.800642,3.8635640000000003e-29
1977,0.792834,2.838892e-26
2007,0.720417,9.232747e-22


## Guided Question 5

In [7]:
# Select 1967 and drop countries with NaN values for emissions or GDP per capita
data_1967 = clean_data[clean_data['Year'] == 1967]
data_1967 = data_1967[data_1967['CO2 emissions (metric tons per capita)'].notnull() & data_1967['gdpPercap'].notnull()]

# Initialise plot (both axes on a log scale)
plot = figure(plot_width = 750, plot_height = 750, y_axis_type='log', x_axis_type='log')
plot.xaxis.axis_label = 'CO2 emissions (metric tons per capita)'
plot.yaxis.axis_label = 'GDP per Capita'
plot.title = 'The Correlation between CO2 emissions and GDP per Capita in 1967'


# Column data source; marker size is each country's population relative to
# the largest population in the 1967 subset, scaled to a maximum of 500
source = ColumnDataSource(data = {'emissions' : data_1967['CO2 emissions (metric tons per capita)'],
                                  'GDPperCap' : data_1967['gdpPercap'],
                                  'countries' : data_1967['Country Name'],
                                  'continent' : data_1967['continent'],
                                  'relative size' : (data_1967['pop']/data_1967['pop'].max())*500
                                 }
                         )


# Hover tool. The number formats use the digit 0; the previous letter-O
# pattern ("OO.OO") is not a valid number format.
hover = HoverTool(
        tooltips=[
            ("Country", "@countries"),
            ("CO2 Emissions", "@emissions{0.00}"),
            ("GDP per Capita", "@GDPperCap{0.00}")
        ]
    )


# Plot: fill colour by continent, marker size by relative population
plot.scatter(x = 'emissions',
             y = 'GDPperCap',
             source = source,
             fill_color = factor_cmap('continent',
                                      factors = data_1967['continent'].unique().tolist(),
                                      palette = d3['Category10'][5]),
             line_color = 'black',
             line_width = 0.75,
             line_alpha = 1,
             size = 'relative size',
             legend_group = 'continent',
             alpha = 0.5
            )

plot.add_tools(hover)
plot.legend.location = 'top_left'
plot.legend.title = 'Continent'
plot.legend.title_text_font_style = "bold"
plot.legend.title_text_font_size = "20px"
plot.legend.label_text_font_size = "16px"


show(plot)

# Unguided Question 1
Energy use by continent, measured as kg of oil equivalent per capita. Insufficient data was available prior to 1972.

In [18]:
# Keep rows that have both an energy-use value and a continent, and exclude
# 1962 and 1967. dropna replaces the earlier chained
# `energy_use['continent'].fillna(..., inplace=True)`, which mutates a column
# selection and is not reliable under pandas Copy-on-Write.
energy_use = clean_data.dropna(subset=['Energy use (kg of oil equivalent per capita)', 'continent'])
energy_use = energy_use[~energy_use['Year'].isin([1962, 1967])]

# Initialise plot with one categorical row per continent
continents = energy_use['continent'].unique().tolist()

plot = figure(plot_width=900, plot_height=600, y_range=continents)
plot.xaxis.axis_label = 'Energy use (kg of oil equivalent per capita)'
plot.yaxis.axis_label = 'Continent'

# The continent -> colour mapping is identical for every year, so build it once
continent_colors = factor_cmap('continent', factors = continents, palette = d3['Category10'][6])

# Iteratively add one glyph renderer per year so that each year gets its own
# legend entry and can be toggled from the legend
years = sorted(energy_use['Year'].unique())
for year in years:
    year_data = energy_use[energy_use['Year'] == year]

    source = ColumnDataSource(data = {'energy' : year_data['Energy use (kg of oil equivalent per capita)'],
                                      'countries' : year_data['Country Name'],
                                      'continent' : year_data['continent'],
                                      'year' : year_data['Year']
                                     }
                             )

    plot.circle(x = 'energy',
                # jitter spreads the points vertically within each continent row
                y = jitter('continent', width = 0.5, range = plot.y_range),
                source = source,
                fill_color = continent_colors,
                legend_label = str(year),
                line_color = 'black',
                radius = 100
               )


# Hover tool for data points
hover = HoverTool(
        tooltips=[
            ("Country", "@countries"),
            ("Year", '@year'),
            ("Energy use", "@energy{00.00}")
        ]
    )

plot.add_tools(hover)
plot.legend.title = 'Year'
plot.legend.click_policy = "hide"
plot.legend.title_text_font_style = "bold"
show(plot)

One-way ANOVA was used to compare the average energy use of each continent from 1972 to 2007. The results for each year show there was always a significant difference in energy use between continents (see table below).

In [19]:
# Series indexed by (continent, Year); each entry is the array of per-country
# energy-use values for that continent in that year
grouped_energy = energy_use.groupby(['continent', 'Year'])['Energy use (kg of oil equivalent per capita)']
EU_series = grouped_energy.apply(np.asarray)

# One-way ANOVA across continents for every year: take the cross-section for
# the year and pass one array per continent to f_oneway
stat = {
    year: f_oneway(*EU_series.xs(year, level = 'Year'))
    for year in energy_use['Year'].unique()
}

# Years as rows, test statistic and p-value as columns
df = pd.DataFrame(stat, index = ['F-statistic', 'P-value'])
df.T

Unnamed: 0,F-statistic,P-value
1972,7.366889,3.6e-05
1977,8.455679,8e-06
1982,7.551775,2.7e-05
1987,8.495935,7e-06
1992,7.897726,1.5e-05
1997,7.05732,4.9e-05
2002,6.637304,9e-05
2007,6.328448,0.000131
