In [2]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

In [3]:
# Bokeh Library imports
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool

output_notebook()

In [4]:
# open gapminder dataset
clean_data = pd.read_csv('gapminder_clean.csv')
clean_data.set_index('Unnamed: 0')

Unnamed: 0_level_0,Country Name,Year,"Agriculture, value added (% of GDP)",CO2 emissions (metric tons per capita),Domestic credit provided by financial sector (% of GDP),Electric power consumption (kWh per capita),Energy use (kg of oil equivalent per capita),Exports of goods and services (% of GDP),"Fertility rate, total (births per woman)",GDP growth (annual %),Imports of goods and services (% of GDP),"Industry, value added (% of GDP)","Inflation, GDP deflator (annual %)","Life expectancy at birth, total (years)",Population density (people per sq. km of land area),"Services, etc., value added (% of GDP)",pop,continent,gdpPercap
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,Afghanistan,1962,,0.073781,21.276422,,,4.878051,7.450,,9.349593,,,33.219902,14.312061,,10267083.0,Asia,853.100710
1,Afghanistan,1967,,0.123782,9.917662,,,6.772908,7.450,,14.209827,,,35.389415,15.881812,,11537966.0,Asia,836.197138
2,Afghanistan,1972,,0.130820,18.880833,,,14.763231,7.450,,18.105850,,,37.610146,17.947027,,13079460.0,Asia,739.981106
3,Afghanistan,1977,,0.183118,13.836822,,,11.662904,7.449,,14.823175,,,40.110146,19.998926,,14880372.0,Asia,786.113360
4,Afghanistan,1982,,0.165879,,,,,7.450,,,,,43.230732,19.402324,,12881816.0,Asia,978.011439
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2602,Zimbabwe,1987,14.407528,1.598217,74.161607,878.072691,896.673611,24.015710,5.784,1.150737,21.274886,32.451242,7.189361,61.753805,24.649495,53.141236,9216418.0,Africa,706.157306
2603,Zimbabwe,1992,7.413793,1.533724,43.120518,778.695133,923.493407,27.227263,4.840,-9.015570,36.485231,40.862069,-14.129659,56.491976,28.485762,51.724135,10704340.0,Africa,693.420786
2604,Zimbabwe,1997,18.934082,1.194678,63.058320,870.988697,804.508892,37.595273,4.237,2.680594,44.609791,25.554678,-2.879048,46.065902,31.174507,55.511236,11404948.0,Africa,792.449960
2605,Zimbabwe,2002,14.029007,0.942795,164.559047,827.329873,772.676619,31.834799,4.018,-8.894023,34.972553,,2.712950,40.679146,32.807111,,11926563.0,Africa,672.038623


# Python for Data Science Training Project
## Guided Question 2 & 3
The plot below shows the correlation between CO2 emmission and GDP per capita in 1962. A Pearson's correlation was run to determine the relationship between 107 countries' CO2 emissions and GDP per capita in 1962. There was a strong positive correlation between CO2 emmissions and GDP per capita in 1962 (r = 0.926, N= 107, p < 1e-45). The values for individual countries can be viewed by hovering the mouse over a data point.

Due to anomalous values, the data for Kuwait has been ommitted.


In [39]:
# Filter data for the year 1962 and removing NaN values
filtered_data_1962 = clean_data[clean_data['Year'] == 1962]
filtered_data_1962 = filtered_data_1962.dropna(how = 'any', subset = ['CO2 emissions (metric tons per capita)', 'gdpPercap'])

# Kuwait was dropped due to an anomalous value
filtered_data_1962 = filtered_data_1962[filtered_data_1962['Country Name'] != 'Kuwait']

# Interactive Plot of CO2 emissions (metric tons per capita) and GDP per Capita

source = ColumnDataSource(data = dict(x = filtered_data_1962['CO2 emissions (metric tons per capita)'],
                                      y = filtered_data_1962['gdpPercap'],
                                      labels = filtered_data_1962['Country Name']
                                     )
                         )

hover = HoverTool(
        tooltips=[
            ("country", "@labels"),
            ("CO2 Emissions", "@x"),
            ("GDP per Capita", '@y')
        ]
    )
                               
p = figure(plot_width = 500, plot_height = 500, tools = [hover])
p.xaxis.axis_label = 'CO2 emissions (metric tons per capita)'
p.yaxis.axis_label = 'GDP per Capita'

p.scatter('x', 'y', size = 4, source=source)
show(p)

# Pearsons Rank Correlation test
correlation, Pvalue = pearsonr(x, y)

print("Pearson's r: {corr} (p value: {p})".format(corr = correlation, p = Pvalue))

Pearson's r: 0.7204168835195922 (p value: 9.232746580444968e-22)


## Guided Q4
A Pearson's correlation was run to determine the relationship between 107 countries' CO2 emissions and GDP per capita for the following years: 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002 and 2007. The table below shows the pearson coefficient and associated value for the correlation between CO2 emissions and GDP per capita each year. The strongest correlation between CO2 emissions and GDP per capita was in 1967.

In [88]:


years = [1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002, 2007]

# remove NaN values from CO2 emmissions and GDP per cap
clean_data = clean_data.dropna(how = 'any', subset = ['CO2 emissions (metric tons per capita)', 'gdpPercap'])

# Calculate pearsons' coefficient for each year available

correlation = []
pvalues = []

for year in years:
    filtered_data = clean_data[clean_data['Year'] == year]
    x = filtered_data['CO2 emissions (metric tons per capita)']
    y = filtered_data['gdpPercap']
    corr, p = pearsonr(x, y)
    correlation.append(corr)
    pvalues.append(p)

    
# Create data frame of year & pearson coefficient
correlation_data = {'Correlation' : correlation, 'Year' : years, 'p value' : pvalues}
df_correlation_data = pd.DataFrame(correlation_data, columns =['Year','Correlation', 'p value'])

df_correlation_data.set_index('Year', inplace = True)
df_correlation_data .sort_values(by = 'Correlation', ascending = False, inplace = True)
df_correlation_data

Unnamed: 0_level_0,Correlation,p value
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1967,0.938792,3.397143e-53
1962,0.926082,1.128679e-46
1972,0.842899,1.824292e-32
1982,0.816638,5.565916e-29
1987,0.809553,3.899627e-28
1992,0.809432,1.610614e-29
1997,0.80814,7.976156e-30
2002,0.800642,3.8635640000000003e-29
1977,0.792834,2.838892e-26
2007,0.720417,9.232747e-22
