In [12]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import ttest_ind
from scipy.stats import f_oneway
from math import pi



# Bokeh Library imports
import holoviews as hv
from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.palettes import d3
from bokeh.transform import factor_cmap

output_notebook()

# Python for Data Science Training Project
## Guided Question 2 & 3
The plot below shows the correlation between CO2 emissions and GDP per capita in 1962. A Pearson's correlation was run to determine the relationship between 107 countries' CO2 emissions and GDP per capita in 1962. There was a strong positive correlation between CO2 emissions and GDP per capita in 1962 (r = 0.926, N = 107, p < 1e-45). The values for an individual country can be viewed by mousing over its data point.

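Kuwait's CO2 emissions were considered an outlier and have been omitted from the plot. With Kuwait excluded, as in the code below, the Pearson test printed under the plot gives r = 0.806 (p < 1e-24).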


In [2]:
# Load the cleaned gapminder dataset. `clean_data` is reused by later cells.
clean_data = pd.read_csv('gapminder_clean.csv')

# Column names used repeatedly in this cell
co2_col = 'CO2 emissions (metric tons per capita)'
gdp_col = 'gdpPercap'

# Keep the 1962 rows that have both a CO2 and a GDP value
filtered_data_1962 = clean_data[clean_data['Year'] == 1962].dropna(how = 'any', subset = [co2_col, gdp_col])

# Kuwait was dropped due to an anomalous value
filtered_data_1962 = filtered_data_1962[filtered_data_1962['Country Name'] != 'Kuwait']

# Initialise Plot
plot = figure(plot_width = 500, plot_height = 500)
# Label both axes so the figure can be read on its own
# (x = emissions, y = GDP, matching the scatter call below).
plot.xaxis.axis_label = co2_col
plot.yaxis.axis_label = 'GDP per Capita'
plot.title = 'The Correlation Between CO2 Emissions and GDP per Capita in 1962'


# Column Data Source: one entry per country, referenced by name in the glyph and hover tool
source = ColumnDataSource(data = {'emissions' : filtered_data_1962[co2_col],
                                  'GDPpercap' : filtered_data_1962[gdp_col],
                                  'labels' : filtered_data_1962['Country Name']
                                 }
                         )

# Hover tool: show the country and both values for the point under the cursor
hover = HoverTool(
        tooltips=[
            ("country", "@labels"),
            ("CO2 Emissions", "@emissions{0.00}"),
            ("GDP per Capita", "@GDPpercap{00.00}")
        ]
    )


# Plot
plot.scatter('emissions', 'GDPpercap', size = 4, source=source)
plot.add_tools(hover)
show(plot)


# Pearson correlation test on the same (Kuwait-excluded) 1962 data that is plotted
x = filtered_data_1962[co2_col]
y = filtered_data_1962[gdp_col]
correlation, Pvalue = pearsonr(x, y)
print("Pearson's r: {corr} (p value: {p})".format(corr = correlation, p = Pvalue))


Pearson's r: 0.8063294717615215 (p value: 1.0822253072449818e-25)


## Guided Q4
A Pearson's correlation was run to determine the relationship between 107 countries' CO2 emissions and GDP per capita for the following years: 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002 and 2007. The table below shows the Pearson coefficient and associated p-value for the correlation between CO2 emissions and GDP per capita each year. The strongest correlation between CO2 emissions and GDP per capita was in 1967.

In [3]:
# Years present in the dataset (taken before the NaN filter below)
years = clean_data['Year'].unique()

# Drop rows missing either CO2 emissions or GDP per capita.
# NOTE: this overwrites `clean_data`, so every later cell that reads
# `clean_data` sees only the rows that survive this filter.
clean_data = clean_data.dropna(
    how = 'any',
    subset = ['CO2 emissions (metric tons per capita)', 'gdpPercap'],
)

# Run Pearson's test once per year: each result unpacks to (coefficient, p-value)
yearly_results = [
    pearsonr(rows['CO2 emissions (metric tons per capita)'], rows['gdpPercap'])
    for rows in (clean_data[clean_data['Year'] == year] for year in years)
]
correlation = [coefficient for coefficient, _ in yearly_results]
pvalues = [p_value for _, p_value in yearly_results]

# Table indexed by year, strongest correlation first
df_correlation_data = (
    pd.DataFrame({'Year' : years, 'Correlation' : correlation, 'p value' : pvalues})
    .set_index('Year')
    .sort_values(by = 'Correlation', ascending = False)
)
df_correlation_data

Unnamed: 0_level_0,Correlation,p value
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1967,0.938792,3.397143e-53
1962,0.926082,1.128679e-46
1972,0.842899,1.824292e-32
1982,0.816638,5.565916e-29
1987,0.809553,3.899627e-28
1992,0.809432,1.610614e-29
1997,0.80814,7.976156e-30
2002,0.800642,3.8635640000000003e-29
1977,0.792834,2.838892e-26
2007,0.720417,9.232747e-22


## Guided Q5
The plot below shows the correlation between CO2 emissions and GDP per capita in 1967. There was a strong positive correlation between CO2 emissions and GDP per capita in 1967 (r = 0.939, N = 107, p < 1e-50). The size of each data point represents the population of the country relative to the most populous country (China). The information for individual countries can be found by mousing over the data points.

Kuwait's CO2 emissions were considered an outlier and have been omitted.

In [173]:
# Filter data for the year 1967, remove NaN values and drop Kuwait (treated as an outlier)
co2_col = 'CO2 emissions (metric tons per capita)'
gdp_col = 'gdpPercap'

filtered_data_1967 = clean_data[clean_data['Year'] == 1967]
filtered_data_1967 = filtered_data_1967.dropna(how = 'any', subset = [co2_col, gdp_col])
filtered_data_1967 = filtered_data_1967[filtered_data_1967['Country Name'] != 'Kuwait']


# Initialise Plot
plot = figure(plot_width = 750, plot_height = 750)
plot.xaxis.axis_label = 'CO2 emissions (metric tons per capita)'
plot.yaxis.axis_label = 'GDP per Capita'
plot.title = 'The Correlation between CO2 emissions and GDP per Capita in 1967'


# Data Source Column
# 'relative size' scales each marker by the country's population relative to
# the largest population in the 1967 data (largest marker = 500).
source = ColumnDataSource(data = {'emissions' : filtered_data_1967[co2_col],
                                  'GDPperCap' : filtered_data_1967[gdp_col],
                                  'countries' : filtered_data_1967['Country Name'],
                                  'continent' : filtered_data_1967['continent'],
                                  'relative size' : (filtered_data_1967['pop']/filtered_data_1967['pop'].max())*500
                                 }
                         )


# Hover Tool
# The numeric formats use the digit zero ("0.00"); the previous "OO.OO"
# (capital letter O) is not a valid number format.
hover = HoverTool(
        tooltips=[
            ("Country", "@countries"),
            ("CO2 Emissions", "@emissions{0.00}"),
            ("GDP per Capita", '@GDPperCap{0.00}')
        ]
    )


# Plot: one circle per country, coloured by continent and sized by relative population
plot.scatter('emissions', 'GDPperCap', source = source,
            fill_color = factor_cmap('continent', palette = d3['Category10'][5],
            factors = filtered_data_1967['continent'].unique()),
            line_color = 'black',
            line_width = 0.75,
            size = 'relative size',
            legend_group = 'continent',
            )

plot.add_tools(hover)
plot.legend.location = 'top_left'
plot.legend.title = 'Continent'
plot.legend.title_text_font_style = "bold"
plot.legend.title_text_font_size = "20px"
plot.legend.label_text_font_size = "16px"


show(plot)

## Q1 - What is the relationship between continent and 'Energy use (kg of oil equivalent per capita)'?
One-way ANOVA was used to compare the average energy use of each continent. There was a significant difference in the average energy use between continents (F = 47.73, p < 0.001). Average energy use was measured in kg of oil equivalent per capita and calculated using all available data for countries of each continent.

In [171]:
energy_col = 'Energy use (kg of oil equivalent per capita)'

# Group the energy-use column by continent once, instead of re-indexing the
# whole frame inside a loop. sort=False keeps continents in order of first
# appearance, which fixes the bar order and the colour assigned to each bar.
# NOTE(review): if the per-year correlation cell has run, `clean_data` no longer
# holds rows lacking CO2 or GDP values, so these figures use that subset - confirm intended.
energy_by_continent = clean_data.groupby('continent', sort = False)[energy_col]

# Collect data for the stats test (raw non-NaN values) and the bar chart (means)
energy_stat = {con : values.dropna().to_numpy() for con, values in energy_by_continent}
energy_use = {con : values.mean() for con, values in energy_by_continent}

# Create dataframe for bar chart: one row per continent
energy_use_df = pd.DataFrame(list(energy_use.items()),
                             columns = ['Continent', 'Energy use (kg of oil equivalent per capita)'])


# Column Data Source
# 'colors' holds exactly five colours, so this assumes five continents in the data.
source = ColumnDataSource(data = {'Continent' : energy_use_df['Continent'],
                                  'Energy' : energy_use_df['Energy use (kg of oil equivalent per capita)'],
                                  'colors' : d3['Category10'][5]
                                 }
                         )


# Hover Tool
hover = HoverTool(
        tooltips=[('Continent', '@Continent'),
                  ('Energy use', '@Energy{0.00}'),
                 ]
)

# Initialise Plot
plot = figure(plot_height=450, plot_width = 500, x_range = energy_use_df['Continent'])
plot.xaxis.major_label_orientation = pi/2
plot.xaxis.axis_label = 'Continent'
plot.yaxis.axis_label = 'Energy use (kg of oil equivalent per capita)'
plot.title = 'Average Energy Use of Each Continent between 1962 and 2007'

# Bar chart
plot.vbar(x='Continent',
          top='Energy',
          source=source,
          width = 0.9,
          color = 'colors',
          legend_field="Continent"
         )

plot.add_tools(hover)
plot.legend.location = (175, 175)
plot.legend.title = 'Continent'
plot.legend.title_text_font_style = "bold"


show(plot)

# ANOVA for energy use of each continent.
# f_oneway returns the F statistic and its p-value (not a correlation).
f_stat, Pvalue = f_oneway(energy_stat['Europe'], energy_stat['Asia'], energy_stat['Africa'], energy_stat['Americas'], energy_stat['Oceania'])
print("One_way ANOVA: {corr} (p value: {Pvalue})".format(corr = f_stat, Pvalue = Pvalue))

One_way ANOVA: 47.73283512030328 (p value: 3.987577302591487e-36)


## Q2 - Is there a significant difference between Europe and Asia with respect to 'Imports of goods and services (% of GDP)' in the years after 1990?

One-way ANOVA was used to compare the average imports of goods and services in Asia and Europe after 1990. There was no significant difference in the average imports of goods and services in Asia and Europe after 1990 (f = 2.06, p = 0.20).

In [133]:
continents = ['Europe', 'Asia']
imports_col = 'Imports of goods and services (% of GDP)'

# Rows after 1990 for the two continents being compared.
# The slice is left untouched (no in-place re-indexing) so pandas never has
# to modify a view of `clean_data`.
Q2 = clean_data.loc[(clean_data['Year'] > 1990) & (clean_data['continent'].isin(continents))]

# Mean imports per (year, continent), pivoted to one column per continent.
# groupby sorts the years, so the lines below are drawn in chronological order.
yearly_means = Q2.groupby(['Year', 'continent'])[imports_col].mean().unstack('continent')

# One row per year with columns 'Year', 'Europe' and 'Asia'
AE_import = yearly_means[continents].reset_index()
AE_import.columns.name = None


# Initialise Plot
plot = figure(plot_width = 500, plot_height = 500)
plot.xaxis.axis_label = 'Year'
plot.yaxis.axis_label = 'Imports of goods and services (% of GDP)'
plot.title = 'Average Imports of goods and services in Asia and Europe after 1990'


# Column Data source
source = ColumnDataSource(data = {'Asia' : AE_import['Asia'],
                                  'Europe' :  AE_import['Europe'],
                                  'Year' : AE_import['Year']
                                 }
                         )


# Asia plot
plot.line('Year', 'Asia', source=source)
plot.circle('Year', 'Asia', source=source, size = 7, line_color = 'black', legend_label= 'Asia')


# Europe plot
plot.line('Year', 'Europe', source=source, color = 'red')
plot.circle('Year', 'Europe', source=source, size = 7, color = 'red', line_color = 'black', legend_label='Europe')


# Hover Tool
# Formats use the digit zero ("0.00"); the previous "OO.OO" (capital letter O)
# is not a valid number format.
hover = HoverTool(
        tooltips=[
            ("Asia", "@Asia{0.00}%"),
            ("Europe", "@Europe{0.00}%"),
        ]
    )


plot.add_tools(hover)
plot.legend.location = 'top_left'
show(plot)


# one-way ANOVA test comparing the yearly continent means.
# f_oneway returns the F statistic and its p-value (not a correlation).
f_stat, Pvalue = f_oneway(AE_import['Asia'], AE_import['Europe'])
print("One_way ANOVA: {corr} (p value: {Pvalue})".format(corr = f_stat, Pvalue = Pvalue))

One_way ANOVA: 2.062484786152866 (p value: 0.20098307809195454)


## Q3 - What is the country (or countries) that has the highest Population density across all years?
Singapore has the highest mean population density (4361 people per sq. km of land area) over the following years: 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002 and 2007. Population density was measured as people per sq. km of land area. Mean population density for each country was calculated using all available data. 
- For Botswana and Bangladesh no data was available before 1972. 
- For Bosnia and Herzegovina, Croatia, Czech Republic and Germany data was not available before 1992.
- For Eritrea no data was available before 1997.

In [20]:
density_col = 'Population density (people per sq. km of land area)'

# Calculate mean population density for each country in one grouped pass.
# sort=False keeps countries in order of first appearance, which is the order
# of the bars on the x axis.
# NOTE(review): if the per-year correlation cell has run, `clean_data` no longer
# holds rows lacking CO2 or GDP values, so the means use that subset - confirm intended.
mean_popd = clean_data.groupby('Country Name', sort = False)[density_col].mean().reset_index()
mean_popd.columns = ["Country", "Population Density"]

# Rank 1 = highest mean population density
mean_popd['rank'] = mean_popd["Population Density"].rank(ascending = False)


# DataColumnSource
source = ColumnDataSource(data = {'Country':mean_popd['Country'],
                                  'PopD':mean_popd['Population Density'],
                                  'Rank':mean_popd['rank']
                                 }
                         )


# Hover Tool
# The density format uses the digit zero ("0.00"); the previous "00.0O" ended
# in a capital letter O.
hover = HoverTool(
        tooltips=[
            ("Country", "@Country"),
            ("Mean Population Density", "@PopD{0.00}"),
            ("Rank", '@Rank')
        ]
    )


# Set the figure
plot = figure(x_range=mean_popd['Country'], plot_height=450, plot_width = 1250)
plot.xaxis.major_label_orientation = pi/2
plot.xaxis.axis_label = 'Country'
plot.yaxis.axis_label = 'Mean Population density'
plot.title = 'Mean Population density of Countries between 1962-2007'


# Plot bar chart
plot.vbar(x='Country', top='PopD', width=0.75, source=source)
plot.add_tools(hover)


show(plot)

## Q4 - What country (or countries) has shown the greatest increase in 'Life expectancy at birth, total (years)' since 1962?

Mali had the greatest change in life expectancy from 1962 with a 90.070 % increase in life expectancy.  The increase in life expectancy at birth for each country was calculated as a percentage. Percentage change for countries was ranked in descending order. The first recorded value for life expectancy (I) and life expectancy recorded in 2007 (F) were used to calculate percentage change with the following formula: Percentage Change = ((F - I) / I)*100
                
                                                                  
It should be noted data was not available for all countries:
- For Botswana and Bangladesh no data was available before 1972. 
- For Bosnia and Herzegovina, Croatia, Czech Republic and Germany data was not available before 1992.
- For Eritrea no data was available before 1997.
- No data was available for Lesotho, Montenegro or Serbia.

 


In [19]:
# Q4: absolute and percentage change in life expectancy per country
life_col = 'Life expectancy at birth, total (years)'
life_exp = {}

for country, rows in clean_data.groupby('Country Name', sort = False):
    # Order by year explicitly so "first" and "last" do not depend on the row
    # order of the CSV. sort_values returns a new frame, so no slice of
    # `clean_data` is modified in place.
    rows = rows.sort_values('Year')
    first_row = rows.iloc[0]
    last_row = rows.iloc[-1]

    initial_year = first_row['Year']
    final_year = last_row['Year']

    # A country with a single year of data has no change to report
    if initial_year == final_year:
        continue

    # A NaN in either value carries through to the change columns.
    initial_exp = first_row[life_col]
    final_exp = last_row[life_col]

    life_exp[country] = [country,
                         initial_exp,
                         final_exp,
                         final_exp - initial_exp,
                         ((final_exp - initial_exp)/initial_exp)*100,
                         initial_year,
                         final_year]


# One row per country. Building with orient='index' lets pandas infer a dtype
# per column, so the numeric columns stay numeric for ranking and plotting
# (the earlier dict -> DataFrame -> transpose route made every column object dtype).
life_exp_df = pd.DataFrame.from_dict(
    life_exp,
    orient = 'index',
    columns = ['Country', 'Initial Life Expectancy', 'Final Life Expectancy', 'Change in Life Expectancy', 'Percentage Change', 'first date', 'last date'],
)

# Rank 1 = largest percentage increase
life_exp_df['rank'] = life_exp_df["Percentage Change"].rank(ascending = False)

# Column Data Source
source = ColumnDataSource(data = {'Country' : life_exp_df['Country'],
                                  'Initial' : life_exp_df['Initial Life Expectancy'],
                                  'Final' : life_exp_df['Final Life Expectancy'],
                                  'percentage' : life_exp_df['Percentage Change'],
                                  'rank' : life_exp_df['rank']
                                 }
                         )


# Hover Tool
# Formats use the digit zero ("0.00"); the previous "OO.OO" / "00.0O" contained
# capital letter O characters.
hover = HoverTool(
        tooltips=[('Country', '@Country'),
                  ('Rank', '@rank'),
                  ('Percentage Change', '@percentage{0.00}%'),
                  ('Change in Life Expectancy', '@Initial{0.00} -> @Final{0.00}')

                 ]
)

# Initialise Plot
plot = figure(plot_height=450, plot_width = 1250, x_range=life_exp_df['Country'])
plot.xaxis.major_label_orientation = pi/2
plot.xaxis.axis_label = 'Country'
plot.yaxis.axis_label = 'Percentage Difference in Life Expectancy (Years)'
# Title added so the figure stands alone, like the other plots in this notebook
plot.title = 'Percentage Change in Life Expectancy at Birth by Country'

# Bar chart
plot.vbar(x='Country', top='percentage', source=source, width = 0.75)
plot.add_tools(hover)
show(plot)



In [None]:
# jupyter nbconvert Analysis.ipynb --to html --no-input