In [None]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import seaborn
import numpy as np

%matplotlib inline

# Use seaborn's 'notebook' plotting context with fonts scaled up by a factor of 1.4.
seaborn.set_context('notebook', font_scale=1.4)

# Load the Gapminder table, indexed by country name.
gapminder_raw = pd.read_csv('gapminder_all.csv', index_col='country')

# Drop the single country with the highest 1957 GDP per capita
# (NOTE(review): presumably excluded as an outlier -- confirm).
# idxmax() returns the index *label* (the country name) that drop() expects;
# argmax() returns an integer position in modern pandas, which is not a valid
# label for a country-indexed frame.
data = gapminder_raw.drop(gapminder_raw.gdpPercap_1957.idxmax())

In [None]:
def scientific_notation(value, precision=1):
    """Format a number as a compact ``<mantissa>e<exponent>`` string.

    The mantissa is rounded to ``precision`` decimal places and rendered with
    ``str`` (so trailing zeros are not padded); with ``precision == 0`` it is
    rendered as an integer. The exponent is written without sign padding,
    e.g. ``'2.67e1'``, ``'1e6'``, ``'5.0e-2'``.

    Parameters
    ----------
    value : float
        Number to format. Zero and negative numbers are supported.
    precision : int, optional
        Number of decimal places kept in the mantissa (default 1).

    Returns
    -------
    str
        The formatted number.
    """
    value = float(value)
    if value == 0:
        # log10(0) is -inf; zero is represented with exponent 0.
        mantissa, order = 0.0, 0
    else:
        magnitude = abs(value)
        # floor(), not int(): truncating toward zero yields the wrong exponent
        # for magnitudes below 1 (log10 is negative there).
        order = int(np.floor(np.log10(magnitude)))
        mantissa = magnitude / 10 ** order

    digits = round(mantissa, precision)
    if digits >= 10:
        # Rounding carried the mantissa up to 10 (e.g. 9.99 -> 10.0);
        # renormalise so it stays in [1, 10).
        digits = round(digits / 10, precision)
        order += 1
    if precision == 0:
        digits = int(digits)

    sign = "-" if value < 0 else ""
    return sign + str(digits) + "e" + str(order)

assert scientific_notation(26.69, 2) == '2.67e1'
assert scientific_notation(0.05) == '5.0e-2'

In [None]:
# Colour for each highlighted continent; every other continent is drawn grey.
continent_colors = {'Africa': 'blue', 'Asia': 'green', 'Europe': 'purple'}

fig, ax = plt.subplots()

# One scatter call per continent so each group gets its own colour.
# Marker size grows with the square root of the 1957 population.
for continent in data.continent.unique():
    d = data[data.continent == continent]
    ax.scatter(d.gdpPercap_1957, d.lifeExp_1957,
               s=np.sqrt(d.pop_1957 / 1e4),
               c=continent_colors.get(continent, 'grey'), alpha=0.5)

# Empty "proxy" scatters, sized with the same formula as the data points,
# used only as legend handles for populations of 1e6, 1e7 and 1e8.
marks = []
labels = []
for pop in np.logspace(6, 8, num=3):
    marks.append(ax.scatter([], [],
                 c='grey',
                 s=np.sqrt(pop / 1e4)))
    labels.append(scientific_notation(pop, precision=0))

# Regress life expectancy on log(GDP per capita) and overlay the fitted curve.
fit = sm.OLS.from_formula('lifeExp_1957 ~ np.log(gdpPercap_1957)', data=data).fit()
# fit.params is a label-indexed Series; use .iloc for positional access
# (integer keys via [] on such a Series are deprecated in pandas).
intercept = fit.params.iloc[0]
slope = fit.params.iloc[1]
xx = np.linspace(300, 15000)
ax.plot(xx, intercept + slope * np.log(xx))


ax.legend(marks, labels, loc='lower right', title='Population')
ax.set_xlabel('GDP/Pop')
ax.set_ylabel('Life expectancy')
ax.set_title('Relationship between wealth and life-expectancy in 1957');

In [None]:
# Display the summary of the OLS fit of lifeExp_1957 on log(gdpPercap_1957)
# computed in the plotting cell above (requires `fit` to be defined).
fit.summary()