In [None]:
import pandas as pd 
from IPython.display import display
import datasheets
import plotly as py
from plotly import tools
import plotly.graph_objs as go
from plotly_layout import *

# Initialise plotly's offline mode so figures render inside the notebook.
py.offline.init_notebook_mode(connected=True)
# When True, the `if test:` cells below display intermediate frames for inspection.
test = True
# Colour cycle for the figure traces; the plotting cell indexes it by region position.
PALETTE = [
    "#4e79a7",
    "#f28e2b",
    "#e15759",
    "#76b7b2",
    "#59a14f",
    "#edc948",
    "#b07aa1",
    "#ff9da7",
    "#9c755f",
    "#bab0ac",]

We are going to explore urbanization data from the United Nations, using the following article as a guideline.

_Hannah Ritchie and Max Roser (2018) - "Urbanization"._
<br>_Published online at OurWorldInData.org._
<br>[https://ourworldindata.org/urbanization](https://ourworldindata.org/urbanization)

## Current situation

The current urban and rural populations per country can be retrieved from the United Nations data portal as indicated in the article.

In [None]:
# URL of the UN Statistics Division M49 page (kept for reference; it is not
# fetched by any of the cells shown here).
country_codes_URL = 'https://unstats.un.org/unsd/methodology/m49/'

In [None]:
# Base URL of the UN population download directory; file names are appended to it.
root_url = 'https://population.un.org/wup/Download/Files/'

In [None]:
# Workbook with the total, urban and rural figures used in "Current situation".
url = root_url + 'WUP2018-F01-Total_Urban_Rural.xls'

In [None]:
# Fetch and parse the workbook over HTTPS (needs network access). No header or
# skiprows option is given, so the sheet's preamble rows arrive as ordinary
# data rows and are stripped in the next cell.
df_tur_raw = pd.read_excel(io=url)

In [None]:
# Rows 0-14 of the raw sheet are preamble; row 14 carries the column titles.
# Normalise those titles into lower-case snake_case identifiers.
header_row = df_tur_raw.loc[14]
column_names = (
    header_row.str.replace(',', '')
    .str.replace(' ', '_')
    .str.replace('\n', '_')
    .str.lower()
    .tolist()
)

# Keep only the data rows, label them, and discard the columns not used below.
df_tur = df_tur_raw[15:].copy()
df_tur.columns = column_names
df_tur = df_tur.drop(columns=['index', 'note', 'total', 'percentage_urban'])

In [None]:
# Store country codes as integers so later cells can compare them numerically.
df_tur['country_code'] = df_tur.country_code.astype(int)

# Populations are expressed in thousands: convert them to head counts.
# Round before casting: a bare astype(int) truncates, and a floating-point
# product can land just below the intended integer, which would lose one unit.
# NOTE: this cell rescales df_tur in place, so re-running it without first
# re-running the cell that builds df_tur multiplies the values by 1000 again.
for c in ['urban', 'rural']:
    df_tur[c] = (df_tur[c].astype(float) * 1000).round().astype(int)

In [None]:
if test:
    # Preview the first ten cleaned rows.
    display(df_tur.head(10))

In [None]:
def assign_groups(labels, groups):
    """Forward-fill group headings over a flat sequence of labels.

    Walks ``labels`` in order and, for each one, records the most recent
    label that belongs to ``groups`` (the label itself when it is a group).
    Labels seen before any group heading are assigned the empty string.

    Parameters
    ----------
    labels : iterable
        Labels in their original order, group headings interleaved.
    groups : container
        Labels that act as group headings (anything supporting ``in``).

    Returns
    -------
    list
        One entry per label: the group heading in force at that position.
    """
    assigned = []
    latest_group = ''
    for item in labels:
        if item in groups:
            latest_group = item
        assigned.append(latest_group)

    return assigned

In [None]:
# Region headings exactly as they are spelled in the sheet.
regions = [
    'AFRICA',
    'LATIN AMERICA AND THE CARIBBEAN',
    'NORTHERN AMERICA',
    'EUROPE',
    'OCEANIA',
    'ASIA',
]

# Tag every row with the region heading that precedes it in the sheet.
region_column = assign_groups(
    labels=df_tur['region_subregion_country_or_area'].tolist(),
    groups=regions)
df_tur['region'] = region_column

In [None]:
if test:
    # List the rows whose code is above 902 together with their names.
    high_code_rows = df_tur.loc[
        df_tur.country_code > 902,
        ['region_subregion_country_or_area', 'country_code']]
    display(high_code_rows)

In [None]:
# Extract the sub-region names: rows with a code above 902 that are neither a
# region heading nor one of the "... countries" / "Less ..." / "More ..." groupings.
more_than_903 = df_tur.country_code > 902
in_regions = df_tur.region_subregion_country_or_area.isin(regions)
not_sub_regions = df_tur.region_subregion_country_or_area.str.match('.*(countries|Less|More)')

is_sub_region = more_than_903 & ~in_regions & ~not_sub_regions
sub_regions = df_tur.loc[is_sub_region, 'region_subregion_country_or_area'].tolist()

# One comma-separated string of sub-region names per region, for inspection.
sub_region_rows = df_tur[df_tur.region_subregion_country_or_area.isin(sub_regions)]
df_sub_regions = (
    sub_region_rows
    .groupby(by='region')
    .region_subregion_country_or_area
    .apply(lambda names: ', '.join(names.tolist()))
)

if test:
    display(df_sub_regions)

In [None]:
# Tag every row with the sub-region heading that precedes it in the sheet.
sub_region_column = assign_groups(
    labels=df_tur['region_subregion_country_or_area'].tolist(),
    groups=sub_regions)
df_tur['sub_region'] = sub_region_column

# Rows under NORTHERN AMERICA get a fixed label, overriding whatever heading
# was carried forward from the preceding rows (presumably because that region
# has no sub-region rows of its own; verify against the sheet).
is_northern_america = df_tur['region'] == 'NORTHERN AMERICA'
df_tur.loc[is_northern_america, 'sub_region'] = 'Northern America'

In [None]:
# Countries are the rows with a country code lower than 900; in that subset
# the name column holds country names only, so call it 'country'.
country_rows = df_tur[df_tur.country_code < 900]
df_c = country_rows.rename(columns={'region_subregion_country_or_area': 'country'})

In [None]:
if test:
    # Countries whose region did not resolve to one of the known regions.
    display(df_c[~df_c.region.isin(regions)])
    # Countries without a sub-region. assign_groups marks "no heading seen yet"
    # with an empty string rather than NaN, so an isnull() test alone could
    # never flag anything; check for both.
    display(df_c[df_c.sub_region.isnull() | (df_c.sub_region == '')])
    # Spot check on two individual countries.
    display(df_c[df_c.country.isin(['France', 'Malaysia'])])

In [None]:
# Cross-check: totals rebuilt from the country rows next to the totals the
# sheet itself reports for each region (the latter carry the '_g' suffix).
if test:
    df_r = df_c.groupby(by='region').agg({'urban': 'sum', 'rural': 'sum'})

    region_rows = df_tur[df_tur.region_subregion_country_or_area.isin(regions)]
    df_rg = region_rows[
        ['urban', 'rural', 'region_subregion_country_or_area']
    ].set_index('region_subregion_country_or_area')

    display(df_r.join(df_rg, rsuffix='_g'))

## Evolution in time

In [None]:
# Workbooks with the yearly urban and rural population series.
urban_URL = root_url + 'WUP2018-F19-Urban_Population_Annual.xls'
rural_URL = root_url +'WUP2018-F20-Rural_Population_Annual.xls'

In [None]:
# Fetch and parse both workbooks (needs network access). The frames are kept
# raw here; clean_temporal_dataframe strips the preamble rows afterwards.
urban_raw_data = pd.read_excel(io=urban_URL)
rural_raw_data = pd.read_excel(io=rural_URL)

In [None]:
def clean_column_names(serie):
    """Turn a Series of raw column titles into snake_case identifiers.

    Every value is converted to ``str``, commas are removed, spaces and
    newlines become underscores, and the result is lower-cased.

    Parameters
    ----------
    serie : pandas.Series
        Raw titles; non-string values are stringified first.

    Returns
    -------
    list of str
        Cleaned names, in the same order as ``serie``.
    """
    names = serie.astype(str)
    # Order matters only for readability; the three substitutions are independent.
    for old, new in ((',', ''), (' ', '_'), ('\n', '_')):
        names = names.str.replace(old, new)
    return names.str.lower().tolist()

In [None]:
def clean_temporal_dataframe(df, header_row=14):
    """Reshape a raw yearly population sheet into a year-indexed frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Sheet as loaded by ``pd.read_excel`` without options, i.e. with the
        preamble rows still present as ordinary data rows.
    header_row : int, default 14
        Position of the row that holds the column titles. That row and
        every row above it are discarded from the data.

    Returns
    -------
    pandas.DataFrame
        One row per year (integer index named ``year``) and one column per
        entry of ``region_subregion_country_or_area``.

    Raises
    ------
    ValueError
        If two columns map to the same year.
    """
    # Positional access for both the title row and the data rows, so the
    # function does not depend on the frame carrying a default integer index.
    titles = clean_column_names(df.iloc[header_row])
    df_ = df.iloc[header_row + 1:].copy()
    df_.columns = titles
    df_ = df_.drop(
        columns=['index', 'note', 'country_code']
    ).set_index(
        keys=['region_subregion_country_or_area']
    )
    # The remaining titles are years stored as text (possibly like '1950.0'):
    # go through float before int so both spellings are accepted.
    years = df_.columns.map(float).map(int)
    if not years.is_unique:
        duplicated = sorted(set(years[years.duplicated()]))
        raise ValueError(
            'Duplicate year columns in sheet: {}'.format(duplicated))
    df_.columns = years
    # Put the years as the index and the regions as the columns.
    df_ = df_.transpose()
    df_.index.name = 'year'

    return df_

In [None]:
# Year-indexed versions of the two raw sheets.
urban_data = clean_temporal_dataframe(urban_raw_data)
rural_data = clean_temporal_dataframe(rural_raw_data)

In [None]:
def select_countries(df, countries=None):
    """Return the columns of ``df`` for the requested countries or regions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one column per region / country.
    countries : list-like or column label, optional
        Columns to keep. When omitted or empty, the six top-level regions
        are selected.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        ``df[selection]``: a frame for a list-like selection, a Series when
        a single column label is passed.
    """
    default_regions = [
        'AFRICA',
        'ASIA',
        'EUROPE',
        'LATIN AMERICA AND THE CARIBBEAN',
        'NORTHERN AMERICA',
        'OCEANIA',
    ]
    # A plain truthiness test raises for array-likes such as pd.Index or
    # numpy arrays ("truth value ... is ambiguous"), so check emptiness
    # through len() and fall back to truthiness only for unsized labels.
    try:
        use_default = countries is None or len(countries) == 0
    except TypeError:
        use_default = not countries
    selection = default_regions if use_default else countries
    return df[selection]

In [None]:
# Keep only the default selection of top-level regions from each series.
urban_data_by_region = select_countries(urban_data)
rural_data_by_region = select_countries(rural_data)

In [None]:
# Filled band from 2018 to 2050 (described as projections in the subtitle);
# the zero-width line leaves only the fill visible.
projection_band = go.Scatter(
    x=[2018, 2050],
    y=[3.5, 3.5],
    line=dict(color='#dcdcdc', width=0),
    fill='tozeroy',
    mode='lines',
    showlegend=False)

data = [projection_band]

# One solid (urban) and one dashed (rural) trace per region, sharing a colour
# and a legend group. Only the first region spells out "(Urban)" / "(Rural)"
# and shows its rural entry in the legend, which serves as a key for the
# dash style.
# Values are scaled by 1e6 to match the "in billions" axis label (assumes the
# sheets are expressed in thousands; TODO confirm).
for n, region in enumerate(urban_data_by_region.columns):
    is_first = n == 0

    urban_trace = go.Scatter(
        x=urban_data_by_region.index,
        y=urban_data_by_region[region].values / 1e6,
        marker=dict(color=PALETTE[n]),
        name=region + ' (Urban)' if is_first else region,
        legendgroup=region)

    rural_trace = go.Scatter(
        x=rural_data_by_region.index,
        y=rural_data_by_region[region].values / 1e6,
        line=dict(dash='dash'),
        marker=dict(color=PALETTE[n]),
        name=region + ' (Rural)' if is_first else region,
        legendgroup=region,
        showlegend=is_first)

    data.extend([urban_trace, rural_trace])

labels = {
    'title': 'Rural and urban population from 1950 to 2050',
    'subtitle': 'The gray area corresponds to projections.',
    'ylabel': 'population (in billions)',
    'xlabel': '',
}

axes = {
    'xaxis': axis_no_title(showgrid=False),
    'yaxis': axis_no_title(showgrid=False),
    'legend': legend_dark(font_size=14),
}

# The wide right margin leaves room for the legend.
layout = layout_by_line_height(
    left_margin=50,
    right_margin=300,
    **labels,
    **axes)

figure = go.Figure(data=data, layout=layout)
py.offline.iplot(figure_or_data=figure, show_link=False)