# Animating [Worldwide Population Data](https://www.kaggle.com/datasets/shivd24coder/worldwide-population-data)

In [1]:
# Start off by importing some of the basic stuff

%matplotlib qt
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.animation as anime
import pandas as pd
import numpy as np

In [2]:
# Read in the dataset.
# low_memory=False makes pandas parse the file in one pass, so each column's dtype
# is inferred from all of its values rather than chunk by chunk.
# NOTE(review): the saved output of this cell echoes the read_csv line, which is how
# Python prints a warning -- presumably pandas' DtypeWarning about mixed-type
# columns. Confirm on a fresh run that the warning is gone.
df = pd.read_csv('data/WPP2022_Demographic_Indicators_Medium.csv', low_memory=False)

  df = pd.read_csv('data/WPP2022_Demographic_Indicators_Medium.csv')


In [11]:
# So many columns in the table 🙀
# List every column name so the few this demo needs can be picked out.
df.columns

Index(['SortOrder', 'LocID', 'Notes', 'ISO3_code', 'ISO2_code', 'SDMX_code',
       'LocTypeID', 'LocTypeName', 'ParentID', 'Location', 'VarID', 'Variant',
       'Time', 'TPopulation1Jan', 'TPopulation1July', 'TPopulationMale1July',
       'TPopulationFemale1July', 'PopDensity', 'PopSexRatio', 'MedianAgePop',
       'NatChange', 'NatChangeRT', 'PopChange', 'PopGrowthRate',
       'DoublingTime', 'Births', 'Births1519', 'CBR', 'TFR', 'NRR', 'MAC',
       'SRB', 'Deaths', 'DeathsMale', 'DeathsFemale', 'CDR', 'LEx', 'LExMale',
       'LExFemale', 'LE15', 'LE15Male', 'LE15Female', 'LE65', 'LE65Male',
       'LE65Female', 'LE80', 'LE80Male', 'LE80Female', 'InfantDeaths', 'IMR',
       'LBsurvivingAge1', 'Under5Deaths', 'Q5', 'Q0040', 'Q0040Male',
       'Q0040Female', 'Q0060', 'Q0060Male', 'Q0060Female', 'Q1550',
       'Q1550Male', 'Q1550Female', 'Q1560', 'Q1560Male', 'Q1560Female',
       'NetMigrations', 'CNMR'],
      dtype='object')

Columns of interest: `TPopulation1Jan`, `Location`, `LocTypeName`

In [17]:
# Peek at the first 20 distinct values of the Location column
df['Location'].unique()[:20]

array(['World', 'Sub-Saharan Africa', 'Northern Africa and Western Asia',
       'Central and Southern Asia', 'Eastern and South-Eastern Asia',
       'Latin America and the Caribbean',
       'Oceania (excluding Australia and New Zealand)',
       'Australia/New Zealand', 'Europe and Northern America',
       'More developed regions', 'Less developed regions',
       'Least developed countries',
       'Less developed regions, excluding least developed countries',
       'Less developed regions, excluding China',
       'Land-locked Developing Countries (LLDC)',
       'Small Island Developing States (SIDS)', 'High-income countries',
       'Middle-income countries', 'Upper-middle-income countries',
       'Lower-middle-income countries'], dtype=object)

Those aren't countries 🤔

In [4]:
# Which kinds of location does the table mix together?
df['LocTypeName'].unique()

array(['World', 'SDG region', 'Development group', 'Income group',
       'Geographic region', 'Subregion', 'Country/Area'], dtype=object)

Let's focus on stats for `Country/Area`

In [3]:
# This demo only needs per-country population stats, so keep just the rows
# whose location type is 'Country/Area'
df2 = df[df['LocTypeName'].eq('Country/Area')]

In [13]:
# Same peek as before, now on the country-only subset
df2['Location'].unique()[:20]

array(['Burundi', 'Comoros', 'Djibouti', 'Eritrea', 'Ethiopia', 'Kenya',
       'Madagascar', 'Malawi', 'Mauritius', 'Mayotte', 'Mozambique',
       'Réunion', 'Rwanda', 'Seychelles', 'Somalia', 'South Sudan',
       'Uganda', 'United Republic of Tanzania', 'Zambia', 'Zimbabwe'],
      dtype=object)

That's better

In [7]:
# Time value of the first and of the last row in the country subset
df2['Time'].iloc[0], df2['Time'].iloc[-1]

(1950, 2101)

In [8]:
# Every distinct year present in the country subset
df2['Time'].unique()

array([1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960,
       1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
       1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025, 2026,
       2027, 2028, 2029, 2030, 2031, 2032, 2033, 2034, 2035, 2036, 2037,
       2038, 2039, 2040, 2041, 2042, 2043, 2044, 2045, 2046, 2047, 2048,
       2049, 2050, 2051, 2052, 2053, 2054, 2055, 2056, 2057, 2058, 2059,
       2060, 2061, 2062, 2063, 2064, 2065, 2066, 2067, 2068, 2069, 2070,
       2071, 2072, 2073, 2074, 2075, 2076, 2077, 2078, 2079, 2080, 2081,
       2082, 2083, 2084, 2085, 2086, 2087, 2088, 2089, 2090, 2091, 2092,
       2093, 2094, 2095, 2096, 2097, 2098, 2099, 21

Let's look at the population in the first year, `1950`

In [9]:
# One shared figure/axes pair; topPopulations clears and redraws `ax` on every call
fig, ax = plt.subplots()

def topPopulations(year, present_year, n=10):
    """Redraw the shared axes as a bar chart of the `n` largest populations in `year`.

    Intended as a `FuncAnimation` frame callback: it draws on the notebook-level
    `fig`/`ax` and reads its data from the notebook-level `df2`.

    Parameters
    ----------
    year : int
        Value of `df2.Time` whose rows are plotted.
    present_year : int
        Any `year` later than this gets a "Prediction: " prefix in the title.
    n : int, default 10
        Number of bars to draw, largest `TPopulation1Jan` first.
    """
    # Start from an empty axes so the previous frame's bars are not drawn over.
    ax.clear()
    ax.set_title(f'{"Prediction: " if present_year < year else ""}{year}')

    # Rows for the requested year, ranked by population, truncated to the top n.
    top_n = (
        df2.loc[df2.Time == year, ['TPopulation1Jan', 'Location']]
        .sort_values(by='TPopulation1Jan', ascending=False)
        .iloc[:n]
    )
    sns.barplot(ax=ax, data=top_n, x='TPopulation1Jan', y='Location')

    # Re-fit the margins after each redraw so the y-axis location labels fit.
    fig.tight_layout()

# Animate one frame per distinct year in the data. 2023 is forwarded to
# topPopulations as present_year, so later years are titled as predictions.
ani = anime.FuncAnimation(
    fig,
    topPopulations,
    frames=df2['Time'].unique(),
    fargs=(2023,),
)

In [10]:
from tqdm.notebook import tqdm

# Build the animation that gets exported: one frame per distinct year, 200 ms apart.
years = df2.Time.unique()
ani = anime.FuncAnimation(
    fig=fig,
    func=topPopulations,
    frames=years,
    fargs=(2023,),
    interval=200
)

# Drive the progress bar from save()'s progress_callback, which is called once per
# frame written. Wrapping `frames` in tqdm is unreliable here: FuncAnimation can also
# consume `frames` outside of save() (initial draw, live playback), so a bar around
# the iterable does not track the export itself. The `with` block closes the bar.
with tqdm(total=len(years)) as progress:
    ani.save(
        'population.mp4',
        progress_callback=lambda current_frame, total_frames: progress.update(1),
    )

  0%|          | 0/152 [00:00<?, ?it/s]