In [69]:
#!pip install datapackage
#!pip install matplotlib
#!pip install seaborn
#!pip install requests
# json is part of the Python standard library - no pip install is needed

In [70]:
import matplotlib.pyplot as plt
import datapackage
import seaborn as sns
import pandas as pd
import numpy as np
import requests
import json

In [71]:
# define a function that checks for null values
def check_for_null(x):
    """Report whether a pandas object contains at least one missing value.

    Parameters
    ----------
    x : pandas.DataFrame or pandas.Series
        Any object exposing ``isnull()`` whose result has a ``values`` array.

    Returns
    -------
    bool-like (numpy boolean)
        True if any element of ``x`` is null, otherwise False.
    """
    return x.isnull().values.any()

In [72]:
# Read in the csv data file with data on world population and fertility,
# from 1950 through projected values up to 2100.
dataPOP = pd.read_csv(
    'https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2019_Period_Indicators_Medium.csv'
)
print(type(dataPOP))
# Preview the first rows as the cell output.
dataPOP.head()

In [73]:
# Call check_for_null on the period-indicator table; it returns True if any
# cell in the frame is null. The result is kept in `val` and also shown as the
# cell output, so the outcome of the check is visible in the notebook.
val = check_for_null(dataPOP)
val

In [74]:
# Check data types and non-null counts per column.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a trailing "None").
dataPOP.info()

In [75]:
# Find the locations that experienced the largest Crude Birth Rate (CBR) over a
# 5-year period and show the 5 largest values.
# nlargest keeps each row intact, so Location and MidPeriod really belong to the
# CBR shown. (groupby('CBR').max() would take the maximum of every other column
# independently and mix values from unrelated rows.)
dataPOP.nlargest(5, 'CBR')[['Location', 'MidPeriod', 'CBR']]

In [76]:
# First and last period mid-points in the dataset:
# 1953 (1950 - 1955) through 2098 (2095 - 2100).
print(dataPOP['MidPeriod'].min())
print(dataPOP['MidPeriod'].max())

In [77]:

# List every column of dataPOP together with its positional index.
for position, name in enumerate(dataPOP.columns):
    print(position, name)

# Plot Crude Birth Rate and Crude Death Rate for Africa from 1950 until 2100.
# The rates are plotted against MidPeriod so the x-axis shows years; plotting
# the Series on their own would use the DataFrame row numbers as x values.
africa_rows = dataPOP['Location'] == 'Africa'
Af_CBR = dataPOP.CBR.loc[africa_rows]
Af_CDR = dataPOP.CDR.loc[africa_rows]
Af_years = dataPOP.MidPeriod.loc[africa_rows]

plt.plot(Af_years, Af_CBR, label="CBR")
plt.plot(Af_years, Af_CDR, label="CDR")
plt.title('CBR and CDR for Africa 1950 - 2100')
plt.xlabel('1950 - 2100')
plt.ylabel('Births/Deaths per 1000')
plt.legend()
plt.show()

In [78]:
# True if any Crude Birth Rate value in the Africa subset is null.
check_for_null(Af_CBR)

In [79]:
# True if any Crude Death Rate value in the Africa subset is null.
check_for_null(Af_CDR)

In [80]:
# Plot Crude Birth Rate and Crude Death Rate for Asia from 1950 until 2100.
# The rates are plotted against MidPeriod so the x-axis shows years; plotting
# the Series on their own would use the DataFrame row numbers as x values.
asia_rows = dataPOP['Location'] == 'Asia'
Asia_CBR = dataPOP.CBR.loc[asia_rows]
Asia_CDR = dataPOP.CDR.loc[asia_rows]
Asia_years = dataPOP.MidPeriod.loc[asia_rows]

plt.plot(Asia_years, Asia_CBR, label="CBR")
plt.plot(Asia_years, Asia_CDR, label="CDR")
plt.title('CBR and CDR for Asia 1950 - 2100')
plt.xlabel('1950 - 2100')
plt.ylabel('Births/Deaths per 1000')
plt.legend()
plt.show()

In [81]:
# Plot Population Growth Rates for Africa and Asia from 1950 until 2100.
# Each series is plotted against MidPeriod so the x-axis shows years instead of
# DataFrame row numbers.
africa_mask = dataPOP['Location'] == 'Africa'
asia_mask = dataPOP['Location'] == 'Asia'
Af_GR = dataPOP.GrowthRate[africa_mask]
Asia_GR = dataPOP.GrowthRate[asia_mask]

# One row, two side-by-side panels.
fig, (ax_africa, ax_asia) = plt.subplots(1, 2)

ax_africa.plot(dataPOP.MidPeriod[africa_mask], Af_GR, label="Africa")
ax_africa.set_title("Africa Pop Growth Rate %")
ax_africa.set_xlabel('1950 - 2100 ')
ax_africa.set_ylabel('% Growth Rate ')

ax_asia.plot(dataPOP.MidPeriod[asia_mask], Asia_GR, label="Asia")
ax_asia.set_title("Asia Pop Growth Rate %")
ax_asia.set_xlabel('1950 - 2100 ')
ax_asia.set_ylabel('% Growth Rate ')

# Keep the right panel's y-label from overlapping the left panel.
fig.tight_layout()
plt.show()

In [82]:
# True if any growth-rate value in the Africa subset is null.
check_for_null(Af_GR)

In [83]:
# True if any growth-rate value in the Asia subset is null.
check_for_null(Asia_GR)

In [84]:
# Load the data file with annual age distributions for all countries,
# from 1950 until 2100 (projected).
dataAgePOP = pd.read_csv(
    'https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2019_PopulationByAgeSex_Medium.csv'
)
# Preview the first rows as the cell output.
dataAgePOP.head()

In [85]:
# Use the notebook's check_for_null helper (True if any cell is null) instead
# of repeating its expression inline; the result is shown as the cell output.
value = check_for_null(dataAgePOP)
value

In [86]:
# List every column of the age-distribution table with its positional index.
for position, name in enumerate(dataAgePOP.columns):
    print(position, name)

In [95]:
# Save data on Japan's age distribution for 3 points in time - approx 70 years
# ago, now and ~70 years into the future (projected).
japan_rows = dataAgePOP.Location == "Japan"  # shared row filter for all selections
JP1950df = dataAgePOP.PopTotal[japan_rows & (dataAgePOP.Time == 1950)]
JP2020df = dataAgePOP.PopTotal[japan_rows & (dataAgePOP.Time == 2020)]
JP2090df = dataAgePOP.PopTotal[japan_rows & (dataAgePOP.Time == 2090)]
# Age-group labels of the 1950 rows; same row selection (and order) as JP1950df.
AgeGrpdf = dataAgePOP.AgeGrp[japan_rows & (dataAgePOP.Time == 1950)]

# Create numpy array - this was intended for adjacent bars in a bar chart for
# each age band, with the 1950/2020/2090 values in tri-bars, to highlight the
# massive changes underway in Japan's demographics. Unfortunately time ran out
# to complete this, so `arr` is not used in the cells shown here.
arr = np.array([[JP1950df], [JP2020df], [JP2090df]])

# So with the three bar charts below we can see the massive changes underway in
# Japan's population age profile.
# Re-index the values by age group so each bar is labelled with its cohort
# rather than with the DataFrame row number.
JP1950_by_age = pd.Series(JP1950df.values, index=AgeGrpdf.values)
JP1950_by_age.plot(kind="bar")
plt.title('Age Distribution in Japan in 1950')
plt.xlabel('1950 in 5 year Cohorts')
plt.ylabel('Total in 1000s')

In [96]:
# Bar chart of Japan's 2020 population by age group.
# Look up the AgeGrp label of each selected row so the bars are labelled with
# their cohort instead of the DataFrame row number.
JP2020_age_labels = dataAgePOP.loc[JP2020df.index, 'AgeGrp']
JP2020_by_age = pd.Series(JP2020df.values, index=JP2020_age_labels.values)
JP2020_by_age.plot(kind="bar")
plt.title('Age Distribution in Japan in 2020')
plt.xlabel('2020 in 5 year Cohorts')
plt.ylabel('Total in 1000s')

In [97]:
# Bar chart of Japan's projected 2090 population by age group.
# Look up the AgeGrp label of each selected row so the bars are labelled with
# their cohort instead of the DataFrame row number.
JP2090_age_labels = dataAgePOP.loc[JP2090df.index, 'AgeGrp']
JP2090_by_age = pd.Series(JP2090df.values, index=JP2090_age_labels.values)
JP2090_by_age.plot(kind="bar")
plt.title('Age Distribution in Japan in 2090')
plt.xlabel('2090 in 5 year Cohorts')
plt.ylabel('Total in 1000s')

In [98]:
# Show the Python type of the loaded age-distribution table.
type(dataAgePOP)

In [99]:
# Read in the data package (described by a json file) with World Total
# population by country data from 1960 until 2018.
# My plan was to find the 5 countries with the greatest % increase and decrease
# of population and examine the results, but time ran out.
data_url = 'https://datahub.io/core/population/datapackage.json'

# Load the Data Package descriptor.
package = datapackage.Package(data_url)

# Keep only the tabular resources.
resources = package.resources
tabular_resources = [resource for resource in resources if resource.tabular]
if not tabular_resources:
    # Fail with a clear message rather than a NameError on `data` below.
    raise ValueError('No tabular resource found in data package: ' + data_url)

# Read the last tabular resource only: the original loop overwrote `data` on
# every iteration, so the last one is what it ended up with.
data = pd.read_csv(tabular_resources[-1].descriptor['path'])

data.head()

In [100]:
# Check for null values: the first sum() counts nulls per column, the second
# totals them across columns. None found (prints 0).
print(data.isnull().sum().sum())

In [101]:
# First and last values of the Year column in the country population table
# loaded from the datahub data package.
print(data['Year'].min())
print(data['Year'].max())