# Parse International Census Data from the United States Census Bureau

The dataset gateway is [here](https://www.census.gov/data-tools/demo/idb/informationGateway.php). The full dataset can be downloaded [here](https://www2.census.gov/programs-surveys/international-programs/about/idb/idbzip.zip); see the [FAQ](https://www.census.gov/programs-surveys/international-programs/about/idb/faq.html#par_textimage_2) for details.

See also Google's [COVID-19 public datasets](https://console.cloud.google.com/marketplace/details/bigquery-public-datasets/covid19-public-data-program?_ga=2.118817228.-458258001.1586199431&pli=1)

In [None]:
import pandas as pd
from os.path import join

## Download census data

In [None]:
# Download the IDB zip archive and save it as ./site-data/census-data.zip.
%sx wget -O ./site-data/census-data.zip https://www2.census.gov/programs-surveys/international-programs/about/idb/idbzip.zip

In [None]:
# Extract the downloaded archive into ./site-data/census-data.
%sx unzip ./site-data/census-data.zip -d ./site-data/census-data

## Get population size of every country of the world

In [None]:
# Folder (below the unzip target directory) that holds the IDB text files
# read in the following cells.
censusPath = join('site-data', 'census-data', 'idbzip')

In [None]:
# Country lookup table. The file is pipe-delimited and is read without a
# header row, so the three column names are supplied here.
# NOTE(review): pandas' default NA parsing turns a literal "NA" field into
# NaN; confirm that no country code in this file is "NA".
countryCodes = pd.read_csv(
    join(censusPath, 'IDBextCTYS.txt'),
    sep='|',
    names=['countryCode', 'countryName', 'landArea'],
)

In [None]:
countryCodes.head()

In [None]:
# Population table. Pipe-delimited and read without a header row; the
# column names are supplied here.
population = pd.read_csv(
    join(censusPath, 'IDBext001.txt'),
    sep='|',
    names=['countryCode', 'year', 'midYearPopulation'],
)

In [None]:
# Year at the moment the notebook runs (local clock); the cells below keep
# only the rows whose `year` column equals this value.
currentYear = pd.Timestamp.now().year

In [None]:
# Restrict the population table to the rows for the current year.
populationCurrent = population.loc[population['year'] == currentYear]

In [None]:
populationCurrent.head()

In [None]:
# Attach the current-year population to every country. The left join keeps
# each row of countryCodes even when no population row matches it.
populationWorld = countryCodes.merge(populationCurrent, how='left', on='countryCode')
# Keep only the columns that are written to the output CSV.
populationWorld = populationWorld.loc[:, ['countryCode', 'countryName', 'midYearPopulation']]

In [None]:
# Sanity check: after the left join, every country must have a population value.
assert populationWorld['midYearPopulation'].notna().all()

In [None]:
populationWorld.head()

In [None]:
populationWorld.shape

In [None]:
# Write the per-country population table to
# site-data/census-data/populationCountries.csv, omitting the index column.
populationWorld.to_csv(join('site-data','census-data', 'populationCountries.csv'), index=False)

## Get age & sex distribution of every country of the world

In [None]:
# Age/sex table. Pipe-delimited and read without a header row; the column
# names are supplied here.
ageSex = pd.read_csv(
    join(censusPath, 'IDBext094.txt'),
    sep='|',
    names=[
        'countryCode', 'year', 'totalFlag', 'startAge', 'isOpen', 'endAge',
        'midYearPopulationAll', 'midYearPopulationMale', 'midYearPopulationFemale',
    ],
)

# Keep only the current-year rows whose totalFlag is 'A'.
# NOTE(review): presumably 'A' marks per-age-group rows rather than totals;
# confirm against the IDB file documentation.
ageSex = ageSex.loc[(ageSex['year'] == currentYear) & (ageSex['totalFlag'] == 'A')]

In [None]:
# Attach the age/sex rows to every country. The left join keeps each row of
# countryCodes even when no age/sex row matches it.
ageSexWorld = countryCodes.merge(ageSex, how='left', on='countryCode')

In [None]:
# Keep only the columns that are written to the output CSV.
ageSexWorld = ageSexWorld.loc[:, [
    'countryCode', 'countryName',
    'startAge', 'isOpen', 'endAge',
    'midYearPopulationMale', 'midYearPopulationFemale',
]]

In [None]:
# Sanity check: no row may be missing its male population value after the left join.
assert ageSexWorld['midYearPopulationMale'].notna().all()

In [None]:
# Sanity check: no row may be missing its female population value after the left join.
assert ageSexWorld['midYearPopulationFemale'].notna().all()

In [None]:
ageSexWorld.head()

In [None]:
ageSexWorld.shape

In [None]:
# Write the per-country age/sex table to
# site-data/census-data/ageSexCountries.csv, omitting the index column.
ageSexWorld.to_csv(join('site-data','census-data', 'ageSexCountries.csv'), index=False)

## Clean up

In [None]:
from os import remove
from shutil import rmtree

In [None]:
# Delete the downloaded zip archive.
remove(join('./site-data', 'census-data.zip'))

In [None]:
# Remove the wget HSTS file from the parent directory, if there is one.
# The path is relative to the current working directory, so the file may not
# exist; a missing file must not abort the rest of the cleanup.
try:
    remove('../.wget-hsts')
except FileNotFoundError:
    pass

In [None]:
# Delete the extracted idbzip folder; the CSV files written directly into
# site-data/census-data are outside it and are kept.
rmtree(join('./site-data', 'census-data', 'idbzip'))

In [None]:
!pwd

In [None]:
!ls site-data/cen