In [89]:
import geopandas
from bs4 import BeautifulSoup
import pandas
import requests
import time
import random
import numpy

In [90]:
path = geopandas.datasets.get_path("naturalearth_lowres")
world = geopandas.read_file(path)

# Replace country names to match CIA names

In [92]:
country_map = {'United States of America': 'United States',
               'Dem. Rep. Congo': 'Congo, Democratic Republic of the',
               'Dominican Rep.': 'Dominican Republic',
               'Bahamas': 'Bahamas, The',
               "Côte d'Ivoire": "Cote d'Ivoire",
               'Central African Rep.': 'Central African Republic',
               'Congo': 'Congo, Republic of the',
               'Eq. Guinea' : 'Equatorial Guinea',
               'eSwatini': 'Eswatini',
               'Gambia': 'Gambia, The',
               'Myanmar': 'Burma',
               'South Korea': 'Korea, South',
               'Turkey': 'Turkey (Turkiye)',
               'Bosnia and Herz.': 'Bosnia and Herzegovina',
               'S. Sudan': 'South Sudan'}

def replace_names(name):
    if name in country_map.keys():
        name = country_map[name]
    return name

world['name'] = world['name'].map(replace_names).unique()

# Get From CIA

In [93]:
ret = requests.get('https://www.cia.gov/the-world-factbook/references/guide-to-country-comparisons/')
soup = BeautifulSoup(ret.text, 'html.parser')

In [94]:
links = soup.find_all("a", {"class": "link-button bold"})

In [95]:
for link in links:
    name = link.text    
    try:
        table = pandas.read_html('https://www.cia.gov/' + link['href'])[0]
    except ValueError as e:
        print(e)
        continue
    table = table.drop(columns=['Rank', 'Date of Information'] )
    table = table.rename(columns={table.columns[1]: name, 'Country':'name'})
    try:
        world = world.join(table.set_index('name'), on='name')
    except ValueError as e:
        print(e)
        continue            
    time.sleep(random.random())    

No tables found
No tables found
columns overlap but no suffix specified: Index(['Youth unemployment rate (ages 15-24)'], dtype='object')


# Removing dollar symbol

In [100]:
def remove_dollar(row):
    if isinstance(row, (float, int)):
        return row
    row = row.replace('$','')
    row = row.replace(',','')
    row = row.strip()
    return row

In [101]:
keys = ['Real GDP (purchasing power parity)', 
        'Real GDP per capita', 
        'Current account balance', 
        'Exports', 
        'Imports', 
        'Reserves of foreign exchange and gold', 
        'Debt - external']
for key in keys:
    world[key] = world[key].map(remove_dollar).astype(float)

In [None]:
world[world['Population']==-99]

Unnamed: 0,pop_est,continent,name,iso_a3,gdp_md_est,geometry,Area,Population,Median age,Population growth rate,...,Telephones - fixed lines,Telephones - mobile cellular,Internet users,Broadband - fixed subscriptions,Airports,Railways,Roadways,Waterways,Merchant marine,Military expenditures
2,603253.0,Africa,W. Sahara,ESH,907,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948...",-99.0,-99.0,-99.0,-99.0,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
20,3398.0,South America,Falkland Is.,FLK,282,"POLYGON ((-61.20000 -51.85000, -60.00000 -51.2...",-99.0,-99.0,-99.0,-99.0,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
23,140.0,Seven seas (open ocean),Fr. S. Antarctic Lands,ATF,16,"POLYGON ((68.93500 -48.62500, 69.58000 -48.940...",-99.0,-99.0,-99.0,-99.0,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
79,4685306.0,Asia,Palestine,PSE,16276,"POLYGON ((35.39756 31.48909, 34.92741 31.35344...",-99.0,-99.0,-99.0,-99.0,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
95,25666161.0,Asia,North Korea,PRK,40000,"MULTIPOLYGON (((130.78000 42.22001, 130.78000 ...",-99.0,-99.0,-99.0,-99.0,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
135,669823.0,Oceania,Solomon Is.,SLB,1589,"MULTIPOLYGON (((162.11902 -10.48272, 162.39865...",-99.0,-99.0,-99.0,-99.0,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
159,4490.0,Antarctica,Antarctica,ATA,898,"MULTIPOLYGON (((-48.66062 -78.04702, -48.15140...",14200000.0,-99.0,-99.0,-99.0,...,-99.0,-99.0,4400.0,-99.0,17.0,-99.0,-99.0,-99.0,-99.0,-99.0
160,326000.0,Asia,N. Cyprus,CYN,3600,"POLYGON ((32.73178 35.14003, 32.80247 35.14550...",-99.0,-99.0,-99.0,-99.0,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
167,5096159.0,Africa,Somaliland,SOL,17836,"POLYGON ((48.94820 11.41062, 48.94820 11.41062...",-99.0,-99.0,-99.0,-99.0,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0


In [83]:
world.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 55 columns):
 #   Column                                                  Non-Null Count  Dtype   
---  ------                                                  --------------  -----   
 0   pop_est                                                 177 non-null    float64 
 1   continent                                               177 non-null    object  
 2   name                                                    177 non-null    object  
 3   iso_a3                                                  177 non-null    object  
 4   gdp_md_est                                              177 non-null    int64   
 5   geometry                                                177 non-null    geometry
 6   Area                                                    177 non-null    float64 
 7   Population                                              177 non-null    float64 
 8   Median age            

In [103]:
world = world.fillna(-99)
#world.to_file('world.gpkg')