In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl 
from pathlib import Path
import pickle

In [31]:
def get_html():
    context = ssl.create_default_context();
    context.check_hostname = False;
    context.verify_mode = ssl.CERT_NONE;
    #Reading HTML from the URL 
    url = 'https://www.cia.gov/library/publications/the-world-factbook/'
    print("Opening the file connection...")
    uh= urllib.request.urlopen(url, context=context)
    print("HTTP status",uh.getcode())
    html =uh.read().decode()
    print(f"Reading done. Total {len(html)} characters read.")
    return html;

In [33]:
def get_country_codes_names():    
    #passing the html to BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    country_codes=[]
    country_names=[]
    for tag in soup.find_all('option'):
        country_codes.append(tag.get('value')[5:7])
        country_names.append(tag.text)
    return country_codes , country_names;

In [37]:
def get_indv_contry_data():
    urlbase = 'https://www.cia.gov/library/publications/the-world-factbook/geos/'
    text_data=dict() # Empty data dictionary
    for i in range(1,len(country_names)-1): # Iterate over every country
        country_html=country_codes[i]+'.html'
        url_to_get=urlbase+country_html
        # Read the HTML from the URL and pass on to BeautifulSoup
        html = urllib.request.urlopen(url_to_get, context=context).read()
        soup = BeautifulSoup(html, 'html.parser')
        txt=soup.get_text()
        text_data[country_names[i]]=txt
        print(f"Finished loading data for {country_names[i]}") 
    print ("\n**Finished downloading all text data!**")
    return text_data;

In [39]:
my_file = Path("text_data_CIA_Factobook.p")
if my_file.is_file():
    text_data = pickle.load(open("text_data_CIA_Factobook.p", "rb"))
else:
    html = get_html();
    country_codes , country_names = get_country_codes_names();
    indv_contry_data = get_indv_contry_data();
    pickle.dump(indv_contry_data,open("text_data_CIA_Factobook.p", "wb"))

Opening the file connection...
HTTP status 200
Reading done. Total 75036 characters read.
Finished loading data for  World 
Finished loading data for  Afghanistan 
Finished loading data for  Akrotiri 
Finished loading data for  Albania 
Finished loading data for  Algeria 
Finished loading data for  American Samoa 
Finished loading data for  Andorra 
Finished loading data for  Angola 
Finished loading data for  Anguilla 
Finished loading data for  Antarctica 
Finished loading data for  Antigua and Barbuda 
Finished loading data for  Arctic Ocean 
Finished loading data for  Argentina 
Finished loading data for  Armenia 
Finished loading data for  Aruba 
Finished loading data for  Ashmore and Cartier Islands 
Finished loading data for  Atlantic Ocean 
Finished loading data for  Australia 
Finished loading data for  Austria 
Finished loading data for  Azerbaijan 
Finished loading data for  Bahamas, The 
Finished loading data for  Bahrain 
Finished loading data for  Baker Island 
Finished l

Finished loading data for  San Marino 
Finished loading data for  Sao Tome and Principe 
Finished loading data for  Saudi Arabia 
Finished loading data for  Senegal 
Finished loading data for  Serbia 
Finished loading data for  Seychelles 
Finished loading data for  Sierra Leone 
Finished loading data for  Singapore 
Finished loading data for  Sint Maarten 
Finished loading data for  Slovakia 
Finished loading data for  Slovenia 
Finished loading data for  Solomon Islands 
Finished loading data for  Somalia 
Finished loading data for  South Africa 
Finished loading data for  Southern Ocean 
Finished loading data for  South Georgia and South Sandwich Islands 
Finished loading data for  South Sudan 
Finished loading data for  Spain 
Finished loading data for  Spratly Islands 
Finished loading data for  Sri Lanka 
Finished loading data for  Sudan 
Finished loading data for  Suriname 
Finished loading data for  Svalbard 
Finished loading data for  Swaziland 
Finished loading data for  Swed

In [40]:
def convert_float(string):  
    if string.isnumeric():
        return float(string)
    if string[0].isdigit():
        if ',' not in string and '.' in string:
            return float(string)
        if ',' not in string and '.' not in string:
            idx=string.find(' ')
            result = string[:idx]
            return float(result)
        idx1=string.find(',')
        idx2=string.find(' ')
        result = string[:idx1]+string[idx1+1:idx2]
        return float(result)
    else:
        return (-1)

In [42]:
def removekey(d, key):
    r = dict(d)
    del r[key]
    return r

In [45]:
indv_contry_data=removekey(indv_contry_data,' World ')


KeyError: ' World '

In [46]:
del country_names[0]
del country_names[0]
del country_codes[0]
del country_codes[0]

In [47]:
# Initialize dictionary for holding the data
Total_GDP_PPP = {}
# Iterate over every country
for i in range(1,len(country_names)-1):
    country= country_names[i]
    txt=indv_contry_data[country]       
    pos = txt.find('GDP (purchasing power parity):')
    if pos!=-1: 
        pos= pos+len('GDP (purchasing power parity):')
        string = txt[pos+1:pos+15]
        start = re.search('\$',string)
        end = re.search('[b,t]',string)
        if (start!=None and end!=None):
            start=start.start()
            end=end.start()
            a=string[start+1:start+end-1]
            print(a)
            a = convert_float(a)
            if (string[end]=='t'):
                # If the GDP was in trillions, multiply it by 1000
                a=1000*a
            #print(f"Total GDP (PPP) of {country}: {a} billion")
            # Insert the data in the dictionary
            Total_GDP_PPP[country]=a
        else:
            print("**Could not find GDP data!**")
    else:
        print("**Could not find GDP data!**")

print ("\nFinished finding all GDP (Purchasing Power Parity) (in billion $) data")

**Could not find GDP data!**
35.87
629.3
**Could not find GDP data!**
3.327
192
**Could not find GDP data!**
**Could not find GDP data!**
2.39
**Could not find GDP data!**
911.5
27.21
2.516
**Could not find GDP data!**
**Could not find GDP data!**
1.235
434.1
166.8
9.339
69.77
**Could not find GDP data!**
686.5
4.919
175.9
526.4
3.23
25.29
5.198
7.011
83.5
43.85
39.55
**Could not find GDP data!**
3.219
**Could not find GDP data!**
**Could not find GDP data!**
32.91
152.4
35.68
330.9
7.985
3.734
64.21
81.55
1.764
2.507
3.395
29.64
452.1
23.12
NA
Agricu
**Could not find GDP data!**
**Could not find GDP data!**
712.5
1.323
67.99
29.16
**Could not find GDP data!**
**Could not find GDP data!**
85.2
96.27
100.2
132.9
3.128
31.19
372.6
285.5
**Could not find GDP data!**
3.64
**Could not find GDP data!**
172.6
188.5
1.199
56.9
29.38
9.631
41.2
195.8
**Could not find GDP data!**
2.001
8.647
242.4
2.826
5.49
**Could not find GDP data!**
36.75
3.582
**Could not find GDP data!**
39.32
4.15
130.2
2

In [55]:
df_GDP=pd.Series(Total_GDP_PPP).to_frame()
df_GDP.columns=['GDP (PPP)']
df_GDP.index.name='COUNTRY'

In [56]:
df_GDP

Unnamed: 0_level_0,GDP (PPP)
COUNTRY,Unnamed: 1_level_1
Albania,35.870
Algeria,629.300
Andorra,3.327
Angola,192.000
Antigua and Barbuda,2.390
Argentina,911.500
Armenia,27.210
Aruba,2.516
Australia,1235.000
Austria,434.100
