In [9]:
# Ahmad M. Osman - DS320
import ssl
import urllib
import urllib.request

from bs4 import BeautifulSoup

# --- Step 1: read the Factbook front page and collect country codes/names ---
# NOTE(review): hostname checking and certificate verification are switched
# off for this context, so the HTTPS connections below are not authenticated.
# Confirm this is still needed before reusing the code elsewhere.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Read the HTML from the URL and pass on to BeautifulSoup
url = 'https://www.cia.gov/library/publications/the-world-factbook/'
print("Opening the file connection...")
# The context manager closes the HTTP connection once the body has been read.
with urllib.request.urlopen(url, context=ctx) as uh:
    print("HTTP status",uh.getcode())
    html = uh.read().decode()
print("Reading done. Total {} characters read.".format(len(html)))

soup = BeautifulSoup(html, 'html.parser')
country_codes=[]
country_names=[]

# Every <option> tag contributes one (code, name) pair.
for tag in soup.find_all('option'):
    value = tag.get('value')
    if value is None:
        # An <option> without a value attribute cannot yield a code; skip it
        # rather than failing on None[5:7]. Both lists stay aligned.
        continue
    # Characters 5 and 6 of the value are used as the 2-character country code.
    country_codes.append(value[5:7])
    country_names.append(tag.text)
# Drop the first entry of each list (described as 'World' by the original
# author). Slice deletion is a no-op when no <option> tags were found.
del country_codes[:1]
del country_names[:1]

# --- Step 2: download the text of every country page into a dictionary ---
# Each country page lives at <urlbase><2-character code>.html

# Base URL
urlbase = 'https://www.cia.gov/library/publications/the-world-factbook/geos/'
# Maps country name -> plain text of that country's page
text_data=dict()
# NOTE(review): the range starts at index 1 and stops before the last index,
# so the first and the last remaining entries are never downloaded. Kept
# unchanged; confirm against the live <option> list whether this is intended.
for i in range(1,len(country_names)-1):
    country_html=country_codes[i]+'.html'
    url_to_get=urlbase+country_html
    # Read the HTML from the URL (closing the connection afterwards) and pass
    # it on to BeautifulSoup
    with urllib.request.urlopen(url_to_get, context=ctx) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    txt=soup.get_text()
    text_data[country_names[i]]=txt
    print("Finished loading data for {}".format(country_names[i]))

print ("\n**Finished downloading all text data!**")

Opening the file connection...
HTTP status 200
Reading done. Total 75080 characters read.
Finished loading data for  Afghanistan 
Finished loading data for  Akrotiri 
Finished loading data for  Albania 
Finished loading data for  Algeria 
Finished loading data for  American Samoa 
Finished loading data for  Andorra 
Finished loading data for  Angola 
Finished loading data for  Anguilla 
Finished loading data for  Antarctica 
Finished loading data for  Antigua and Barbuda 
Finished loading data for  Arctic Ocean 
Finished loading data for  Argentina 
Finished loading data for  Armenia 
Finished loading data for  Aruba 
Finished loading data for  Ashmore and Cartier Islands 
Finished loading data for  Atlantic Ocean 
Finished loading data for  Australia 
Finished loading data for  Austria 
Finished loading data for  Azerbaijan 
Finished loading data for  Bahamas, The 
Finished loading data for  Bahrain 
Finished loading data for  Baker Island 
Finished loading data for  Bangladesh 
Finis

Finished loading data for  San Marino 
Finished loading data for  Sao Tome and Principe 
Finished loading data for  Saudi Arabia 
Finished loading data for  Senegal 
Finished loading data for  Serbia 
Finished loading data for  Seychelles 
Finished loading data for  Sierra Leone 
Finished loading data for  Singapore 
Finished loading data for  Sint Maarten 
Finished loading data for  Slovakia 
Finished loading data for  Slovenia 
Finished loading data for  Solomon Islands 
Finished loading data for  Somalia 
Finished loading data for  South Africa 
Finished loading data for  Southern Ocean 
Finished loading data for  South Georgia and South Sandwich Islands 
Finished loading data for  South Sudan 
Finished loading data for  Spain 
Finished loading data for  Spratly Islands 
Finished loading data for  Sri Lanka 
Finished loading data for  Sudan 
Finished loading data for  Suriname 
Finished loading data for  Svalbard 
Finished loading data for  Sweden 
Finished loading data for  Switzer

In [25]:
# Cache the scraped text on disk as a pickle file.
# After the initial web crawl, the data can be read back from local storage
# in the notebook without repeating the crawling steps.

import pickle

# Context managers guarantee that each file handle is flushed and closed.
with open("text_data_CIA_Factobook.p", "wb") as pickle_file:
    pickle.dump(text_data, pickle_file)
# Unpickle and read the data from local storage next time.
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook produced itself.
with open("text_data_CIA_Factobook.p", "rb") as pickle_file:
    text_data = pickle.load(pickle_file)

def convert_float(string):
    """Parse the number at the start of ``string`` and return it as a float.

    The leading run of digits, commas and decimal points is taken as the
    number; commas are treated as thousands separators and removed. Anything
    after that run (for example `` (2017 est.)``) is ignored.

    Parameters
    ----------
    string : str
        Text expected to begin with a number, e.g. ``"2,000 (2"`` or ``"12.5"``.

    Returns
    -------
    float or int
        The parsed value, or ``-1`` when ``string`` is empty, does not start
        with a digit, or its leading run is not a valid number.
    """
    # Empty input or a non-digit first character means "no number here".
    if not string or not string[0].isdigit():
        return -1
    # Find where the leading numeric run ends. This does not depend on a
    # space following the number, so "12,500" and "12,500 (" parse alike.
    end = 0
    while end < len(string) and (string[end].isdigit() or string[end] in ',.'):
        end += 1
    # Remove every thousands separator, not just the first one.
    token = string[:end].replace(',', '')
    try:
        return float(token)
    except ValueError:
        # e.g. "1.2.3" or a digit-like character float() does not accept
        return -1

# Extract the GDP per capita (PPP) figure of every country from the text dump.
# A country is reported as "not found" when the label is missing, when no '$'
# follows it closely, or when the '$' is not directly followed by a number.
import re

GDP_LABEL = 'GDP - per capita (PPP):'
# A figure such as "2,000" or "12.5": one digit, then digits/commas, then an
# optional decimal part. Raw string, so the backslashes reach the regex engine.
GDP_AMOUNT = re.compile(r'\d[\d,]*(?:\.\d+)?')

# Initialize dictionary for holding the data
GDP_PPP = {}
# Iterate over every downloaded country. text_data preserves the order in which
# the pages were stored, and iterating it directly means this cell also works
# when text_data was reloaded from disk without re-running the crawl.
for country, txt in text_data.items():
    gdp = None
    pos = txt.find(GDP_LABEL)
    if pos != -1:  # the label is present on this page
        pos += len(GDP_LABEL)
        # Look for the '$' in the same 10-character window as before: the
        # character right after the label is skipped, the next ten are searched.
        dollar = txt.find('$', pos + 1, pos + 11)
        if dollar != -1:
            # Read the whole amount from the full text so that long values are
            # not cut off by the search window.
            amount = GDP_AMOUNT.match(txt, dollar + 1)
            if amount is not None:
                gdp = float(amount.group().replace(',', ''))
    if gdp is None:
        print("**Could not find GDP/capita data!**")
    else:
        print("GDP/capita (PPP) of {}: {} dollars".format(country, gdp))
        # Insert the data in the dictionary
        GDP_PPP[country] = gdp
print ("\nFinished finding all GDP/capita data")

GDP/capita (PPP) of  Afghanistan : 2000.0 dollars
**Could not find GDP/capita data!**
GDP/capita (PPP) of  Albania : 12500.0 dollars
GDP/capita (PPP) of  Algeria : 15200.0 dollars
GDP/capita (PPP) of  American Samoa : 11200.0 dollars
GDP/capita (PPP) of  Andorra : 49900.0 dollars
GDP/capita (PPP) of  Angola : 6800.0 dollars
GDP/capita (PPP) of  Anguilla : 12200.0 dollars
**Could not find GDP/capita data!**
GDP/capita (PPP) of  Antigua and Barbuda : 26300.0 dollars
**Could not find GDP/capita data!**
GDP/capita (PPP) of  Argentina : 20900.0 dollars
GDP/capita (PPP) of  Armenia : 9500.0 dollars
GDP/capita (PPP) of  Aruba : 25300.0 dollars
**Could not find GDP/capita data!**
**Could not find GDP/capita data!**
GDP/capita (PPP) of  Australia : 50300.0 dollars
GDP/capita (PPP) of  Austria : 49900.0 dollars
GDP/capita (PPP) of  Azerbaijan : 17500.0 dollars
GDP/capita (PPP) of  Bahamas, The : 31200.0 dollars
GDP/capita (PPP) of  Bahrain : 48500.0 dollars
**Could not find GDP/capita data!**
GD

In [28]:
# Extract the internet-user percentage of every country from the text dump.
# A country is reported as "not found" when the label is missing, when the
# 'percent of population: ...%' phrase is not in the 50 characters after the
# label, or when the captured text is not a number ending in a digit.
INTERNET_LABEL = 'Internet users:'
# Captures whatever sits between 'percent of population: ' and the next '%'.
INTERNET_SHARE = re.compile(r'percent of population: ([^%]*)%')

# Initialize dictionary for holding the data
Internet_user = {}
# Iterate over every downloaded country. text_data preserves the order in which
# the pages were stored, and iterating it directly means this cell also works
# when text_data was reloaded from disk without re-running the crawl.
for country, txt in text_data.items():
    share = None
    pos = txt.find(INTERNET_LABEL)
    if pos != -1:
        # Skip the label and the single character after it, then inspect the
        # following 50 characters.
        pos += len(INTERNET_LABEL) + 1
        window = txt[pos:pos+50]
        found = INTERNET_SHARE.search(window)
        if found is not None:
            raw = found.group(1)
            # Accept the capture only if it is non-empty and ends in a digit;
            # an empty capture used to raise IndexError on raw[-1].
            if raw and raw[-1].isdigit():
                try:
                    share = float(raw)
                except ValueError:
                    # Ends in a digit but is not a number as a whole
                    share = None
    if share is None:
        print("**Could not find Internet users data!**")
    else:
        print("Internet users % of {}: {}".format(country, share))
        # Insert the data in the dictionary
        Internet_user[country] = share

print ("\nFinished finding all Internet users data")


Internet users % of  Afghanistan : 10.6
**Could not find Internet users data!**
Internet users % of  Albania : 66.4
Internet users % of  Algeria : 42.9
Internet users % of  American Samoa : 31.3
Internet users % of  Andorra : 97.9
Internet users % of  Angola : 13.0
Internet users % of  Anguilla : 81.6
Internet users % of  Antarctica : 100.0
Internet users % of  Antigua and Barbuda : 65.2
**Could not find Internet users data!**
Internet users % of  Argentina : 70.2
Internet users % of  Armenia : 62.0
Internet users % of  Aruba : 93.5
**Could not find Internet users data!**
**Could not find Internet users data!**
Internet users % of  Australia : 88.2
Internet users % of  Austria : 84.3
Internet users % of  Azerbaijan : 78.2
Internet users % of  Bahamas, The : 80.0
Internet users % of  Bahrain : 98.0
**Could not find Internet users data!**
Internet users % of  Bangladesh : 18.2
Internet users % of  Barbados : 79.5
Internet users % of  Belarus : 71.1
Internet users % of  Belgium : 86.5
Int

In [29]:
# Inspect the collected mapping of country name -> internet-user percentage
Internet_user

{' Afghanistan ': 10.6,
 ' Albania ': 66.4,
 ' Algeria ': 42.9,
 ' American Samoa ': 31.3,
 ' Andorra ': 97.9,
 ' Angola ': 13.0,
 ' Anguilla ': 81.6,
 ' Antarctica ': 100.0,
 ' Antigua and Barbuda ': 65.2,
 ' Argentina ': 70.2,
 ' Armenia ': 62.0,
 ' Aruba ': 93.5,
 ' Australia ': 88.2,
 ' Austria ': 84.3,
 ' Azerbaijan ': 78.2,
 ' Bahamas, The ': 80.0,
 ' Bahrain ': 98.0,
 ' Bangladesh ': 18.2,
 ' Barbados ': 79.5,
 ' Belarus ': 71.1,
 ' Belgium ': 86.5,
 ' Belize ': 44.6,
 ' Benin ': 12.0,
 ' Bermuda ': 98.0,
 ' Bhutan ': 41.8,
 ' Bolivia ': 39.7,
 ' Bosnia and Herzegovina ': 69.3,
 ' Botswana ': 39.4,
 ' Brazil ': 59.7,
 ' British Virgin Islands ': 43.6,
 ' Brunei ': 71.2,
 ' Bulgaria ': 59.8,
 ' Burkina Faso ': 14.0,
 ' Burma ': 25.1,
 ' Burundi ': 5.2,
 ' Cabo Verde ': 48.2,
 ' Cambodia ': 25.6,
 ' Cameroon ': 25.0,
 ' Canada ': 89.8,
 ' Cayman Islands ': 79.0,
 ' Central African Republic ': 4.6,
 ' Chad ': 5.0,
 ' Chile ': 66.0,
 ' China ': 53.2,
 ' Christmas Island ': 35.8,
 ' 

In [30]:
# Inspect the collected mapping of country name -> GDP per capita (PPP) in dollars
GDP_PPP

{' Afghanistan ': 2000.0,
 ' Albania ': 12500.0,
 ' Algeria ': 15200.0,
 ' American Samoa ': 11200.0,
 ' Andorra ': 49900.0,
 ' Angola ': 6800.0,
 ' Anguilla ': 12200.0,
 ' Antigua and Barbuda ': 26300.0,
 ' Argentina ': 20900.0,
 ' Armenia ': 9500.0,
 ' Aruba ': 25300.0,
 ' Australia ': 50300.0,
 ' Austria ': 49900.0,
 ' Azerbaijan ': 17500.0,
 ' Bahamas, The ': 31200.0,
 ' Bahrain ': 48500.0,
 ' Bangladesh ': 4200.0,
 ' Barbados ': 18700.0,
 ' Belarus ': 18900.0,
 ' Belgium ': 46600.0,
 ' Belize ': 8300.0,
 ' Benin ': 2300.0,
 ' Bermuda ': 99400.0,
 ' Bhutan ': 8700.0,
 ' Bolivia ': 7500.0,
 ' Bosnia and Herzegovina ': 12700.0,
 ' Botswana ': 17800.0,
 ' Brazil ': 15600.0,
 ' British Virgin Islands ': 34200.0,
 ' Brunei ': 78200.0,
 ' Bulgaria ': 21700.0,
 ' Burkina Faso ': 1900.0,
 ' Burma ': 6200.0,
 ' Burundi ': 700.0,
 ' Cabo Verde ': 6900.0,
 ' Cambodia ': 4000.0,
 ' Cameroon ': 3700.0,
 ' Canada ': 48300.0,
 ' Cayman Islands ': 43800.0,
 ' Central African Republic ': 700.0,
 ' 

In [35]:
# Saving into CSV: one row per country that has BOTH an internet-user
# percentage and a GDP/capita figure.
import csv

# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it some platforms write an extra blank line after each row.
with open('InternetUsers-GDPPerCapita.csv', mode='w', newline='') as csv_file:
    fieldnames = ['Country', 'Internet Users %', 'GDP/Capita']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    for country, internet_share in Internet_user.items():
        # Skip countries without a GDP figure. An explicit membership test
        # replaces the former bare `except`, so real write errors are no
        # longer silently swallowed.
        if country not in GDP_PPP:
            continue
        writer.writerow({
            'Country': country.strip(),  # names carry surrounding spaces
            'Internet Users %': internet_share,
            'GDP/Capita': GDP_PPP[country],
        })