In [1]:
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states'

# Download the list page. A timeout keeps the notebook from hanging forever on
# a stalled connection, and raise_for_status() stops the run on an HTTP error
# instead of parsing an error page as if it were the list.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

# Parsed list page; the table-scraping cells below read from `soup`.
soup = BeautifulSoup(html_content, 'html.parser')


# 1. List of all countries

In [2]:
# Accumulator for country identifiers; the table-scraping loop in the next
# cell appends one entry per matching row.
countries = []

In [3]:
# Locate the states table on the list page. The class string must match the
# table's `class` attribute exactly as written here.
table = soup.find('table', class_='sortable wikitable')
if table is None:
    raise RuntimeError(
        "Could not find a table with class 'sortable wikitable' on the list "
        "page; the page markup may have changed."
    )
rows = table.find_all('tr')

# Start from an empty list so re-running this cell does not append a second
# copy of every country.
countries = []

# Skip the first three and last two rows (presumably header/footer rows of
# the table - verify against the page markup if the results look off).
for row in rows[3:-2]:
    first_cell = row.find('td')
    if first_cell:
        # The country identifier is taken from the `id` attribute of the
        # first <span> in the row's first data cell.
        first_span = first_cell.find('span')
        if first_span and 'id' in first_span.attrs:
            countries.append(first_span['id'])

In [4]:
# Display the collected country identifiers as the cell output.
countries

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua_and_Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia_and_Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina_Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape_Verde',
 'Central_African_Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo,_Democratic_Republic_of_the',
 'Congo,_Republic_of_the',
 'Costa_Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech_Republic',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican_Republic',
 'East_Timor',
 'Ecuador',
 'Egypt',
 'El_Salvador',
 'Equatorial_Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Hait

# 2. Gathering information about countries into a text file

In [5]:
# hrefs of the country articles, collected from the same table rows (and with
# the same row filter) that produced `countries`, so both lists line up.
links = []


for table_row in rows[3:-2]:
    cell = table_row.find('td')
    if not cell:
        continue
    anchor = cell.find('a')
    marker = cell.find('span')
    # Only rows whose first <span> carries an `id` are treated as countries.
    if marker and 'id' in marker.attrs:
        links.append(anchor['href'])

In [6]:
# Base URL (mobile Wikipedia host) that each collected href is appended to;
# the hrefs in `links` are therefore assumed to be site-relative paths.
wiki_url = 'https://en.m.wikipedia.org'

In [7]:
# Minimum stripped length for a <p> to count as a descriptive paragraph.
MIN_DESCRIPTION_CHARS = 70

country_info = []

for link in links:
    final_url = wiki_url + link
    # Time out instead of hanging, and fail loudly on an HTTP error rather
    # than storing text scraped from an error page.
    response = requests.get(final_url, timeout=30)
    response.raise_for_status()
    # Use a page-local name so the list-page `soup` from the first cell is not
    # overwritten; the table-scraping cells can then be re-run safely.
    page_soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = page_soup.find_all('p')
    # Country pages are laid out differently, so take the first paragraph
    # (after the very first one) that is long enough to be descriptive.
    # Fall back to an empty string when none qualifies: that avoids a
    # NameError on the first page and avoids silently reusing the previous
    # country's paragraph, while still keeping one entry per link so
    # `country_info` stays aligned with `countries`.
    description = next(
        (
            p.text.strip()
            for p in paragraphs[1:]
            if len(p.text.strip()) > MIN_DESCRIPTION_CHARS
        ),
        '',
    )
    country_info.append(description.replace('\n', ' '))
    

In [8]:
# Pair each country identifier with its scraped description.
zipped = zip(countries, country_info)

# Write two lines per country: the identifier, then the description.
with open('output_wiki.txt', 'w', encoding='utf-8') as out_file:
    for name, summary in zipped:
        # Drop a single trailing period (the original note says such dots
        # "would start new lines" - presumably in whatever reads this file).
        trimmed = summary[:-1] if summary.endswith('.') else summary
        out_file.write(name + '\n')
        out_file.write(trimmed + '\n')

# 3. Creating a dataframe using information about each country

In [76]:
# Fields to collect for every country. The scraping cell matches each entry
# against infobox header text using the entry's first and last word.
metrics = [
    'Country',
    'Capital',
    'Official language',
    'Religion',
    'Area',
    'Population',
    'GDP PPP',
    'GDP Nominal',
    'Currency',
]
# One dict per country is appended here by the scraping cell.
result = []

In [77]:
def _normalise_header(text):
    """Lower-case header text and drop regular and non-breaking spaces."""
    return text.lower().replace(' ', '').replace('\u00a0', '')


def _metric_value(th, metric):
    """Return the infobox text belonging to header cell `th`, or None.

    Two layouts are handled:

    * a plain label cell (first class is 'infobox-label') whose value sits in
      the neighbouring 'infobox-data' cell - also used for 'Official language'
      even when its row is a 'mergedtoprow';
    * a header inside a row whose class is exactly ['mergedtoprow'], whose
      values are spread over the following rows up to the next row that
      carries 'mergedtoprow'.

    None is returned when the header fits neither layout or the expected data
    cell is absent, so the caller can skip it instead of storing a value left
    over from an earlier match.
    """
    th_classes = th.get('class') or []  # a <th> may have no class attribute
    is_label = bool(th_classes) and th_classes[0] == 'infobox-label'
    header_row = th.parent
    in_merged_top_row = header_row.get('class') == ['mergedtoprow']

    if is_label and (not in_merged_top_row or metric == 'Official language'):
        data_cell = th.find_next_sibling('td', class_='infobox-data')
        return data_cell.text.strip() if data_cell else None

    if in_merged_top_row:
        sibling_texts = []
        for sibling in header_row.find_next_siblings():
            # Rows without a class attribute are ignored entirely.
            if 'class' not in sibling.attrs:
                continue
            # The next 'mergedtoprow' row starts a different section.
            if 'mergedtoprow' in sibling['class']:
                break
            sibling_text = sibling.get_text(strip=True)
            if sibling_text:  # only keep non-empty text
                sibling_texts.append(sibling_text)
        return ' '.join(sibling_texts)

    return None


# Start from an empty list so re-running this cell does not duplicate rows.
result = []

for link in links:
    final_url = wiki_url + link
    # Time out instead of hanging, and stop on an HTTP error rather than
    # scraping an error page.
    response = requests.get(final_url, timeout=30)
    response.raise_for_status()
    # Page-local name: keeps the list-page `soup` from the first cell intact.
    page_soup = BeautifulSoup(response.text, 'html.parser')
    # The country name comes from the page's first <h1>.
    country = page_soup.find('h1').text
    country_data = {'Country': country}
    # All other metrics come from the infobox; a page without one yields a
    # record holding only the country name.
    infobox = page_soup.find('table', class_='infobox')
    header_cells = infobox.find_all('th') if infobox else []
    # Normalise every header once instead of twice per metric.
    normalised_headers = [(th, _normalise_header(th.text)) for th in header_cells]
    for metric in metrics:
        if metric == 'Country':
            # Already taken from the <h1>; matching it against infobox headers
            # could overwrite the name with unrelated infobox text.
            continue
        words = metric.lower().split()
        first_word, last_word = words[0], words[-1]
        for th, header_text in normalised_headers:
            # A header matches when it contains both the first and the last
            # word of the metric name.
            if first_word in header_text and last_word in header_text:
                value = _metric_value(th, metric)
                if value is not None:
                    # As before, a later matching header overrides an earlier one.
                    country_data[metric] = value
    result.append(country_data)

In [79]:
import pandas as pd

# One row per scraped country, indexed by the country name.
df = pd.DataFrame(result).set_index('Country')
df.head()

Unnamed: 0_level_0,Capital,Official language,Religion,Area,Population,GDP PPP,GDP Nominal,Currency
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Afghanistan,Kabul34°31′N 69°11′E﻿ / ﻿34.517°N 69.183°E﻿ / ...,PashtoDari,99.7% Islam (official)0.3% Others,"• Total652,867[16]km2(252,073 sq mi) (40th) • ...","• 2022 estimate38,346,720[17](37th) • Density4...",• Total$72.911 billion[18](96th) • Per capita$...,• Total$21.657 billion[18](111st) • Per capita...,Afghani (افغانی) (AFN)
Albania,Tirana41°19′N 19°49′E﻿ / ﻿41.317°N 19.817°E﻿ /...,Albanian,59% Islam17% Christianity9% No religion15% Und...,"• Total28,748 km2(11,100 sq mi) (140th) • Wate...","• January 2022 estimate2,793,592[2] • 2011 cen...",• Total$51.1 billion[4](118th) • Per capita$17...,• Total$18.25 billion[4](125th) • Per capita$6...,Lek (ALL)
Algeria,Algiers36°42′N 3°13′E﻿ / ﻿36.700°N 3.217°E﻿ / ...,Arabic\nTamazight,99% Sunni Islam (official)1% other (inc. Chris...,"• Total2,381,741 km2(919,595 sq mi) (10th) • W...","• 2021 estimate44,700,000[5](32nd) • Density17...",• Total$708.568 billion[6](43rd) • Per capita$...,• Total$190.155 billion[6](58th) • Per capita$...,Algerian dinar (DZD)
Andorra,Andorra la Vella42°30′23″N 1°31′17″E﻿ / ﻿42.50...,Catalan[1][a],Christianity (Catholicism),• Total467.63 km2(180.55 sq mi) (178th) • Wate...,"• 2022 estimate79,877[9](203rd) • Density179.8...",,"• TotalUS$3.400 billion[10] • Per capita$41,930",Euro (€)[d] (EUR)
Angola,Luanda8°50′S 13°20′E﻿ / ﻿8.833°S 13.333°E﻿ / -...,Portuguese,92.9% Christianity—53.9% Roman Catholic—27.4% ...,"• Total1,246,700 km2(481,400 sq mi) (22nd) • W...","• 2022 estimate34,795,287[4](42nd) • 2014 cens...",• Total$245.44 billion[6](62nd) • Per capita$7...,• Total$124.79 billion[6](61st) • Per capita$3...,Angolan kwanza (AOA)


In [81]:
# Persist the table as CSV and as an Excel workbook.
df.to_csv('Countries.csv', encoding='utf-8')
# `to_excel` no longer accepts `encoding`: the keyword was unused, deprecated
# in pandas 1.5 and removed in pandas 2.0, where passing it raises TypeError.
df.to_excel('Countries.xlsx')