In [23]:
import requests
from bs4 import BeautifulSoup

# Scrape the first-column text of every data row in the first "wikitable"
# on Wikipedia's list of sovereign states, then print the collected names.

# URL of the desktop Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_sovereign_states"

# Fetch the webpage content. requests waits indefinitely unless a timeout is
# given, so bound the wait to keep the cell from hanging on a dead connection.
response = requests.get(url, timeout=30)

# Fail loudly on anything but 200. Raising stops this cell only; calling
# exit() inside a notebook would shut the kernel down and discard its state.
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve webpage: {response.status_code}")

soup = BeautifulSoup(response.text, "html.parser")

# Find the table that contains the list of countries
# (soup.find returns None when nothing matches).
table = soup.find("table", class_="wikitable")
if table is None:
    raise RuntimeError("Could not find the country table!")

# Extract country names from the first column of the table
countries = []
for row in table.find_all("tr")[1:]:  # Skip header row
    cells = row.find_all("td")
    if cells:  # rows without any <td> (e.g. header-only rows) are skipped
        # strip=True trims each text fragment before they are joined
        countries.append(cells[0].get_text(strip=True))

# Print the extracted list of countries
print("Final country list:", countries)


Final country list: ['Afghanistan', 'Albania– Republic of Albania', "Algeria– People's Democratic Republic of Algeria", 'Andorra– Principality of Andorra', 'Angola– Republic of Angola', 'Antigua and Barbuda', 'Argentina– Argentine Republic[i]', 'Armenia– Republic of Armenia', 'Australia– Commonwealth of Australia', 'Austria– Republic of Austria', 'Azerbaijan– Republic of Azerbaijan[k]', 'Bahamas, The– Commonwealth of The Bahamas[14]', 'Bahrain– Kingdom of Bahrain', "Bangladesh– People's Republic of Bangladesh", 'Barbados', 'Belarus– Republic of Belarus', 'Belgium– Kingdom of Belgium', 'Belize', 'Benin– Republic of Benin', 'Bhutan– Kingdom of Bhutan', 'Bolivia– Plurinational State of Bolivia', 'Bosnia and Herzegovina', 'Botswana– Republic of Botswana', 'Brazil– Federative Republic of Brazil', 'Brunei– Brunei Darussalam', 'Bulgaria– Republic of Bulgaria', 'Burkina Faso', 'Burundi– Republic of Burundi', 'Cambodia– Kingdom of Cambodia', 'Cameroon– Republic of Cameroon', 'Canada[l]', 'Cape 

In [25]:
import re

# Hard-coded sample of raw scraped entries, one per line.
# NOTE(review): this holds 26 entries (Afghanistan through Bulgaria); the
# scraped output shown earlier in the notebook continues past Bulgaria, so
# this looks like a partial copy -- confirm whether the full list was intended.
country_list = [
    "Afghanistan",
    "Albania– Republic of Albania",
    "Algeria– People's Democratic Republic of Algeria",
    "Andorra– Principality of Andorra",
    "Angola– Republic of Angola",
    "Antigua and Barbuda",
    "Argentina– Argentine Republic[i]",
    "Armenia– Republic of Armenia",
    "Australia– Commonwealth of Australia",
    "Austria– Republic of Austria",
    "Azerbaijan– Republic of Azerbaijan[k]",
    "Bahamas, The– Commonwealth of The Bahamas[14]",
    "Bahrain– Kingdom of Bahrain",
    "Bangladesh– People's Republic of Bangladesh",
    "Barbados",
    "Belarus– Republic of Belarus",
    "Belgium– Kingdom of Belgium",
    "Belize",
    "Benin– Republic of Benin",
    "Bhutan– Kingdom of Bhutan",
    "Bolivia– Plurinational State of Bolivia",
    "Bosnia and Herzegovina",
    "Botswana– Republic of Botswana",
    "Brazil– Federative Republic of Brazil",
    "Brunei– Brunei Darussalam",
    "Bulgaria– Republic of Bulgaria",
]

# Function to clean country names
def clean_country_names(countries):
    """Reduce raw scraped entries to their short country names.

    For every entry, each ``[...]`` segment (e.g. ``[i]`` or ``[14]``) is
    removed, then only the text before the first en dash ("–") is kept and
    surrounding whitespace is trimmed.

    Args:
        countries: Iterable of raw name strings.

    Returns:
        A new list of cleaned names, in the same order as the input.
    """
    return [
        # partition(...)[0] is everything before the first "–",
        # or the whole string when no "–" is present.
        re.sub(r'\[.*?\]', '', raw_name).partition('–')[0].strip()
        for raw_name in countries
    ]

# Run the cleaner over the hard-coded sample and keep the result
# under a module-level name.
cleaned_country_list = clean_country_names(country_list)

# Show the cleaned names as a plain Python list.
print(cleaned_country_list)


['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas, The', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria']


In [27]:
import re
import pandas as pd  # Import Pandas for DataFrame

# Hard-coded sample of raw scraped entries, one per line.
# NOTE(review): same 26 entries (Afghanistan through Bulgaria) as the literal
# in the earlier cleaning cell; the scraped output shown earlier continues
# past Bulgaria -- confirm whether the full list was intended.
country_list = [
    "Afghanistan",
    "Albania– Republic of Albania",
    "Algeria– People's Democratic Republic of Algeria",
    "Andorra– Principality of Andorra",
    "Angola– Republic of Angola",
    "Antigua and Barbuda",
    "Argentina– Argentine Republic[i]",
    "Armenia– Republic of Armenia",
    "Australia– Commonwealth of Australia",
    "Austria– Republic of Austria",
    "Azerbaijan– Republic of Azerbaijan[k]",
    "Bahamas, The– Commonwealth of The Bahamas[14]",
    "Bahrain– Kingdom of Bahrain",
    "Bangladesh– People's Republic of Bangladesh",
    "Barbados",
    "Belarus– Republic of Belarus",
    "Belgium– Kingdom of Belgium",
    "Belize",
    "Benin– Republic of Benin",
    "Bhutan– Kingdom of Bhutan",
    "Bolivia– Plurinational State of Bolivia",
    "Bosnia and Herzegovina",
    "Botswana– Republic of Botswana",
    "Brazil– Federative Republic of Brazil",
    "Brunei– Brunei Darussalam",
    "Bulgaria– Republic of Bulgaria",
]

# Function to clean country names
# NOTE(review): this repeats the definition from the earlier cleaning cell;
# it is kept here so the cell still runs on its own.
def clean_country_names(countries):
    """Reduce raw scraped entries to their short country names.

    For every entry, each ``[...]`` segment (e.g. ``[i]`` or ``[14]``) is
    removed, then only the text before the first en dash ("–") is kept and
    surrounding whitespace is trimmed.

    Args:
        countries: Iterable of raw name strings.

    Returns:
        A new list of cleaned names, in the same order as the input.
    """
    without_brackets = (re.sub(r'\[.*?\]', '', raw_name) for raw_name in countries)
    # partition(...)[0] is everything before the first "–",
    # or the whole string when no "–" is present.
    return [name.partition('–')[0].strip() for name in without_brackets]

# Clean the hard-coded sample.
cleaned_country_list = clean_country_names(country_list)

# Wrap the cleaned names in a single-column DataFrame.
df = pd.DataFrame({"Country Name": cleaned_country_list})

# Plain-text rendering of the table.
print(df)


              Country Name
0              Afghanistan
1                  Albania
2                  Algeria
3                  Andorra
4                   Angola
5      Antigua and Barbuda
6                Argentina
7                  Armenia
8                Australia
9                  Austria
10              Azerbaijan
11            Bahamas, The
12                 Bahrain
13              Bangladesh
14                Barbados
15                 Belarus
16                 Belgium
17                  Belize
18                   Benin
19                  Bhutan
20                 Bolivia
21  Bosnia and Herzegovina
22                Botswana
23                  Brazil
24                  Brunei
25                Bulgaria


In [40]:
# `soup` is not created in this cell; it must already exist in the kernel.
# Show the parsed document's <title> element.
print(soup.title)

# Flatten the document to plain text and encode it as UTF-8 bytes.
text = soup.get_text().encode('utf-8')

# NOTE(review): the file is named .csv but receives the document's plain text
# as-is, not comma-separated rows.
with open('listcountries.csv', 'wb') as out_file:
    out_file.write(text)

<title>List of sovereign states - Wikipedia</title>


In [42]:
from bs4 import BeautifulSoup

# Minimal stand-in document: a <title> and an empty <body>.
html_content = (
    "<html><head><title>List of sovereign states - Wikipedia</title></head>"
    "<body></body></html>"
)

# Parse the stand-in document.
# NOTE(review): this rebinds `soup`, replacing whatever the name referred to
# before this cell ran.
soup = BeautifulSoup(html_content, 'html.parser')

# Replace the title text, then show the updated element.
soup.title.string = "Countries List"
print(soup.title)  # Output: <title>Countries List</title>

# Flatten the stand-in document to plain text, encode it as UTF-8 and write
# the bytes to disk.
# NOTE(review): 'wb' truncates listcountries.csv, so any earlier contents of
# that file are replaced by this document's text.
text = soup.get_text().encode('utf-8')

with open('listcountries.csv', 'wb') as out_file:
    out_file.write(text)


<title>Countries List</title>


In [44]:
import os
# Look up the process's current working directory and print it.
# NOTE(review): the label says "File saved at:" but the value shown is the
# working directory, not the path of a specific file.
working_dir = os.getcwd()
print("File saved at:", working_dir)


File saved at: C:\Users\TN1\Desktop\career foundry\Visualization w Python\20th_century


In [46]:
import os
# Turn the relative file name into an absolute path and print it.
# abspath only builds the path string; it does not check that the file exists.
csv_path = os.path.abspath("listcountries.csv")
print("File saved at:", csv_path)


File saved at: C:\Users\TN1\Desktop\career foundry\Visualization w Python\20th_century\listcountries.csv


In [49]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the first-column text of every data row in the first "wikitable"
# on Wikipedia's list of sovereign states and save it as a one-column CSV.

# Get the webpage content. requests waits indefinitely unless a timeout is
# given, so bound the wait to keep the cell from hanging on a dead connection.
url = "https://en.wikipedia.org/wiki/List_of_sovereign_states"
response = requests.get(url, timeout=30)

# Stop here on a failed request instead of parsing an error page.
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve webpage: {response.status_code}")

soup = BeautifulSoup(response.text, "html.parser")

# Find the table containing country names.
# soup.find returns None when nothing matches; without this guard the loop
# below would fail with an AttributeError on `table.find_all`.
table = soup.find("table", {"class": "wikitable"})
if table is None:
    raise RuntimeError("Could not find the country table!")

# Extract country names
countries = []
for row in table.find_all("tr")[1:]:  # Skip header row
    cells = row.find_all("td")
    if cells:  # rows without any <td> are skipped
        country_name = cells[0].text.strip()  # First column has country name
        countries.append([country_name])  # one single-item row per country

# Save to CSV
# NOTE(review): names are written exactly as scraped; no bracket or
# long-form-name cleanup is applied in this cell -- confirm that is intended.
output_file = "listcountries.csv"
df = pd.DataFrame(countries, columns=["Country"])
df.to_csv(output_file, index=False, encoding="utf-8")

print("File saved successfully at:", output_file)


File saved successfully at: listcountries.csv
