## Retrieve the Wikipedia page for "Python" and create a list of the links on that page: url = 'https://en.wikipedia.org/wiki/Python'

In [1]:
# 1. import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# 2. keep the address of the page to scrape in a variable
url_python = 'https://en.wikipedia.org/wiki/Python'

In [3]:
# 3. download html with a get request
# The timeout (in seconds) stops the call from waiting indefinitely if the
# server never responds.
response = requests.get(url_python, timeout=10)
response.status_code  # last expression of the cell, so the notebook shows it

200

In [4]:
# 4. parse the downloaded HTML into a BeautifulSoup tree (the 'soup')
soup_python = BeautifulSoup(response.content, features="html.parser")

In [5]:
# Collect every <a> tag on the page that carries an href attribute
links = soup_python.find_all("a", attrs={"href": True})
# links

In [6]:
# Keep only the href values that begin with 'http'
link_urls = []
for anchor in links:
    href = anchor['href']
    if href.startswith('http'):
        link_urls.append(href)
# link_urls

## Create a Python list with the names on the FBI's Ten Most Wanted list: url = 'https://www.fbi.gov/wanted/topten'

In [7]:
# Address of the FBI "top ten" page
url_fbi = "https://www.fbi.gov/wanted/topten"

In [8]:
# First attempt: plain GET without custom headers.
# The timeout (in seconds) stops the call from waiting indefinitely if the
# server never responds.
response = requests.get(url_fbi, timeout=10)
response.status_code  # shown as the cell output

403

In [9]:
# The request without headers returned 403, so retry with a browser-like
# User-Agent header.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
# The timeout (in seconds) stops the call from waiting indefinitely if the
# server never responds.
response = requests.get(url_fbi, headers=headers, timeout=10)
response.status_code  # shown as the cell output

200

In [10]:
# Parse the FBI page into a BeautifulSoup tree
soup_fbi = BeautifulSoup(response.content, features="html.parser")

In [11]:
# solution 1: every <h3> tag whose CSS class is "title"
names = soup_fbi.find_all("h3", attrs={"class": "title"})
# names

In [12]:
# Alternative lookup through a CSS selector that targets the <a> tags inside the
# <h3> headings of the results list.
# NOTE(review): the id in the selector looks auto-generated and may change between
# page versions - confirm before relying on it. The cells below build the list
# from `names`, not from `name`.
name = soup_fbi.select(
    '#query-results-0f737222c5054a81a120bce207b0446a > ul > li > h3 > a'
)

In [13]:
# Take the whitespace-trimmed text of each matched heading
wanted_list = [heading.text.strip() for heading in names]

In [14]:
# Display the collected names as the cell output
wanted_list

['WILVER VILLEGAS-PALOMINO',
 "VITEL'HOMME INNOCENT",
 'ALEJANDRO ROSALES CASTILLO',
 'ALEXIS FLORES',
 'ARNOLDO JIMENEZ',
 'OMAR ALEXANDER CARDENAS',
 'YULAN ADONAY ARCHAGA CARIAS',
 'BHADRESHKUMAR CHETANBHAI PATEL',
 'DONALD EUGENE FIELDS II',
 'RUJA IGNATOVA']

## Create a list of the different kinds of datasets available on data.gov.uk: url = 'https://data.gov.uk/'

In [15]:
# Address of the data.gov.uk landing page
url_uk = "https://data.gov.uk/"

In [16]:
# Download the page.
# The timeout (in seconds) stops the call from waiting indefinitely if the
# server never responds.
response = requests.get(url_uk, timeout=10)
response.status_code  # shown as the cell output

200

In [17]:
# Parse the data.gov.uk page into a BeautifulSoup tree
soup_uk = BeautifulSoup(response.content, features="html.parser")

In [18]:
# soup_uk

In [19]:
# CSS selector for the <h3> headings of the list items under the third child
# of #main-content
types = soup_uk.select(
    '#main-content > div:nth-child(3) > div > ul > li > h3'
)
# types

In [20]:
# Take the whitespace-trimmed text of each matched heading
data_type = [heading.text.strip() for heading in types]

In [21]:
# Display the collected dataset categories as the cell output
data_type

['Business and economy',
 'Crime and justice',
 'Defence',
 'Education',
 'Environment',
 'Government',
 'Government spending',
 'Health',
 'Mapping',
 'Society',
 'Towns and cities',
 'Transport',
 'Digital service performance',
 'Government reference data']

## Display the top 10 languages by number of native speakers in a pandas DataFrame: url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'

In [22]:
# Address of the Wikipedia list of languages by native speakers
url_language = "https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers"

In [23]:
# Download the page.
# The timeout (in seconds) stops the call from waiting indefinitely if the
# server never responds.
response = requests.get(url_language, timeout=10)
response.status_code  # shown as the cell output

200

In [24]:
# Parse the languages page into a BeautifulSoup tree
soup_lag = BeautifulSoup(response.content, features="html.parser")

In [25]:
# Locate the first table on the page that has the CSS class "wikitable"
table = soup_lag.find('table', {'class': 'wikitable'})
if table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # instead of an AttributeError on the find_all call below.
    raise ValueError("no table with class 'wikitable' found on the page")

rows = table.find_all("tr")  # every row of that table, header row included

In [26]:
# Collect (language, native speakers) pairs for the first ten usable data rows
# and put them in a DataFrame.
data = []

for row in rows[1:]:  # rows[0] is the header row
    cols = row.find_all('td')
    # A usable row needs both a language cell and a speakers cell; anything
    # shorter would raise IndexError on cols[1], so skip it.
    if len(cols) < 2:
        continue
    language = cols[0].text.strip()
    speakers = cols[1].text.strip().replace(',', '')  # Remove commas from numbers
    data.append((language, speakers))
    if len(data) == 10:  # stop once ten languages have been collected
        break

df = pd.DataFrame(data, columns=["Language", "Native Speakers"])
df



Unnamed: 0,Language,Native Speakers
0,Mandarin Chinese,939.0
1,Spanish,485.0
2,English,380.0
3,Hindi,345.0
4,Portuguese,236.0
5,Bengali,234.0
6,Russian,147.0
7,Japanese,123.0
8,Yue Chinese,86.1
9,Vietnamese,85.0
