# EXERCISES: LEVEL 20

In [1]:
import requests
from collections import Counter
import re

# URL of the text file
url = 'http://www.gutenberg.org/files/1112/1112.txt'

# Download the text from the URL
response = requests.get(url)
text = response.text

# Function to preprocess the text and find the 10 most frequent words
def find_most_frequent_words(text, top_n=10):
    # Remove non-alphabetic characters and convert to lowercase
    words = re.findall(r'\b\w+\b', text.lower())
    
    # Count the frequency of each word
    word_counts = Counter(words)
    
    # Get the most common words
    most_common_words = word_counts.most_common(top_n)
    
    return most_common_words

# Find the 10 most frequent words
most_frequent_words = find_most_frequent_words(text)

# Print the result
print("Top 10 most frequent words:")
for word, count in most_frequent_words:
    print(f"{word}: {count}")


Top 10 most frequent words:
a: 70
li: 60
href: 40
class: 22
html: 20
gutenberg: 20
content: 14
div: 14
help: 14
meta: 13


# TASK 2

In [2]:
import numpy as np

# API URL for cat breeds
cats_api = 'https://api.thecatapi.com/v1/breeds'

# Fetch data from the API
response = requests.get(cats_api)
breeds_data = response.json()

# Extract weights from the data and convert to metric units
weights = [breed['weight']['metric'] for breed in breeds_data if 'weight' in breed]

# Convert weights to numeric values
weights_numeric = [float(weight.split()[0]) for weight in weights]

# Calculate statistics
min_weight = np.min(weights_numeric)
max_weight = np.max(weights_numeric)
mean_weight = np.mean(weights_numeric)
median_weight = np.median(weights_numeric)
std_dev_weight = np.std(weights_numeric)

# Print the results
print(f"Min Weight: {min_weight} kg")
print(f"Max Weight: {max_weight} kg")
print(f"Mean Weight: {mean_weight} kg")
print(f"Median Weight: {median_weight} kg")
print(f"Standard Deviation of Weight: {std_dev_weight} kg")


Min Weight: 2.0 kg
Max Weight: 5.0 kg
Mean Weight: 3.2238805970149254 kg
Median Weight: 3.0 kg
Standard Deviation of Weight: 0.8779367862598653 kg


In [3]:
import numpy as np

# API URL for cat breeds
cats_api = 'https://api.thecatapi.com/v1/breeds'

# Fetch data from the API
response = requests.get(cats_api)
breeds_data = response.json()

# Extract lifespan data from the data
lifespans = [breed.get('life_span') for breed in breeds_data if 'life_span' in breed]

# Convert lifespans to numeric values (extracting the average if there's a range)
lifespans_numeric = []
for lifespan in lifespans:
    if '-' in lifespan:
        start, end = map(int, lifespan.split('-'))
        lifespans_numeric.append((start + end) / 2)
    else:
        lifespans_numeric.append(float(lifespan))

# Calculate statistics
min_lifespan = np.min(lifespans_numeric)
max_lifespan = np.max(lifespans_numeric)
mean_lifespan = np.mean(lifespans_numeric)
median_lifespan = np.median(lifespans_numeric)
std_dev_lifespan = np.std(lifespans_numeric)

# Print the results
print(f"Min Lifespan: {min_lifespan} years")
print(f"Max Lifespan: {max_lifespan} years")
print(f"Mean Lifespan: {mean_lifespan} years")
print(f"Median Lifespan: {median_lifespan} years")
print(f"Standard Deviation of Lifespan: {std_dev_lifespan} years")


Min Lifespan: 10.5 years
Max Lifespan: 19.0 years
Mean Lifespan: 13.746268656716419 years
Median Lifespan: 13.5 years
Standard Deviation of Lifespan: 1.5725564658451314 years


In [4]:
# API URL for cat breeds
cats_api = 'https://api.thecatapi.com/v1/breeds'

# Fetch data from the API
response = requests.get(cats_api)
breeds_data = response.json()

# Extract country and breed data
country_breed_data = [(breed.get('origin', 'Unknown'), breed.get('name', 'Unknown')) for breed in breeds_data]

# Create a frequency table
frequency_table = Counter(country_breed_data)

# Print the results
print("Frequency Table of Country and Breed of Cats:")
for (country, breed), count in frequency_table.items():
    print(f"{country}: {breed} - {count} occurrences")


Frequency Table of Country and Breed of Cats:
Egypt: Abyssinian - 1 occurrences
Greece: Aegean - 1 occurrences
United States: American Bobtail - 1 occurrences
United States: American Curl - 1 occurrences
United States: American Shorthair - 1 occurrences
United States: American Wirehair - 1 occurrences
United Arab Emirates: Arabian Mau - 1 occurrences
Australia: Australian Mist - 1 occurrences
United States: Balinese - 1 occurrences
United States: Bambino - 1 occurrences
United States: Bengal - 1 occurrences
France: Birman - 1 occurrences
United States: Bombay - 1 occurrences
United Kingdom: British Longhair - 1 occurrences
United Kingdom: British Shorthair - 1 occurrences
Burma: Burmese - 1 occurrences
United Kingdom: Burmilla - 1 occurrences
United States: California Spangled - 1 occurrences
United States: Chantilly-Tiffany - 1 occurrences
France: Chartreux - 1 occurrences
Egypt: Chausie - 1 occurrences
United States: Cheetoh - 1 occurrences
United States: Colorpoint Shorthair - 1 occ

# TASK 3

In [6]:
import requests

# API URL for countries
countries_api = 'https://restcountries.com/v2/all'

# Fetch data from the API
response = requests.get(countries_api)
countries_data = response.json()

# I. Find the 10 largest countries
largest_countries = sorted(countries_data, key=lambda x: x.get('area', 0), reverse=True)[:10]

print("10 Largest Countries:")
for country in largest_countries:
    print(f"{country['name']}: {country.get('area')} sq km")


10 Largest Countries:
Russian Federation: 17124442.0 sq km
Antarctica: 14000000.0 sq km
Canada: 9984670.0 sq km
China: 9640011.0 sq km
United States of America: 9629091.0 sq km
Brazil: 8515767.0 sq km
Australia: 7692024.0 sq km
India: 3287590.0 sq km
Argentina: 2780400.0 sq km
Kazakhstan: 2724900.0 sq km


In [7]:
# II. Find the 10 most spoken languages
all_languages = [language['name'] for country in countries_data for language in country.get('languages', [])]
most_spoken_languages = dict(sorted(Counter(all_languages).items(), key=lambda x: x[1], reverse=True)[:10])

print("\n10 Most Spoken Languages:")
for language, count in most_spoken_languages.items():
    print(f"{language}: {count} countries")



10 Most Spoken Languages:
English: 91 countries
French: 45 countries
Arabic: 25 countries
Spanish: 24 countries
Portuguese: 10 countries
Russian: 8 countries
Dutch: 8 countries
German: 7 countries
Chinese: 5 countries
Serbian: 4 countries


In [8]:
# III. Find the total number of languages
unique_languages = set(all_languages)
total_languages = len(unique_languages)

print(f"\nTotal Number of Languages: {total_languages}")


Total Number of Languages: 123


# TASK 5

In [12]:
import requests
from bs4 import BeautifulSoup

# URL of the UCI Machine Learning Repository
uci_url = 'https://archive.ics.uci.edu/ml/datasets.php'

# Send a GET request to the URL
response = requests.get(uci_url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract information, for example, titles of datasets
    dataset_titles = [title.text.strip() for title in soup.select('.normal a b')]

    # Print the titles
    for i, title in enumerate(dataset_titles, start=1):
        print(f"{i}. {title}")

else:
    print(f"Failed to fetch content. Status code: {response.status_code}")


Failed to fetch content. Status code: 404
