# Python Package Manager Exercises:

## Question 1:
Read this url and find the 10 most frequent words. romeo_and_juliet = 'http://www.gutenberg.org/files/1112/1112.txt'

In [1]:
import requests
import re
from collections import Counter

# Function to clean the text
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation using regular expression
    text = re.sub(r'[^\w\s]', '', text)
    return text

# Function to find the most common words in the text
def find_most_common_words_from_url(url, n):
    # Download the content from the URL
    response = requests.get(url)
    text = response.text

    # Clean the text
    cleaned_text = clean_text(text)

    # Split the text into words
    words = cleaned_text.split()

    # Count word frequencies
    word_counts = Counter(words)

    # Get the n most common words as a list of tuples
    most_common = word_counts.most_common(n)

    return most_common


romeo_and_juliet_url = 'http://www.gutenberg.org/files/1112/1112.txt'

most_common_words = find_most_common_words_from_url(romeo_and_juliet_url, 10)

print(most_common_words)


[('lia', 23), ('a', 17), ('li', 14), ('meta', 13), ('div', 13), ('ul', 10), ('input', 9), ('link', 7), ('gutenberg', 5), ('of', 5)]


## Question 2:
Read the cats API and cats_api = 'https://api.thecatapi.com/v1/breeds' and find :
 * the min, max, mean, median, standard deviation of cats' weight in metric units.
* the min, max, mean, median, standard deviation of cats' lifespan in years.
* Create a frequency table of country and breed of cats

In [5]:
import requests
import numpy as np
import pandas as pd

# Fetch data from the API
cats_api = 'https://api.thecatapi.com/v1/breeds'
response = requests.get(cats_api)
cats_data = response.json()

# Extract weights and lifespans
weights = []
lifespan = []
countries = []
breeds = []

for cat in cats_data:
    # Extract weight in metric units (kg)
    if cat['weight']['metric']:
        weight = cat['weight']['metric'].split(" - ")
        weight_min = float(weight[0]) if weight[0] != "null" else None
        weight_max = float(weight[1]) if weight[1] != "null" else None
        if weight_min and weight_max:
            weights.append((weight_min + weight_max) / 2)  # Use the average weight if both min/max are available

    # Extract lifespan in years
    if cat['life_span']:
        lifespan.append(int(cat['life_span'].split(" ")[0]))  # Extract the first number (age in years)

    # Extract country of origin and breed name
    countries.append(cat['origin'])
    breeds.append(cat['name'])

# Calculate statistics for weight (in kg) and lifespan (in years)
weights = [w for w in weights if w is not None]  # Remove None values from weights

# Weight statistics
weight_min = np.min(weights)
weight_max = np.max(weights)
weight_mean = np.mean(weights)
weight_median = np.median(weights)
weight_std = np.std(weights)

# Lifespan statistics
lifespan_mean = np.mean(lifespan)
lifespan_median = np.median(lifespan)
lifespan_min = np.min(lifespan)
lifespan_max = np.max(lifespan)
lifespan_std = np.std(lifespan)

# Frequency table for country and breed
df = pd.DataFrame({
    'Country': countries,
    'Breed': breeds
})

# Create frequency table
country_breed_frequency = df.groupby(['Country', 'Breed']).size().reset_index(name='Count')

# Output the results
print(f"Weight statistics (in kg):")
print(f"Min: {weight_min:.2f}, Max: {weight_max:.2f}, Mean: {weight_mean:.2f}, Median: {weight_median:.2f}, Standard Deviation: {weight_std:.2f}")
print(f"\nLifespan statistics (in years):")
print(f"Min: {lifespan_min}, Max: {lifespan_max}, Mean: {lifespan_mean:.2f}, Median: {lifespan_median}, Standard Deviation: {lifespan_std:.2f}")
print("\nFrequency table of country and breed:")
print(country_breed_frequency.head())  # Show only the first few rows for brevity


Weight statistics (in kg):
Min: 3.00, Max: 7.50, Mean: 4.71, Median: 4.50, Standard Deviation: 1.06

Lifespan statistics (in years):
Min: 8, Max: 18, Mean: 12.07, Median: 12.0, Standard Deviation: 1.81

Frequency table of country and breed:
     Country             Breed  Count
0  Australia   Australian Mist      1
1      Burma           Burmese      1
2      Burma  European Burmese      1
3     Canada            Cymric      1
4     Canada            Sphynx      1


## Question 3:
Read the countries API and find
* the 10 largest countries
* the 10 most spoken languages
* the total number of languages in the countries API

In [6]:
import requests
from collections import Counter

# Fetch countries data from the API
url = "https://restcountries.com/v3.1/all"  # API endpoint that provides data for all countries
response = requests.get(url)

# Check if the response is successful
if response.status_code != 200:
    print("Error fetching data from the API.")
else:
    countries_data = response.json()

    # --- 1. Find the 10 largest countries by population ---
    countries_population = [(country['name']['common'], country['population']) for country in countries_data]
    countries_population.sort(key=lambda x: x[1], reverse=True)  # Sort by population in descending order
    largest_countries = countries_population[:10]

    print("Top 10 Largest Countries by Population:")
    for country, population in largest_countries:
        print(f"{country}: {population}")

    # --- 2. Find the 10 most spoken languages ---
    language_count = Counter()

    # Loop through each country and count the languages spoken
    for country in countries_data:
        if 'languages' in country:
            for language in country['languages'].values():
                language_count[language] += 1

    # Get the 10 most common languages
    most_spoken_languages = language_count.most_common(10)

    print("\nTop 10 Most Spoken Languages:")
    for language, count in most_spoken_languages:
        print(f"{language}: {count}")

    # --- 3. Count the total number of languages across all countries ---
    total_languages = len(language_count)
    print(f"\nTotal number of unique languages: {total_languages}")


Top 10 Largest Countries by Population:
China: 1402112000
India: 1380004385
United States: 329484123
Indonesia: 273523621
Pakistan: 220892331
Brazil: 212559409
Nigeria: 206139587
Bangladesh: 164689383
Russia: 144104080
Mexico: 128932753

Top 10 Most Spoken Languages:
English: 91
French: 46
Arabic: 25
Spanish: 24
Portuguese: 10
Dutch: 7
Russian: 7
German: 6
Chinese: 5
Italian: 4

Total number of unique languages: 155


## Question 4:
UCI is one of the most common places to get data sets for data science and machine learning. Read the content of UCL (https://archive.ics.uci.edu/ml/datasets.php). Without additional libraries it will be difficult, so you may try it with BeautifulSoup4

In [13]:
import requests
from bs4 import BeautifulSoup

# The URL for UCI Machine Learning Datasets page
url = 'https://archive.ics.uci.edu/ml/datasets.php'

# Send a GET request to fetch the HTML content of the page
response = requests.get(url)

# Check if the request was successful
if response.status_code != 200:
    print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
else:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the main section where datasets are listed
    datasets_section = soup.find_all('tr')  # We can use all 'tr' tags to check the tables
    
    # Initialize a list to store dataset names and links
    datasets = []

    # Loop through all rows in the table (skip headers and other unnecessary rows)
    for row in datasets_section:
        # Find all columns within each row (this corresponds to dataset names and links)
        cols = row.find_all('td')
        if len(cols) > 1:  # This will ensure that we only work with rows that have data
            dataset_name = cols[0].get_text(strip=True)
            # The link to the dataset is within the first column and is wrapped inside an <a> tag
            link = cols[0].find('a')
            if link:
                dataset_link = 'https://archive.ics.uci.edu' + link.get('href')
                datasets.append((dataset_name, dataset_link))

    # Print the first 10 datasets (name and link)
    print("First 10 Datasets from the UCI Repository:")
    for i, dataset in enumerate(datasets[:10]):
        print(f"{i+1}. {dataset[0]} - {dataset[1]}")



Failed to retrieve data from https://archive.ics.uci.edu/ml/datasets.php. Status code: 404
