# Tasks for laboratory assignment 1

In [4]:
# imports section

import requests
from bs4 import BeautifulSoup
import json

import pandas as pd
from datetime import datetime
import csv
import numpy as np
import matplotlib.pyplot as plt

## Extract webpage data given the url

Create a Python script that performs basic web scraping on a page to extract all the information into text and returns it as a string.
String should not contain tags.

In [8]:

import re



def parse_web_page(url: str):
    """
    Fetch the content of the given web page.

    Args:
        url (str): The URL of the web page to fetch.

    Returns:
        str: The content of the page as a string, without HTML tags.

    Raises:
        requests.exceptions.HTTPError: If the HTTP request returned an unsuccessful status code.
    """

    # Define a User-Agent header to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # 1. Fetch the webpage content, now with headers
        response = requests.get(url, headers=headers)

        # 2. Raise an error if the request was unsuccessful
        response.raise_for_status()

        # 3. Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # 4. Extract all text from the page.
        text = soup.get_text()

        # 5. (Optional but recommended) Clean up excessive whitespace
        # This line requires the 'import re' module
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err} - URL: {url}")
        raise
    except requests.exceptions.ConnectionError as conn_err:
        print(f"Connection error occurred: {conn_err} - URL: {url}")
        print("Please check your internet connection or if the domain is correct.")
        raise
    except Exception as err:
        print(f"An other error occurred: {err} - URL: {url}")
        raise # Re-raise any other errors


# Example 1: FMI site
try:
    print(parse_web_page('https://fmi.chnu.edu.ua/')[:255])
except Exception as e:
    print(f"Could not parse FMI site. Error: {e}")

print("-" * 20) # Separator

# Example 2: Wikipedia
try:
    print(parse_web_page('https://en.wikipedia.org/wiki/Web_scraping')[:255])
except Exception as e:
    print(f"Could not parse Wikipedia site. Error: {e}")

Головна - Факультет математики та інформатики Перейти до основного вмісту [email protected] 58012, Україна, м. Чернівці, вул. Університетська, 28 Новини Всі Загальні Оголошення Події Студенту Викладачу Вітання Діяльність Наукова Конференції Семінари Аспір
--------------------
Web scraping - Wikipedia Jump to content Main menu Main menu move to sidebar hide Navigation Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us Contribute HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages Search S


## Get data from the API

Create a python script that performs basic request to API endpoint and saves that data to a JSON file `result.json`.

In [9]:


def parse_api(api_url: str):
    """
    Fetch the data of the given API endpoint and save it to result.json.

    Args:
        api_url (str): The URL of the API endpoint.

    Returns:
        None.

    Raises:
        requests.exceptions.HTTPError: If the HTTP request returned an unsuccessful status code.
        requests.exceptions.JSONDecodeError: If the response is not valid JSON.
    """

    # Define headers to mimic a browser.
    # Some APIs, like GitHub's, appreciate a User-Agent.
    # 'Accept: application/json' also clearly states we want JSON.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/json'
    }

    output_filename = 'result.json'

    try:
        # 1. Fetch the API data using requests.get()
        response = requests.get(api_url, headers=headers)

        # 2. Raise an error for bad responses (e.g., 404, 403, 500)
        # This fulfills the "Raises: HTTPError" requirement.
        response.raise_for_status()

        # 3. Parse the response as JSON.
        # This will raise a JSONDecodeError if the server's response
        # is not valid JSON (e.g., if it's HTML).
        data = response.json()

        # 4. Save the parsed data to the 'result.json' file
        # 'w' mode means "write", which overwrites the file each time.
        # 'encoding='utf-8'' is standard for handling international characters.
        # 'indent=4' makes the final .json file nicely formatted and readable.
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4)

        print(f"✅ Successfully fetched data from {api_url} and saved to {output_filename}")

    except requests.exceptions.HTTPError as http_err:
        print(f"❌ HTTP error occurred: {http_err} - URL: {api_url}")
        raise # Re-raise the exception as required by the docstring

    except requests.exceptions.JSONDecodeError as json_err:
        print(f"❌ Failed to decode JSON from response. - URL: {api_url}")
        print(f"   Error detail: {json_err}")
        print("   This often happens if the URL is a website (HTML) instead of an API (JSON).")
        # Show the first 150 chars of the (non-JSON) response
        print(f"   Response text started with: {response.text[:150]}...")
        raise # Re-raise the exception

    except requests.exceptions.RequestException as req_err:
        # Catch other request-related errors (e.g., connection timed out)
        print(f"❌ A request error occurred: {req_err} - URL: {api_url}")
        raise

    except Exception as err:
        print(f"❌ An other unexpected error occurred: {err} - URL: {api_url}")
        raise

# --- Your example calls ---

# 1. GitHub API (This should work successfully)
try:
    print("Attempting to parse GitHub API...")
    parse_api('https://api.github.com/')
except Exception as e:
    print(f"Could not parse GitHub API. Final error: {e}\n")

print("-" * 20)


# Second example: A public JSON test API
# This will overwrite the result.json file with new data.
try:
    print("Attempting to parse JSONPlaceholder API...")
    parse_api('https://jsonplaceholder.typicode.com/posts/1')
except Exception as e:
    print(f"Could not parse JSONPlaceholder API. Final error: {e}\n")

Attempting to parse GitHub API...
✅ Successfully fetched data from https://api.github.com/ and saved to result.json
--------------------
Attempting to parse JSONPlaceholder API...
✅ Successfully fetched data from https://jsonplaceholder.typicode.com/posts/1 and saved to result.json


## Parse the json file

Parse the `weather.json` file and return weather data for a specific date, that is given as a parameter. Return the data as an array.

In [13]:
import os
def parse_json(date: str):
    """
    Parse the data from weather.json file and return weather data for a given date.

    Args:
        date (str): The date for which we look up the weather.

    Returns:
        list: A list containing the weather data dictionary for the given date.
              Returns an empty list if the date is not found or an error occurs.
    """

    # 1. Construct the path to the file in the 'resources' directory
    file_path = os.path.join('resources', 'weather.json')

    try:
        # 2. Open and load the entire JSON structure
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # 3. Access the list of daily forecasts.
        # Use .get() to avoid errors if the 'daily' key is missing
        daily_forecasts = data.get('daily', [])

        # 4. Loop through the list to find the matching date
        for forecast in daily_forecasts:
            # Use .get('date') to avoid an error if a forecast item
            # is malformed and missing its 'date' key
            if forecast.get('date') == date:
                # 5. Found it. Return it as a list, as per the docstring.
                return [forecast]

        # 6. If the loop finishes, the date was not found.
        return []

    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return []  # Return an empty list
    except json.JSONDecodeError:
        print(f"Error: Failed to decode JSON. Check if {file_path} is a valid JSON file.")
        return []
    except AttributeError:
        # This will catch errors if 'data' is not a dictionary (e.g., if 'data' is a list)
        # or if 'daily_forecasts' is not a list.
        print(f"Error: The JSON structure in {file_path} is not as expected. Missing 'daily' key?")
        return []
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []

# --- Example call ---
target_date = '2024-08-19'

print(f"Weather data for {target_date}:")
print(parse_json(target_date))

# --- A test for a date that does not exist ---
# This one will still correctly return []
print("\nWeather data for '1999-01-01':")
print(parse_json('1999-01-01'))

Weather data for 2024-08-19:
[{'date': '2024-08-19', 'max_temperature': 30.0, 'min_temperature': 21.0, 'precipitation': 5.0, 'wind_speed': 10.0, 'humidity': 70, 'weather_description': 'Light rain'}]

Weather data for '1999-01-01':
[]


## Parse the csv file

Parse the `weather.csv` file and return weather data for a specific date, that is given as a parameter. Return the data as an array.

In [21]:

import os

def parse_csv(date: str):
    """
    Parse the data from weather.csv file and return weather data for a given date.

    Args:
        date (str): The date for which we look up the weather.

    Returns:
        list: a list of weather data for a given date.
    """

    file_path = os.path.join('resources', 'weather.csv')
    found_data = []

    # --- DEBUGGING ---
    first_row_printed = False
    # --- END DEBUGGING ---

    try:
        with open(file_path, mode='r', encoding='utf-8', newline='') as f:
            reader = csv.DictReader(f)

            # --- DEBUGGING PRINT 1 ---
            # This is still useful to confirm the headers.
            #print(f"[DEBUG] CSV Headers found: {reader.fieldnames}")
            # --- END DEBUGGING ---

            for row in reader:

                # --- START OF FIX ---
                # Get the date from the 'CET' column
                date_from_file = row.get('CET')
                # --- END OF FIX ---

                # --- CHECK (Updated) ---
                # Check if the date from the 'CET' column matches our target date.
                # We also .strip() to remove any accidental leading/trailing spaces.
                if date_from_file is not None and date_from_file.strip() == date:
                    found_data.append(row)

        return found_data

    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return []
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []

# --- Your example call ---
# This was your original date. Let's see if it matches the format in the file.
target_date = '1997-5-22'
print(f"Weather data for {target_date}:")
print(parse_csv(target_date))


Weather data for 1997-5-22:
[{'CET': '1997-5-22', 'Max TemperatureC': '25', 'Mean TemperatureC': '18', 'Min TemperatureC': '10', 'Dew PointC': '11', 'MeanDew PointC': '8', 'Min DewpointC': '6', 'Max Humidity': '88', ' Mean Humidity': '54', ' Min Humidity': '34', ' Max Sea Level PressurehPa': '1017', ' Mean Sea Level PressurehPa': '1015', ' Min Sea Level PressurehPa': '1012', ' Max VisibilityKm': '10', ' Mean VisibilityKm': '10', ' Min VisibilitykM': '10', ' Max Wind SpeedKm/h': '11', ' Mean Wind SpeedKm/h': '3', ' Max Gust SpeedKm/h': '', 'Precipitationmm': '0.00', ' CloudCover': '3', ' Events': '', 'WindDirDegrees': '277'}]


## Visualize data

Visualize the `weather.csv` data using matplotlib. Choose your own approach to data visualization. Save the results (as `.png`, `.webp` files etc., your choise) in this repository. 

In [22]:
import csv
import os
import matplotlib.pyplot as plt
from datetime import datetime

def visualize_data():
    """
    Parse the data from weather.csv file and visualize it using Matplotlib.
    More than one visualization will be created.
    Save the results in the repository.
    """

    # --- 1. Define File Paths ---
    csv_file_path = os.path.join('resources', 'weather.csv')

    # Create a 'plots' directory to save our images if it doesn't exist
    plots_dir = 'plots'
    os.makedirs(plots_dir, exist_ok=True)

    plot1_save_path = os.path.join(plots_dir, 'temperature_over_time.png')
    plot2_save_path = os.path.join(plots_dir, 'precipitation_histogram.png')

    # --- 2. Read and Process Data ---
    # We will store our clean data in these lists
    dates = []
    max_temps = []
    precipitations = []

    try:
        with open(csv_file_path, mode='r', encoding='utf-8', newline='') as f:
            reader = csv.DictReader(f)

            for row in reader:
                try:
                    # We must convert strings to usable data types for plotting.
                    # We use a try/except block inside the loop to skip
                    # any single row that has bad data (like an empty string)

                    # Convert date string (from 'CET' col) to a datetime object
                    date_obj = datetime.strptime(row['CET'], '%Y-%m-%d')

                    # Convert temperature string to a float
                    temp_float = float(row['Max TemperatureC'])

                    # Convert precipitation string to a float
                    precip_float = float(row['Precipitationmm'])

                    # If all conversions work, add the data to our lists
                    dates.append(date_obj)
                    max_temps.append(temp_float)
                    precipitations.append(precip_float)

                except (ValueError, TypeError, KeyError):
                    # This catches errors if a value is missing, empty (''),
                    # or just not a valid number. We simply skip that row.
                    pass

        print(f"Successfully read and processed {len(dates)} valid data rows.")
        if not dates:
            print("No valid data found to plot. Exiting.")
            return

    except FileNotFoundError:
        print(f"Error: The file {csv_file_path} was not found.")
        return
    except Exception as e:
        print(f"An unexpected error occurred while reading the file: {e}")
        return

    # --- 3. Visualization 1: Temperature Line Plot ---
    try:
        print(f"Creating Visualization 1: Temperature over Time...")
        plt.figure(figsize=(14, 7))  # Set a nice wide size for a time series

        plt.plot(dates, max_temps, linestyle='-', color='b')

        plt.title('Maximum Daily Temperature Over Time', fontsize=16)
        plt.xlabel('Date', fontsize=12)
        plt.ylabel('Max Temperature (°C)', fontsize=12)
        plt.grid(True)

        # Improve date formatting on the x-axis
        plt.gcf().autofmt_xdate()

        plt.tight_layout() # Adjusts plot to prevent labels from being cut off
        plt.savefig(plot1_save_path)
        print(f"✅ Plot 1 saved to {plot1_save_path}")
        plt.close() # Close the figure to start fresh for the next plot

    except Exception as e:
        print(f"❌ Error creating plot 1: {e}")

    # --- 4. Visualization 2: Precipitation Histogram ---
    try:
        print(f"Creating Visualization 2: Precipitation Histogram...")
        plt.figure(figsize=(10, 6))

        # Bins='auto' tries to find an optimal number of bins
        plt.hist(precipitations, bins='auto', color='c', edgecolor='black')

        plt.title('Distribution of Daily Precipitation', fontsize=16)
        plt.xlabel('Precipitation (mm)', fontsize=12)
        plt.ylabel('Number of Days (Frequency)', fontsize=12)
        plt.grid(axis='y', linestyle='--')

        # Set a logarithmic scale for the y-axis, as most days
        # will have 0 or very little rain. This helps see the variation.
        plt.yscale('log')
        print(" (Note: Y-axis is on a log scale to show infrequent heavy rain events)")

        plt.tight_layout()
        plt.savefig(plot2_save_path)
        print(f"✅ Plot 2 saved to {plot2_save_path}")
        plt.close()

    except Exception as e:
        print(f"❌ Error creating plot 2: {e}")

# --- Call the function ---
visualize_data()

Successfully read and processed 6810 valid data rows.
Creating Visualization 1: Temperature over Time...
✅ Plot 1 saved to plots\temperature_over_time.png
Creating Visualization 2: Precipitation Histogram...
 (Note: Y-axis is on a log scale to show infrequent heavy rain events)
✅ Plot 2 saved to plots\precipitation_histogram.png
