In [None]:
# Do not change these dependencies
import pytest
# Import your dependencies here
import requests
from bs4 import BeautifulSoup
import json
import spacy
import datetime

In [None]:
def bbc_scraper(url, timeout=10):
    """
    Scrape a BBC News article and return its key fields as a JSON string.

    The returned JSON object contains the following fields:
    1) URL (as provided.  For example https://www.bbc.co.uk/news/uk-51004218)
    2) Title -- text of the first <h1>, or "Title not found"
    3) Date_published -- the <time> element's datetime attribute rendered like
       "12 April 2020"; falls back to the element's visible text when the
       attribute cannot be parsed, or "Date not found" when there is no <time>
    4) Content -- the article paragraphs as one continuous string without linebreaks

    The function keeps no state between calls, so it can be placed in a for
    loop and given several URLs in turn without any manual intervention.

    Parameters:
        url: address of the article to fetch.
        timeout: seconds to wait for the server before giving up (handed to
            requests.get so a stalled connection cannot hang the notebook).

    Returns:
        A JSON string with the four fields above.  If anything goes wrong
        (network failure, HTTP error status, parsing problem) a JSON string of
        the form {"error": "Failed to scrape article: ..."} is returned
        instead of an exception being raised.
    """
    # Need this header or BBC blocks the request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    # Paragraphs whose class attribute contains one of these fragments, or whose
    # text contains one of these phrases, are skipped rather than treated as body text.
    skip_classes = ['tag', 'share', 'media', 'navigation']
    skip_phrases = ['Related Topics', 'More on this story', 'SIGN UP', 'Follow us']

    try:
        # Get the webpage content
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the title in h1 tag
        title_element = soup.find('h1')
        title = title_element.get_text().strip() if title_element else "Title not found"

        # Sort out the date format
        time_element = soup.find('time')
        if time_element:
            raw_date = time_element.get('datetime', '')
            try:
                # fromisoformat on older Pythons rejects a trailing 'Z', so spell out UTC
                date_obj = datetime.datetime.fromisoformat(raw_date.replace('Z', '+00:00'))
                # Take the day from the attribute instead of strftime('%-d'):
                # '%-d' is a platform-specific extension and is not portable.
                date = f"{date_obj.day} {date_obj.strftime('%B %Y')}"
            except (ValueError, TypeError):
                # Attribute missing or not ISO formatted: use the text shown on the page
                date = time_element.get_text().strip()
        else:
            date = "Date not found"

        # Try different methods to get the content
        content = []

        # Check these containers in order of likelihood
        content_containers = [
            soup.find('article'),  # Most likely place
            soup.find('div', {'id': 'main-content'}),  # Second choice
            soup.find('div', class_=['story-body', 'article-body']),  # Older article format
            soup  # Last resort
        ]

        # Use the first container that holds any paragraphs
        for container in content_containers:
            if not container:
                continue
            paragraphs = container.find_all('p')
            if not paragraphs:
                continue
            for p in paragraphs:
                # Substring match against the rendered class list
                class_attr = str(p.get('class', []))
                if any(fragment in class_attr for fragment in skip_classes):
                    continue
                text = p.get_text().strip()
                if text and not any(phrase in text for phrase in skip_phrases):
                    content.append(text)
            break  # Found what we need

        # Join everything into one continuous string; split()/join collapses
        # every run of whitespace (including line breaks) into a single space
        content_text = ' '.join(' '.join(content).split())

        # Create the final JSON
        return json.dumps({
            'URL': url,
            'Title': title,
            'Date_published': date,
            'Content': content_text
        })

    except Exception as e:
        # Return error if something goes wrong
        return json.dumps({
            'error': f"Failed to scrape article: {str(e)}"
        })

In [None]:
# Loaded spaCy pipelines keyed by model name, so repeated calls reuse the model
# instead of reloading it from disk every time.
_SPACY_MODELS = {}


def _load_spacy_model(name="en_core_web_sm"):
    """Return the named spaCy pipeline, downloading it first if it is not installed."""
    if name not in _SPACY_MODELS:
        try:
            _SPACY_MODELS[name] = spacy.load(name)
        except OSError:
            # If model isn't downloaded, get it first.  Use the interpreter that
            # is running this kernel: a bare "python" on PATH may belong to a
            # different environment.  A failed download surfaces as OSError
            # from the retry below.
            import subprocess
            import sys
            subprocess.run([sys.executable, "-m", "spacy", "download", name])
            _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]


def extract_entities(string):
    """
    Find the named entities in a piece of text.

    Returns a JSON string with three lists, each holding the distinct entity
    texts in order of first appearance:
    1) people         (spaCy label PERSON)
    2) places         (spaCy labels GPE and LOC)
    3) organisations  (spaCy label ORG)

    Parameters:
        string: the text to analyse.

    Raises:
        OSError: if the spaCy model is not installed and cannot be loaded
            even after the download attempt.
    """
    nlp = _load_spacy_model()

    # Process the text through spacy
    doc = nlp(string)

    # Which output list each spaCy label belongs to; other labels are ignored
    label_to_field = {
        'PERSON': 'people',
        'GPE': 'places',
        'LOC': 'places',
        'ORG': 'organisations',
    }

    # Dicts keep insertion order, so using them as ordered sets de-duplicates
    # while preserving the order in which entities first appear.
    found = {'people': {}, 'places': {}, 'organisations': {}}
    for ent in doc.ents:
        field = label_to_field.get(ent.label_)
        if field is not None:
            found[field].setdefault(ent.text, None)

    # Package it up as JSON
    return json.dumps({field: list(names) for field, names in found.items()})

In [None]:
####################################################################
# Test cases 

def test_bbc_scrape():
    """Scrape one known BBC article and check the shape of the returned JSON."""
    article_url = 'https://www.bbc.co.uk/news/uk-52255054'
    article = json.loads(bbc_scraper(article_url))

    # Echo the parsed result for whoever is reading the notebook
    print("\n=== BBC Scraper Test Result ===")
    print(json.dumps(article, indent=2))

    # Every required field has to be present
    for field in ('URL', 'Title', 'Date_published', 'Content'):
        assert field in article

    # Content must be one continuous line: no newline or carriage return
    body = article['Content']
    assert '\n' not in body
    assert '\r' not in body

    # The URL field must echo the address that was requested
    assert article['URL'] == article_url

def test_extract_entities_amazon_org():
    """A sentence naming a company should yield exactly one organisation."""
    sentence = "I work for Amazon."
    expected = {'people': [], 'places': [], 'organisations': ['Amazon']}

    raw_json = extract_entities(sentence)

    # Show the input and the decoded output before checking it
    print("\n=== Entity Extraction Test 1 ===")
    print(f"Input: {sentence}")
    found = json.loads(raw_json)
    print("Result:", json.dumps(found, indent=2))

    assert found == expected

def test_extract_entities_name():
    """A sentence introducing someone should yield exactly one person."""
    sentence = "My name is Bob"
    expected = {'people': ['Bob'], 'places': [], 'organisations': []}

    raw_json = extract_entities(sentence)

    # Show the input and the decoded output before checking it
    print("\n=== Entity Extraction Test 2 ===")
    print(f"Input: {sentence}")
    found = json.loads(raw_json)
    print("Result:", json.dumps(found, indent=2))

    assert found == expected

In [None]:
# Run each test in turn; the first failing assertion stops the cell
for run_test in (test_bbc_scrape, test_extract_entities_amazon_org, test_extract_entities_name):
    run_test()