<h1>Scraping BBC Webpages</h1>

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import json
import os

#web scrape the most relevant link from the search results
def scrape_bbc_article(url_to_scrape, require_digit_in_title=True):
    """Scrape the headline and body text of a BBC article page.

    Opens the page in a fresh Chrome session, collects every ``h1`` plus every
    ``div`` whose ``data-component`` is ``subheadline-block`` or ``text-block``
    inside the element with id ``main-content``, and joins their text with
    newlines.

    Args:
        url_to_scrape: URL of the article to load.
        require_digit_in_title: When True (the default, matching the previous
            behaviour), an article whose first collected block contains no
            digit is rejected -- presumably because a match report headline is
            expected to carry a scoreline. URLs containing ``'live'`` are
            always exempt from this check. Pass False to override the check.

    Returns:
        The concatenated text, or None when the title check rejects the page.

    Raises:
        Any exception raised by Selenium (for example when ``main-content`` is
        not found) propagates to the caller; the browser is closed first.
    """
    driver = webdriver.Chrome()
    try:
        # Implicit wait: element lookups may take up to 10 seconds.
        driver.implicitly_wait(10)
        driver.get(url_to_scrape)

        main_tag = driver.find_element(By.ID, 'main-content')

        # Headline(s) first, then sub-headlines and paragraphs.
        blocks = main_tag.find_elements(By.TAG_NAME, 'h1')
        blocks = blocks + main_tag.find_elements(
            By.CSS_SELECTOR,
            'div[data-component="subheadline-block"], div[data-component="text-block"]',
        )

        if require_digit_in_title and 'live' not in url_to_scrape:
            # The first block is treated as the title.
            title = blocks[0].text if blocks else None
            if title and not any(char.isdigit() for char in title):
                print(f"Title does not contain a digit: {title}")
                return None

        return "\n".join(block.text for block in blocks)
    finally:
        # Always release the browser, including when a lookup above raises;
        # otherwise every failed link leaves a Chrome process behind.
        driver.quit()

input_file = "../dataset/epl_sentiment_analysis/bbc_links/bbc_links_2023.json"
output_file = "../dataset/epl_sentiment_analysis/bbc_articles/bbc_articles_2023.json"


def _build_record(link, bbc_link, article_text):
    """Return the output entry for one match link.

    Copies the match identity fields from ``link`` and attaches the BBC URL and
    the scraped text (None when nothing was scraped).
    """
    return {
        'id': link['id'],
        'date': link['date'],
        'home_team': link['home_team'],
        'away_team': link['away_team'],
        'bbc_link': bbc_link,
        'article_text': article_text
    }


# Load the list of match links to scrape.
with open(input_file, "r", encoding="utf-8") as file:
    links = json.load(file)

# Load the output of a previous run, if there is one, so that articles that
# were already scraped are reused instead of being fetched again.
existing_data = None
if os.path.exists(output_file):
    with open(output_file, "r", encoding="utf-8") as file:
        existing_data = json.load(file)
        print(f"Loaded existing data from {output_file}")

# Index previous results by match id once, rather than rescanning the whole
# list for every link.
existing_by_id = {}
for item in existing_data or []:
    existing_by_id.setdefault(item['id'], []).append(item)

# Make sure the destination folder exists before spending time scraping, so a
# missing directory fails here and not after the whole run.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Entries to be written out, in the same order as the input links.
output = []

count = 0
for link in links:
    # Reuse every previous entry for this id when at least one of them already
    # has article text.
    cached = existing_by_id.get(link['id'], [])
    if any(item.get('article_text') is not None for item in cached):
        output.extend(cached)
        continue

    count += 1
    # 'bbc_link' is optional: a match without a link is recorded with no text.
    bbc_link = link.get('bbc_link')
    print(f"Scraping article {count}: {bbc_link}")

    article_text = None
    if bbc_link:
        try:
            article_text = scrape_bbc_article(bbc_link)
        except Exception as e:
            # Best effort: keep the entry with no text so a later run can retry.
            print(f"Error scraping {bbc_link}: {e}")

    output.append(_build_record(link, bbc_link, article_text))

#output the articles to a json file
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(output, file, indent=4)
print(f"Output saved to {output_file}")

Scraping article 1: https://www.bbc.com/sport/football/66402155
Scraping article 2: https://www.bbc.com/sport/football/66413654
Scraping article 3: https://www.bbc.com/sport/football/66413643
Scraping article 4: https://www.bbc.com/sport/football/66413647
Scraping article 5: https://www.bbc.com/sport/football/66413648
Scraping article 6: https://www.bbc.com/sport/football/66413645
Scraping article 7: https://www.bbc.com/sport/football/66413644
Scraping article 8: https://www.bbc.com/sport/football/66419908
Scraping article 9: https://www.bbc.com/sport/football/66419907
Scraping article 10: https://www.bbc.com/sport/football/66424895
Scraping article 11: https://www.bbc.com/sport/football/66470075
Scraping article 12: https://www.bbc.com/sport/football/66482708
Scraping article 13: https://www.bbc.com/sport/football/66482707
Scraping article 14: https://www.bbc.com/sport/football/66482704
Scraping article 15: https://www.bbc.com/sport/football/66482710
Scraping article 16: https://www.b

Re-scraping articles for corrected entries in the output file that still have no article text

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import json
import os

def scrape_bbc_article(url_to_scrape):
    """Scrape the headline and body text of a BBC article page.

    Opens the page in a fresh Chrome session, collects every ``h1`` plus every
    ``div`` whose ``data-component`` is ``subheadline-block`` or ``text-block``
    inside the element with id ``main-content``, and joins their text with
    newlines. No check is made on the title in this version.

    Args:
        url_to_scrape: URL of the article to load.

    Returns:
        The concatenated text of the collected blocks (an empty string when no
        block is found).

    Raises:
        Any exception raised by Selenium (for example when ``main-content`` is
        not found) propagates to the caller; the browser is closed first.
    """
    driver = webdriver.Chrome()
    try:
        # Implicit wait: element lookups may take up to 10 seconds.
        driver.implicitly_wait(10)
        driver.get(url_to_scrape)

        main_tag = driver.find_element(By.ID, 'main-content')

        # Headline(s) first, then sub-headlines and paragraphs.
        blocks = main_tag.find_elements(By.TAG_NAME, 'h1')
        blocks = blocks + main_tag.find_elements(
            By.CSS_SELECTOR,
            'div[data-component="subheadline-block"], div[data-component="text-block"]',
        )

        # NOTE: the "title must contain a digit" check is deliberately not
        # applied here, so pages with a digit-free headline are still returned.

        return "\n".join(block.text for block in blocks)
    finally:
        # Always release the browser, including when a lookup above raises;
        # otherwise every failed link leaves a Chrome process behind.
        driver.quit()

output_file = "../dataset/epl_sentiment_analysis/bbc_articles/bbc_articles_2023.json"

# Load the entries produced by an earlier scraping run.
with open(output_file, "r", encoding="utf-8") as file:
    existing_data = json.load(file)
    print(f"Loaded existing data from {output_file}")


# Entries to be written back, in their original order.
output = []
for item in existing_data:
    if item.get('article_text') is None:
        bbc_link = item.get('bbc_link')
        # Make sure the key exists even when there is nothing to scrape or the
        # scrape fails.
        item['article_text'] = None
        if bbc_link:
            print(f"Scraping article: {bbc_link}")
            try:
                item['article_text'] = scrape_bbc_article(bbc_link)
            except Exception as e:
                # Best effort: leave the text empty so a later run can retry.
                print(f"Error scraping {bbc_link}: {e}")

    # Always keep the entry. Skipping it after a failed scrape would silently
    # delete the match from the file when it is rewritten below.
    output.append(item)

#save the output to the same file
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(output, file, indent=4)
print(f"Output saved to {output_file}")




Loaded existing data from epl_sentiment_analysis/bbc_articles/bbc_articles_2023.json
Scraping article: https://www.bbc.com/sport/football/66828984
Scraping article: https://www.bbc.com/sport/football/67460118
Scraping article: https://www.bbc.com/sport/football/67604923
Scraping article: https://www.bbc.com/sport/football/67814111
Scraping article: https://www.bbc.com/sport/football/68260176
Scraping article: https://www.bbc.com/sport/articles/cz4xvrz8ep4o
Scraping article: https://www.bbc.com/sport/football/articles/c6pywzp3720o
Output saved to epl_sentiment_analysis/bbc_articles/bbc_articles_2023.json


For Each Match, Find The Relevant Articles Of The Home And Away Teams From Their Previous Matches

In [None]:
""" 
Each entry looks like this:
    {
    "id": 192297,
    "date": "2015-08-08",
    "home_team": "Manchester United",
    "away_team": "Tottenham",
    "bbc_link": "https://www.bbc.com/sport/football/33744640",
    "article_text": "Manchester United 1-0 Tottenham Hotspur\nManchester United held on to start their season with a lacklustre win against Tottenham thanks to Kyle Walker's early own goal.\nAfter Christian Eriksen lobbed over when clean through, United went ahead as Walker poked into his own net when attempting to tackle Wayne Rooney.\nThat led to a better spell for United, who did not have a shot on target until Ashley Young's 65th-minute effort.\nSergio Romero - one of five United debutants - saved a late Eriksen drive.\nRelive Manchester United's win over Tottenham\n, external\nLouis van Gaal celebrated his 64th birthday with a victory but his team were hanging on towards the end as Romero was twice called into action.\nTottenham - who finished fifth in the Premier League last season, six points behind United - started and ended the game strongly only three days after completing their final pre-season game against AC Milan in Germany.\nHow did United's debutants perform?\nMorgan Schneiderlin was the pick of United's new recruits, who cost a combined \u00a383.1m over the summer.\nThe defensive midfielder lined up next to Michael Carrick and looked energetic and willing to accept possession deep in United's half despite a couple of nervous moments early on.\nThe \u00a325m capture from Southampton was put under pressure by goalkeeper Romero, whose kicking lacked conviction after Spurs pressed the hosts' defence high up the pitch.\nThe Argentine international made his debut after replacing David De Gea, who boss Van Gaal said was \"not capable\" of playing amid ongoing speculation that the Spanish goalkeeper wants to join Real Madrid.\nMemphis Depay, a \u00a331m arrival from PSV Eindhoven, played behind Wayne Rooney in attack, and while the Dutch 21-year-old did have an early shot blocked, he was frustrated by his 68th-minute substitution after a display where he looked like he needed further time to gel with team-mates.\nAt right-back Matteo Darmian was solid as a 
replacement for the departed Rafael Da Silva and showed purpose going forward.\nBastian Schweinsteiger, who came on for Carrick after his \u00a314.4m arrival from Bayern Munich, helped see the game out with some clever touches but was booked shortly after being introduced after an hour.\nKane shows signs of promise\nDespite a gruelling summer in which he went on Spurs' post-season tour to Australia and played in the England Under-21s' ill-fated European Championship campaign, Kane looked sharp in the first half.\nHe made a great start to the game, setting up Eriksen for his side's best chance after five minutes, but a lack of support meant Spurs's top scorer from last season faded and Chris Smalling began to boss the United defence in an impressive display.\nBut the 22-year-old did show promise after questions whether he could follow up last season's superb form.\nTottenham boss Mauricio Pochettino withdrew Nabil Bentaleb eight minutes into the second half after the 20-year-old lined up with Eric Dier in the centre of midfield and gave the ball away for the goal.\nHowever, he will have been pleased by the overall performance of his team, with \u00a312m debutant Toby Alderweireld looking strong next to fellow Belgian Jan Vertonghen in the centre of their defence.\n'Depay too eager'\nManchester United manager Louis Van Gaal: \"I said to Memphis that he doesn't have to play too eagerly and with too much passion and that counts for the whole team. But it's always the same - the first time you play at Old Trafford it's always difficult, but I liked the performance especially from Darmian.\"\nTottenahm defender Kyle Walker on his own goal: \"I tried to stick my foot in the way so that [Rooney would] kick my foot instead of the ball, so it's a bit unlucky really as I'd made up a lot of ground to get there.\"\nVan Gaal on David De Gea's future at Manchester United: \"I'm not the most important part. The most important part is the player and the club who wants to buy him. 
He's our best player for the past two or three years so we cannot let him go so easily. We want to sell at the same level as we pay. We want to keep him of course.\"\nPundit reaction\nFormer England winger Chris Waddle on BBC Radio 5 live: \"I don't think either team we've watched looked like title contenders. We haven't seen the best of them. They look a little leggy. Tottenham need a bit of pace and with United I'm not sure about them in the centre-back position.\"\nMan of the match\nChris Smalling offered a calmness and reassurance in defence while many debutants tried too hard to impress. He benefitted from the presence of Morgan Schneiderlin in front of him.\nThe stats you need to know\nThis is the first time a Premier League season's first goal has been an own goal\nManchester United have kept a clean sheet on opening day for the first time since 2010 (3-0 v Newcastle)\nUnited have lost just one of their last 24 opening league games of the season at Old Trafford (W17 D6)\nMauricio Pochettino's Spurs are yet to score against Manchester United in three meetings\nManchester United have now won back-to-back Premier League games against Spurs, after a five game winless run (D3 L2)"
}

Transform to this format:
{
    "id": 192297,
    "date": "2015-08-08",
    "home_team": "Manchester United",
    "away_team": "Tottenham",
    "home_team_last_match_date": "2015-08-08",
    "away_team_last_match_date": "2015-08-08",
    "bbc_link_home_team_last_match": "https://www.bbc.com/sport/football/33744640",
    "bbc_link_away_team_last_match": "https://www.bbc.com/sport/football/33744640",
    "bbc_article_home_team_last_match" : ".......",
    "bbc_article_away_team_last_match" : ".......",
}
    """


import json
from bisect import bisect_left

input_file = "../dataset/epl_sentiment_analysis/bbc_articles/bbc_articles_2023.json"
output_file = "../dataset/epl_sentiment_analysis/bbc_articles_collated/bbc_collated_articles_2023.json"

# Load the scraped articles (one entry per match).
with open(input_file, 'r', encoding="utf-8") as f:
    input_list = json.load(f)

# Map each team to the list of matches it played (home or away).
team_to_matches = {}
for match in input_list:
    for team in [match['home_team'], match['away_team']]:
        team_to_matches.setdefault(team, []).append(match)

# Sort each team's matches by date and keep a parallel list of the dates, built
# once per team, for binary search. Dates are compared as strings, which is
# chronological for the "YYYY-MM-DD" format shown in the example entry above --
# assumes every entry uses that format.
team_to_dates = {}
for team, matches in team_to_matches.items():
    matches.sort(key=lambda x: x['date'])
    team_to_dates[team] = [m['date'] for m in matches]


def _find_previous_match(team, current_match):
    """Return ``team``'s latest match dated strictly before ``current_match``.

    Returns None when the team has no earlier match in the data.
    """
    matches = team_to_matches[team]
    # bisect_left gives the first position whose date is >= the current date,
    # so everything before it is strictly earlier.
    last_earlier = bisect_left(team_to_dates[team], current_match['date']) - 1
    for i in range(last_earlier, -1, -1):
        candidate = matches[i]
        # Guard against an entry that shares the current match's id.
        if candidate['id'] != current_match['id']:
            return candidate
    return None


def _previous_match_fields(previous):
    """Return (date, link, article text) of ``previous``, or three Nones."""
    if previous is None:
        return None, None, None
    return previous["date"], previous["bbc_link"], previous["article_text"]


output_list = []

for current_match in input_list:
    home_date, home_link, home_article = _previous_match_fields(
        _find_previous_match(current_match["home_team"], current_match)
    )
    away_date, away_link, away_article = _previous_match_fields(
        _find_previous_match(current_match["away_team"], current_match)
    )

    # Key order matches the documented output format.
    output_list.append({
        "id": current_match["id"],
        "date": current_match["date"],
        "home_team": current_match["home_team"],
        "away_team": current_match["away_team"],
        "home_team_last_match_date": home_date,
        "away_team_last_match_date": away_date,
        "bbc_link_home_team_last_match": home_link,
        "bbc_link_away_team_last_match": away_link,
        "bbc_article_home_team_last_match": home_article,
        "bbc_article_away_team_last_match": away_article
    })

# Write the transformed list to the output file
with open(output_file, 'w', encoding="utf-8") as f:
    json.dump(output_list, f, indent=4)
