<h1>Getting Historical News Data With Deepseek</h1>



Getting The Relevant BBC URLs

In [None]:
import os
import json
import requests
from dotenv import load_dotenv

# Let python-dotenv populate the environment before any key is read
load_dotenv()

# Google Custom Search: endpoint plus the key / cx values taken from the environment
GOOGLE_API_URL = "https://www.googleapis.com/customsearch/v1"
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
GOOGLE_CX = os.environ.get("GOOGLE_CX_2")

# DeepSeek chat completions: endpoint, key, and the headers attached to each request
DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": "Bearer {}".format(DEEPSEEK_API_KEY),
}

def call_deepseek_reasoner(prompt, system_message=None, timeout=60):
    """Send one chat-completion request to DeepSeek and return the decoded JSON.

    Args:
        prompt: Text sent as the user message.
        system_message: Optional text sent ahead of the prompt as the system
            message; omitted from the request when falsy.
        timeout: Seconds to wait on the server. ``requests`` has no default
            timeout, so without this a single stalled connection would block
            the caller forever.

    Returns:
        The parsed JSON response body, or None when the request fails or the
        body is not valid JSON. Failures are printed rather than raised.
    """
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    messages.append({"role": "user", "content": prompt})

    payload = {
        # NOTE: despite the function name, the request targets "deepseek-chat".
        "model": "deepseek-chat",
        "messages": messages,
        "temperature": 0.3,
    }

    try:
        response = requests.post(
            DEEPSEEK_API_URL, json=payload, headers=HEADERS, timeout=timeout
        )
        response.raise_for_status()
        return response.json()
    # ValueError covers a non-JSON body on requests versions whose decode
    # error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"API request failed: {str(e)}")
        error_response = getattr(e, "response", None)
        if error_response is not None:
            print(f"Error details: {error_response.text}")
        return None

def get_google_search_results(query, timeout=30):
    """Run a Google Custom Search query and flatten the hits into plain text.

    Args:
        query: Raw search string; URL encoding is left to ``requests``.
        timeout: Seconds to wait on the server before giving up
            (``requests`` would otherwise wait indefinitely).

    Returns:
        A string holding one "Title / Link / Snippet" paragraph per result
        (empty when the response has no items), or None when the request
        fails or the body is not valid JSON. Failures are printed, not raised.
    """
    params = {
        "gl": "sg",
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_CX,
        "q": query,
    }

    try:
        response = requests.get(GOOGLE_API_URL, params=params, timeout=timeout)
        response.raise_for_status()
        search_results = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Google API request failed: {str(e)}")
        error_response = getattr(e, "response", None)
        if error_response is not None:
            print(f"Error details: {error_response.text}")
        return None

    # Use .get() for every field: a result without e.g. a snippet should not
    # abort the whole lookup with a KeyError.
    return "".join(
        f"Title: {item.get('title', '')}"
        f"\nLink: {item.get('link', '')}"
        f"\nSnippet: {item.get('snippet', '')}"
        "\n\n"
        for item in search_results.get("items", [])
    )

def get_bbc_link(home_team, away_team, date):
    """Pick the BBC match-report URL for a fixture from Google search results.

    The Google results for the fixture are handed to DeepSeek together with
    instructions to answer with a single URL, or 'None' when unsure.

    Args:
        home_team: Name of the home side.
        away_team: Name of the away side.
        date: Match date as a "YYYY-MM-DD" string.

    Returns:
        The model's answer with surrounding whitespace removed, or None when
        the search returned nothing, DeepSeek gave no response, or the answer
        is empty or contains "none" (case-insensitive).
    """
    print(f"Searching for {home_team} vs {away_team} EPL match on {date}...")

    # Convert "2016-05-16" into "16 may 2016" for the search query and prompt.
    month_mapping = {
        "01": "jan",
        "02": "feb",
        "03": "mar",
        "04": "apr",
        "05": "may",
        "06": "jun",
        "07": "jul",
        "08": "aug",
        "09": "sep",
        "10": "oct",
        "11": "nov",
        "12": "dec",
    }
    date_parts = date.split("-")
    day = date_parts[2]
    month = month_mapping[date_parts[1]]
    year = date_parts[0]
    date = f"{day} {month} {year}"

    search_query = f"{home_team} vs {away_team} {date} BBC EPL"
    system_prompt = "You are a soccer fan."
    user_prompt = f"""Find the BBC article about {home_team} vs {away_team} EPL match on {date}.
    Make sure that {home_team} is the home team and {away_team} is the away team.
    Reply with only the URL of the article which you think is the most relevant and gives a commentary about the match. No explanation is needed.
    The scoreline of the match should be seen in the title and/or the snippet.
    The article date should be the same as the match date or the day after.
    The URL should be in the form https://www.bbc.com/sport/football/123456. There should not be 'live' in the URL.
    If you are not confident that any of the search results are relevant, reply with 'None'.\n"""

    # Nothing to choose from (request failed or no hits): skip the DeepSeek call.
    search_results = get_google_search_results(search_query)
    if not search_results:
        return None

    # Keep the persona and the search results visibly separate; plain
    # concatenation would fuse them into "...soccer fan.Title: ...".
    system_message = f"{system_prompt}\n\nGoogle search results:\n{search_results}"
    deepseek_response = call_deepseek_reasoner(user_prompt, system_message)
    if not deepseek_response:
        print("No response from DeepSeek API.")
        return None

    # Strip so a trailing newline in the reply does not end up inside the URL.
    final_result = (deepseek_response['choices'][0]['message']['content'] or "").strip()
    if not final_result or 'none' in final_result.casefold():
        return None
    return final_result

file_to_refer = "../dataset/fixtures_epl/fixtures_epl_2023.json"
output_file = "../dataset/epl_sentiment_analysis/bbc_links/bbc_links_2023.json"

# Do not waste API calls: load the results of a previous run if they exist.
existing_data = None
if os.path.exists(output_file):
    with open(output_file, "r", encoding="utf-8") as file:
        existing_data = json.load(file)
        print(f"Loaded existing data from {output_file}")

# Group the previous results by fixture id once, so each fixture below is a
# dictionary lookup instead of two scans over the whole list.
existing_by_id = {}
for item in existing_data or []:
    existing_by_id.setdefault(item['id'], []).append(item)

with open(file_to_refer, "r", encoding="utf-8") as file:
    fixtures = json.load(file)
output = []

for fixture in fixtures:
    output_to_append = {
        'id': fixture['fixture']['id'],
        # First 10 characters of the timestamp string; get_bbc_link expects YYYY-MM-DD.
        'date': fixture['fixture']['date'][:10],
        'home_team': fixture['teams']['home']['name'],
        'away_team': fixture['teams']['away']['name'],
        'timestamp': fixture['fixture']['timestamp']
    }

    # Reuse the saved entries when this fixture already has a non-None link;
    # entries whose link is None are looked up again.
    cached_items = existing_by_id.get(output_to_append['id'], [])
    if any(item.get('bbc_link') is not None for item in cached_items):
        print(f"Fixture {output_to_append['id']} already exists in the output file.")
        output.extend(cached_items)
        continue

    try:
        # Call the function to get the BBC link
        bbc_link = get_bbc_link(output_to_append['home_team'], output_to_append['away_team'], output_to_append['date'])
        output_to_append['bbc_link'] = bbc_link or None
    except Exception as e:
        # Best effort: one failing fixture must not discard the links already collected.
        print(f"Error processing fixture {output_to_append['id']}: {str(e)}")
        output_to_append['bbc_link'] = None
    output.append(output_to_append)

# Save the output to a JSON file. Create the target directory first so the
# results of all the API calls above are not lost to a missing folder.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(output, file, indent=4)
print(f"Output saved to {output_file}")


Searching for Burnley vs Manchester City EPL match on 2023-08-11...
Searching for Arsenal vs Nottingham Forest EPL match on 2023-08-12...
Searching for Bournemouth vs West Ham EPL match on 2023-08-12...
Searching for Everton vs Fulham EPL match on 2023-08-12...
Searching for Brighton vs Luton EPL match on 2023-08-12...
Searching for Sheffield Utd vs Crystal Palace EPL match on 2023-08-12...
Searching for Newcastle vs Aston Villa EPL match on 2023-08-12...
Searching for Brentford vs Tottenham EPL match on 2023-08-13...
Searching for Chelsea vs Liverpool EPL match on 2023-08-13...
Searching for Manchester United vs Wolves EPL match on 2023-08-14...
Searching for Nottingham Forest vs Sheffield Utd EPL match on 2023-08-18...
Searching for Fulham vs Brentford EPL match on 2023-08-19...
Searching for Wolves vs Brighton EPL match on 2023-08-19...
Searching for Liverpool vs Bournemouth EPL match on 2023-08-19...
Searching for Tottenham vs Manchester United EPL match on 2023-08-19...
Searching 

<h4>Fake URL Checker And Cleaner</h4>
Sometimes DeepSeek hallucinates and returns a fake URL. The goal here is to detect such URLs and replace them with None.

In [None]:
import json
import requests

# Browser-like User-Agent sent with every link check below
# (presumably so the site does not reject the default client UA - confirm).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

def validate_bbc_links(filename):
    """Check every 'bbc_link' in a fixtures JSON file and null out the bad ones.

    A link is kept only when a GET request for it answers with status 200.
    Missing or empty links, the literal string 'None', any other status code
    and any request error all leave the entry with ``bbc_link`` set to None.
    The file is rewritten in place with the updated entries; progress is
    printed every 10 fixtures.

    Args:
        filename: Path of the JSON file holding the list of fixture dicts.
    """
    with open(filename, 'r') as src:
        records = json.load(src)

    verified = []
    for position, record in enumerate(records, start=1):
        if position % 10 == 0:
            print(f"Checked {position} fixtures.")

        url = record.get('bbc_link')

        # No usable link at all: normalise to None without touching the network.
        if not url or url == 'None':
            record['bbc_link'] = None
            verified.append(record)
            continue

        try:
            status = requests.get(url, headers=headers, timeout=10).status_code
        except requests.exceptions.RequestException:
            record['bbc_link'] = None
            print(f"Error accessing {url}. Setting link to None.")
        else:
            if status != 200:
                record['bbc_link'] = None
                print(f"Link {url} is not valid. Setting link to None.")
        verified.append(record)

    # Overwrite the input file with the checked entries.
    with open(filename, 'w') as dst:
        json.dump(verified, dst, indent=4)

# Run the checker in place on the 2023 link file
validate_bbc_links(filename="../dataset/epl_sentiment_analysis/bbc_links/bbc_links_2023.json")

Link https://www.bbc.com/sport/football/66464319 is not valid. Setting link to None.
Checked 10 fixtures.
Link https://www.bbc.com/sport/football/66476948 is not valid. Setting link to None.
Checked 20 fixtures.
Checked 30 fixtures.
Checked 40 fixtures.
Checked 50 fixtures.
Checked 60 fixtures.
Link https://www.bbc.com/sport/football/66971940 is not valid. Setting link to None.
Checked 70 fixtures.
Link https://www.bbc.com/sport/football/67022448 is not valid. Setting link to None.
Checked 80 fixtures.
Link https://www.bbc.com/sport/football/67172434 is not valid. Setting link to None.
Checked 90 fixtures.
Checked 100 fixtures.
Link https://www.bbc.com/sport/football/67306439 is not valid. Setting link to None.
Link https://www.bbc.com/sport/football/67298848 is not valid. Setting link to None.
Link https://www.bbc.com/sport/football/67298837 is not valid. Setting link to None.
Link https://www.bbc.com/sport/football/67296611 is not valid. Setting link to None.
Link https://www.bbc.com