<h1>Sentiment Analysis with LLMs</h1>

Deepseek API

In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import json

# Pull API keys and other settings from the local .env file into the environment
load_dotenv()

list_of_years = [2019]  # seasons to run through the model

for year in list_of_years:

    # Collated BBC articles for the season (a JSON list of match objects) and
    # the file this season's ratings are saved to
    input_directory = f"../dataset/epl_sentiment_analysis/bbc_articles_collated/bbc_collated_articles_{year}.json"
    output_directory = f"../dataset/epl_sentiment_analysis/sentiment_analysis_results_deepseek/deepseek_sentiment_{year}_output.json"

    with open(input_directory, 'r') as articles_file:
        matches = json.load(articles_file)

    # Start from an empty cache and fill it from an earlier run's output, if one exists
    existing_output = []
    if os.path.exists(output_directory):
        with open(output_directory, 'r') as previous_file:
            existing_output = json.load(previous_file)

    def query_LLM_API(
        home_team,
        away_team,
        match_date,
        home_team_last_match_date,
        away_team_last_match_date,
        home_team_commentary,
        away_team_commentary,
    ):
        """Ask the DeepSeek reasoner model to rate both teams ahead of one fixture.

        The two commentaries are embedded in the user prompt together with the
        fixture details. Returns the dictionary parsed from the JSON object
        found in the reply, or None (after printing the error) when the
        request or the parsing raises.
        """
        try:
            client = OpenAI(
                api_key=os.getenv("DEEPSEEK_API_KEY"),
                base_url="https://api.deepseek.com/v1",
            )

            # Fixed instructions describing the 5-point scale
            system_message = {
                'role': 'system',
                'content': """
                                You are an advanced sentiment analysis assistant. 
                                Your task is to analyze match commentaries in the English Premier League. 
                                The sentiment rating on a certain team should be based on a 5-point scale: Strongly Positive (5), Positive (4), Neutral(3), Negative (2), or Strongly Negative (1)
                                """,
            }

            # Fixture-specific request carrying both articles
            user_message = {
                'role': 'user',
                'content': f"""
                                I would like you to predict the sentiment for each team for the upcoming match based on what happened in the previous match. 
                                For the upcoming match on {match_date}, {home_team} is the home team and {away_team} is the away team. I have found the BBC commentaries of both teams’ previous matches. 
                                Rate the sentiment of each team on an output on a scale of 1-5. Reply in a JSON format with the following keys: home_team_sentiment, away_team_sentiment. Do not reply with anything else.

                                Below are the articles:
                                The article for {home_team}’s previous match on {home_team_last_match_date}: 
                                {home_team_commentary}
                                
                                The article for {away_team}’s previous match on {away_team_last_match_date}: 
                                {away_team_commentary}.
                                """,
            }

            completion = client.chat.completions.create(
                model="deepseek-reasoner",
                messages=[system_message, user_message],
            )

            raw_reply = completion.choices[0].message.content

            # Keep only the span from the first '{' to the last '}' so any
            # text the model wrapped around the JSON object is dropped
            first_brace = raw_reply.find('{')
            last_brace = raw_reply.rfind('}')
            json_text = raw_reply[first_brace:last_brace + 1]

            return json.loads(json_text)

        except Exception as error:
            print(f"Error: {error}")
            return None
        

    # For every fixture of the season: reuse a rating cached by an earlier run,
    # or ask the LLM for a new one, then write all rows to the output file.

    count = 0  # number of API calls made (skipped and cached fixtures are not counted)

    final_output = []  # rows written to the output file, in input order

    # Index the earlier results that already hold BOTH ratings by match id, so
    # the cache check is one dictionary lookup per fixture instead of two scans
    # of the whole list. setdefault keeps the first complete entry should an id
    # ever appear more than once.
    completed_results = {}
    for existing_match in existing_output:
        if existing_match.get('home_team_sentiment') is not None and existing_match.get('away_team_sentiment') is not None:
            completed_results.setdefault(existing_match['id'], existing_match)

    for match in matches:

        match_id = match['id']  # deliberately not `id`, which would shadow the builtin
        home_team = match['home_team']
        away_team = match['away_team']
        date = match['date']
        home_team_last_match_date = match['home_team_last_match_date']
        away_team_last_match_date = match['away_team_last_match_date']
        home_team_commentary = match['bbc_article_home_team_last_match']
        away_team_commentary = match['bbc_article_away_team_last_match']

        # Fixtures lacking either team's article are left out of the output
        if not home_team_commentary or not away_team_commentary:
            continue

        # Both ratings already exist from an earlier run: carry the entry over
        # unchanged and do not call the API again
        if match_id in completed_results:
            final_output.append(completed_results[match_id])
            continue

        count += 1
        if count % 5 == 0:  # progress message every 5 API calls
            print(f"Called API {count} times")

        # Call the function to query the LLM API
        response = query_LLM_API(home_team, away_team, date, home_team_last_match_date, away_team_last_match_date, home_team_commentary, away_team_commentary)

        if not isinstance(response, dict):
            # The call or the JSON parsing failed; store None so a later run retries
            print(f"Iteration {count}: Error in API response.")
            response = {}
        elif 'home_team_sentiment' not in response or 'away_team_sentiment' not in response:
            # A reply with unexpected keys must not abort the whole season with a
            # KeyError before anything has been written
            print(f"Iteration {count}: Response is missing a sentiment key.")

        final_output.append({
            'id': match_id,
            'home_team': home_team,
            'away_team': away_team,
            'date': date,
            'home_team_sentiment': response.get('home_team_sentiment'),
            'away_team_sentiment': response.get('away_team_sentiment')
        })

    # Write the season's rows; this replaces any earlier output file
    with open(output_directory, 'w') as file:
        json.dump(final_output, file, indent=4)


Called API 5 times
Called API 10 times
Called API 15 times
Called API 20 times
Called API 25 times
Called API 30 times
Called API 35 times
Called API 40 times
Called API 45 times
Called API 50 times
Called API 55 times
Called API 60 times
Called API 65 times
Called API 70 times
Called API 75 times
Called API 80 times
Called API 85 times
Called API 90 times
Called API 95 times
Called API 100 times
Called API 105 times
Called API 110 times
Called API 115 times
Called API 120 times
Called API 125 times
Called API 130 times
Called API 135 times
Called API 140 times
Called API 145 times
Called API 150 times
Called API 155 times
Called API 160 times
Called API 165 times
Called API 170 times
Called API 175 times
Called API 180 times
Called API 185 times
Called API 190 times
Called API 195 times
Called API 200 times
Called API 205 times
Called API 210 times
Called API 215 times
Called API 220 times
Called API 225 times
Called API 230 times
Called API 235 times
Called API 240 times
Called API 2

OpenAI (GPT-4.1)

In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import json

# Pull API keys and other settings from the local .env file into the environment
load_dotenv()

list_of_years = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]  # seasons to run through the model

for year in list_of_years:

    # Collated BBC articles for the season (a JSON list of match objects) and
    # the file this season's ratings are saved to
    input_directory = f"../dataset/epl_sentiment_analysis/bbc_articles_collated/bbc_collated_articles_{year}.json"
    output_directory = f"../dataset/epl_sentiment_analysis/sentiment_analysis_results_openai_gpt4.1/openai4.1_sentiment_{year}_output.json"

    with open(input_directory, 'r') as articles_file:
        matches = json.load(articles_file)

    # Start from an empty cache and fill it from an earlier run's output, if one exists
    existing_output = []
    if os.path.exists(output_directory):
        with open(output_directory, 'r') as previous_file:
            existing_output = json.load(previous_file)

    def query_LLM_API(home_team, away_team, match_date, home_team_last_match_date, away_team_last_match_date, home_team_commentary, away_team_commentary):
        """Ask the gpt-4.1 model to rate both teams ahead of one fixture.

        Args:
            home_team, away_team: team names for the upcoming fixture.
            match_date: date of the upcoming fixture.
            home_team_last_match_date, away_team_last_match_date: dates of each
                team's previous match.
            home_team_commentary, away_team_commentary: article text for each
                team's previous match, embedded in the prompt.

        Returns:
            The dictionary parsed from the JSON object in the model's reply, or
            None (after printing the error) when the request or parsing raises.
        """
        try:
            client = OpenAI(
                # Read the key from the environment (.env) rather than keeping
                # it in the notebook; an empty string here makes every request
                # fail authentication and every rating come back as None.
                api_key=os.getenv("OPENAI_API_KEY")
            )
            completion = client.chat.completions.create(
                model="gpt-4.1", 
                messages=[
                    {'role': 'system', 
                    'content': """
                                You are an advanced sentiment analysis assistant. 
                                Your task is to analyze match commentaries in the English Premier League. 
                                The sentiment rating on a certain team should be based on a 5-point scale: Strongly Positive (5), Positive (4), Neutral(3), Negative (2), or Strongly Negative (1)
                                """
                        },
                    {'role': 'user', 
                    'content': f"""
                                I would like you to predict the sentiment for each team for the upcoming match based on what happened in the previous match. 
                                For the upcoming match on {match_date}, {home_team} is the home team and {away_team} is the away team. I have found the BBC commentaries of both teams’ previous matches. 
                                Rate the sentiment of each team on an output on a scale of 1-5. Reply in a JSON format with the following keys: home_team_sentiment, away_team_sentiment. Do not reply with anything else.

                                Below are the articles:
                                The article for {home_team}’s previous match on {home_team_last_match_date}: 
                                {home_team_commentary}
                                
                                The article for {away_team}’s previous match on {away_team_last_match_date}: 
                                {away_team_commentary}.
                                """
                        }],
                )
            # Extract the response content from the completion object
            response_content = completion.choices[0].message.content

            # Keep only the span from the first '{' to the last '}' so any text
            # the model wrapped around the JSON object is dropped
            response_content = response_content[response_content.find('{'):response_content.rfind('}') + 1]

            # Convert the JSON text to a dictionary
            response_dict = json.loads(response_content)

            return response_dict
        
        except Exception as e:
            # Best effort: report the failure and let the caller record None
            print(f"Error: {e}")
            return None
        

    # For every fixture of the season: reuse a rating cached by an earlier run,
    # or ask the LLM for a new one, then write all rows to the output file.

    count = 0  # number of API calls made (skipped and cached fixtures are not counted)

    final_output = []  # rows written to the output file, in input order

    # Index the earlier results that already hold BOTH ratings by match id, so
    # the cache check is one dictionary lookup per fixture instead of two scans
    # of the whole list. setdefault keeps the first complete entry should an id
    # ever appear more than once.
    completed_results = {}
    for existing_match in existing_output:
        if existing_match.get('home_team_sentiment') is not None and existing_match.get('away_team_sentiment') is not None:
            completed_results.setdefault(existing_match['id'], existing_match)

    for match in matches:

        match_id = match['id']  # deliberately not `id`, which would shadow the builtin
        home_team = match['home_team']
        away_team = match['away_team']
        date = match['date']
        home_team_last_match_date = match['home_team_last_match_date']
        away_team_last_match_date = match['away_team_last_match_date']
        home_team_commentary = match['bbc_article_home_team_last_match']
        away_team_commentary = match['bbc_article_away_team_last_match']

        # Fixtures lacking either team's article are left out of the output
        if not home_team_commentary or not away_team_commentary:
            continue

        # Both ratings already exist from an earlier run: carry the entry over
        # unchanged and do not call the API again
        if match_id in completed_results:
            final_output.append(completed_results[match_id])
            continue

        count += 1
        if count % 10 == 0:  # progress message every 10 API calls
            print(f"Called API {count} times")

        # Call the function to query the LLM API
        response = query_LLM_API(home_team, away_team, date, home_team_last_match_date, away_team_last_match_date, home_team_commentary, away_team_commentary)

        if not isinstance(response, dict):
            # The call or the JSON parsing failed; store None so a later run retries
            print(f"Iteration {count}: Error in API response.")
            response = {}
        elif 'home_team_sentiment' not in response or 'away_team_sentiment' not in response:
            # A reply with unexpected keys must not abort the whole season with a
            # KeyError before anything has been written
            print(f"Iteration {count}: Response is missing a sentiment key.")

        final_output.append({
            'id': match_id,
            'home_team': home_team,
            'away_team': away_team,
            'date': date,
            'home_team_sentiment': response.get('home_team_sentiment'),
            'away_team_sentiment': response.get('away_team_sentiment')
        })

    # Write the season's rows; this replaces any earlier output file
    with open(output_directory, 'w') as file:
        json.dump(final_output, file, indent=4)


Called API 10 times
Called API 20 times
Called API 30 times
Called API 40 times
Called API 50 times
Called API 60 times
Called API 70 times
Called API 80 times
Called API 90 times
Called API 100 times
Called API 110 times
Called API 120 times
Called API 130 times
Called API 140 times
Called API 150 times
Called API 160 times
Called API 170 times
Called API 180 times
Called API 190 times
Called API 200 times
Called API 210 times
Called API 220 times
Called API 230 times
Called API 240 times
Called API 250 times
Called API 260 times
Called API 270 times
Called API 280 times
Called API 290 times
Called API 300 times
Called API 310 times
Called API 320 times
Called API 330 times
Called API 340 times
Called API 350 times
Called API 360 times
Called API 370 times
Called API 10 times
Called API 20 times
Called API 30 times
Called API 40 times
Called API 50 times
Called API 60 times
Called API 70 times
Called API 80 times
Called API 90 times
Called API 100 times
Called API 110 times
Called API

OpenAI (GPT-4o mini)

In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import json

# Pull API keys and other settings from the local .env file into the environment
load_dotenv()

list_of_years = [2017,2018,2019,2020,2021,2022,2023]  # seasons to run through the model

for year in list_of_years:
    # Collated BBC articles for the season and the file this season's ratings are saved to
    input_directory = f"../dataset/epl_sentiment_analysis/bbc_articles_collated/bbc_collated_articles_{year}.json"
    output_directory = f"../dataset/epl_sentiment_analysis/sentiment_analysis_results_openai/openai_sentiment_{year}_output.json"

    with open(input_directory, 'r') as articles_file:
        matches = json.load(articles_file)

    # Start from an empty cache and fill it from an earlier run's output, if one exists
    existing_output = []
    if os.path.exists(output_directory):
        with open(output_directory, 'r') as previous_file:
            existing_output = json.load(previous_file)

    def query_LLM_API(home_team, away_team, match_date,
                     home_team_last_match_date, away_team_last_match_date,
                     home_team_commentary, away_team_commentary):
        """Ask the gpt-4o-mini model to rate both teams ahead of one fixture.

        The request asks for a JSON-object response, so the reply text is
        parsed as-is. Returns the parsed dictionary, or None (after printing
        the error) when the request or the parsing raises.
        """
        try:
            client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

            # Fixed instructions describing the 5-point scale
            system_message = {
                'role': 'system',
                'content': """You are an advanced sentiment analysis assistant. 
                                     Your task is to analyze match commentaries in the English Premier League. 
                                     The sentiment rating should be based on a 5-point scale: 
                                     Strongly Positive (5), Positive (4), Neutral(3), Negative (2), or Strongly Negative (1)""",
            }

            # Fixture-specific request carrying both commentaries
            user_message = {
                'role': 'user',
                'content': f"""Predict the sentiment for each team for the upcoming match based on previous match analysis.
                                     For the match on {match_date}, {home_team} (home) vs {away_team} (away).
                                     Provide JSON output with keys: home_team_sentiment, away_team_sentiment.

                                     Previous matches:
                                     - {home_team} ({home_team_last_match_date}): {home_team_commentary}
                                     - {away_team} ({away_team_last_match_date}): {away_team_commentary}""",
            }

            completion = client.chat.completions.create(
                model="gpt-4o-mini",
                response_format={"type": "json_object"},  # reply must be a JSON object
                messages=[system_message, user_message],
                temperature=0.3,
            )

            reply_text = completion.choices[0].message.content
            parsed_reply = json.loads(reply_text)
            return parsed_reply

        except Exception as error:
            print(f"Error: {error}")
            return None

    # For every fixture of the season: reuse a rating cached by an earlier run,
    # or ask the LLM for a new one, then write all rows to the output file.
    final_output = []  # rows written to the output file, in input order
    count = 0  # number of API calls made (skipped and cached fixtures are not counted)

    # Index the earlier results that already hold BOTH ratings by match id, so
    # the cache check is one dictionary lookup per fixture instead of two scans
    # of the whole list. setdefault keeps the first complete entry should an id
    # ever appear more than once.
    completed_results = {}
    for existing_match in existing_output:
        if existing_match.get('home_team_sentiment') is not None and existing_match.get('away_team_sentiment') is not None:
            completed_results.setdefault(existing_match['id'], existing_match)

    for match in matches:
        match_id = match['id']  # deliberately not `id`, which would shadow the builtin
        home_team = match['home_team']
        away_team = match['away_team']
        date = match['date']
        home_team_last_match_date = match['home_team_last_match_date']
        away_team_last_match_date = match['away_team_last_match_date']
        home_team_commentary = match['bbc_article_home_team_last_match']
        away_team_commentary = match['bbc_article_away_team_last_match']

        # Fixtures lacking either team's article are left out of the output
        if not home_team_commentary or not away_team_commentary:
            continue

        # Both ratings already exist from an earlier run: carry the entry over
        # unchanged and do not call the API again
        if match_id in completed_results:
            final_output.append(completed_results[match_id])
            continue

        count += 1
        if count % 10 == 0:  # progress message every 10 API calls
            print(f"Called API {count} times")

        # Call the function to query the LLM API
        response = query_LLM_API(home_team, away_team, date, home_team_last_match_date, away_team_last_match_date, home_team_commentary, away_team_commentary)

        if not isinstance(response, dict):
            # The call or the JSON parsing failed; store None so a later run retries
            print(f"Iteration {count}: Error in API response.")
            response = {}
        elif 'home_team_sentiment' not in response or 'away_team_sentiment' not in response:
            # A reply with unexpected keys must not abort the whole season with a
            # KeyError before anything has been written
            print(f"Iteration {count}: Response is missing a sentiment key.")

        final_output.append({
            'id': match_id,
            'home_team': home_team,
            'away_team': away_team,
            'date': date,
            'home_team_sentiment': response.get('home_team_sentiment'),
            'away_team_sentiment': response.get('away_team_sentiment')
        })

    # Write the season's rows; this replaces any earlier output file
    with open(output_directory, 'w') as file:
        json.dump(final_output, file, indent=4)

Qwen

In [None]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import json

# Pull API keys and other settings from the local .env file into the environment
load_dotenv()

list_of_years = [2019,2020,2021,2022,2023]  # seasons to run through the model

for year in list_of_years:

    # Collated BBC articles for the season (a JSON list of match objects) and
    # the file this season's ratings are saved to
    input_directory = f"../dataset/epl_sentiment_analysis/bbc_articles_collated/bbc_collated_articles_{year}.json"
    output_directory = f"../dataset/epl_sentiment_analysis/sentiment_analysis_results_qwen/qwen_sentiment_{year}_output.json"

    with open(input_directory, 'r') as articles_file:
        matches = json.load(articles_file)

    # Start from an empty cache and fill it from an earlier run's output, if one exists
    existing_output = []
    if os.path.exists(output_directory):
        with open(output_directory, 'r') as previous_file:
            existing_output = json.load(previous_file)

    def query_LLM_API(home_team, away_team, match_date, home_team_last_match_date, away_team_last_match_date, home_team_commentary, away_team_commentary):
        """Ask the qwen-plus model to rate both teams ahead of one fixture.

        Args:
            home_team, away_team: team names for the upcoming fixture.
            match_date: date of the upcoming fixture.
            home_team_last_match_date, away_team_last_match_date: dates of each
                team's previous match.
            home_team_commentary, away_team_commentary: article text for each
                team's previous match, embedded in the prompt.

        Returns:
            The dictionary parsed from the JSON object in the model's reply, or
            None (after printing the error) when the request or parsing raises.
        """
        try:
            client = OpenAI(
                api_key=os.getenv("QWEN_API_KEY"), 
                base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
            )
            completion = client.chat.completions.create(
                model="qwen-plus",
                messages=[
                    {
                        'role': 'system', 
                        # Real instructions instead of the unfilled template
                        # placeholder, so the model is told the task and what
                        # each point of the 5-point scale means.
                        'content': """
                        You are an advanced sentiment analysis assistant.
                        Your task is to analyze match commentaries in the English Premier League.
                        The sentiment rating on a certain team should be based on a 5-point scale: Strongly Positive (5), Positive (4), Neutral(3), Negative (2), or Strongly Negative (1)
                        """
                    },
                    {
                        'role': 'user', 
                        'content': f"""
                        I would like you to predict the sentiment for each team for the upcoming match based on what happened in the previous match. 
                        For the upcoming match on {match_date}, {home_team} is the home team and {away_team} is the away team. I have found the BBC commentaries of both teams’ previous matches. 
                        Rate the sentiment of each team on a scale of 1-5. Reply in JSON format in the output below. Your response should be less than 100 characters.

                        A sample output is:
                        {{
                            "home_team_sentiment": 4,
                            "away_team_sentiment": 2
                        }}

                        Below are the articles:
                        The article for {home_team}’s previous match on {home_team_last_match_date}: 
                        {home_team_commentary}
                        
                        The article for {away_team}’s previous match on {away_team_last_match_date}: 
                        {away_team_commentary}.
                        """
                    }
                ],
            )

            # Extract the reply text
            response_content = completion.choices[0].message.content

            # Keep only the span from the first '{' to the last '}' so any text
            # the model wrapped around the JSON object is dropped
            response_content = response_content[response_content.find('{'):response_content.rfind('}') + 1]

            # Convert the JSON text to a dictionary
            response_dict = json.loads(response_content)

            return response_dict
            
        except Exception as e:
            # Best effort: report the failure and let the caller record None
            print(f"Error: {e}")
            return None
        

    # For every fixture of the season: reuse a rating cached by an earlier run,
    # or ask the LLM for a new one, then write all rows to the output file.

    count = 0  # number of API calls made (skipped and cached fixtures are not counted)

    final_output = []  # rows written to the output file, in input order

    # Index the earlier results that already hold BOTH ratings by match id, so
    # the cache check is one dictionary lookup per fixture instead of two scans
    # of the whole list. setdefault keeps the first complete entry should an id
    # ever appear more than once.
    completed_results = {}
    for existing_match in existing_output:
        if existing_match.get('home_team_sentiment') is not None and existing_match.get('away_team_sentiment') is not None:
            completed_results.setdefault(existing_match['id'], existing_match)

    for match in matches:

        match_id = match['id']  # deliberately not `id`, which would shadow the builtin
        home_team = match['home_team']
        away_team = match['away_team']
        date = match['date']
        home_team_last_match_date = match['home_team_last_match_date']
        away_team_last_match_date = match['away_team_last_match_date']
        home_team_commentary = match['bbc_article_home_team_last_match']
        away_team_commentary = match['bbc_article_away_team_last_match']

        # Fixtures lacking either team's article are left out of the output
        if not home_team_commentary or not away_team_commentary:
            continue

        # Both ratings already exist from an earlier run: carry the entry over
        # unchanged and do not call the API again
        if match_id in completed_results:
            final_output.append(completed_results[match_id])
            continue

        count += 1
        if count % 10 == 0:  # progress message every 10 API calls
            print(f"Called API {count} times")

        # Call the function to query the LLM API
        response = query_LLM_API(home_team, away_team, date, home_team_last_match_date, away_team_last_match_date, home_team_commentary, away_team_commentary)

        if not isinstance(response, dict):
            # The call or the JSON parsing failed; store None so a later run retries
            print(f"Iteration {count}: Error in API response.")
            response = {}
        elif 'home_team_sentiment' not in response or 'away_team_sentiment' not in response:
            # A reply with unexpected keys must not abort the whole season with a
            # KeyError before anything has been written
            print(f"Iteration {count}: Response is missing a sentiment key.")

        final_output.append({
            'id': match_id,
            'home_team': home_team,
            'away_team': away_team,
            'date': date,
            'home_team_sentiment': response.get('home_team_sentiment'),
            'away_team_sentiment': response.get('away_team_sentiment')
        })

    # Write the season's rows; this replaces any earlier output file
    with open(output_directory, 'w') as file:
        json.dump(final_output, file, indent=4)

Combining All 4 LLM Results

In [None]:
import json

years_to_combine = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]  # seasons to merge


def sentiment_difference(entry):
    """Return home minus away sentiment for one model's entry.

    Returns None when the entry itself is missing or when either rating is
    None, instead of raising TypeError.
    """
    if not entry:
        return None
    home = entry.get('home_team_sentiment')
    away = entry.get('away_team_sentiment')
    if home is None or away is None:
        return None
    return home - away


def index_by_id(entries):
    """Map each match id to the first entry in *entries* that carries it."""
    indexed = {}
    for entry in entries:
        # setdefault keeps the first occurrence, like a first-match linear search
        indexed.setdefault(entry['id'], entry)
    return indexed


def combine_match(match, openai_match, qwen_match, openai41_match):
    """Build one combined row from the DeepSeek entry and the other models' entries.

    *match* is the DeepSeek entry and supplies id, teams and date. Any of the
    other three entries may be None, in which case that model's three columns
    are None. Key order is fixed: the match fields, then for each model its
    home rating, away rating and home-minus-away difference.
    """
    combined = {
        'id': match['id'],
        'home_team': match['home_team'],
        'away_team': match['away_team'],
        'date': match['date'],
    }
    model_entries = (
        ('deepseek', match),
        ('openai', openai_match),
        ('qwen', qwen_match),
        ('openai4.1', openai41_match),
    )
    for label, entry in model_entries:
        combined[f'{label}_home_team_sentiment'] = entry.get('home_team_sentiment') if entry else None
        combined[f'{label}_away_team_sentiment'] = entry.get('away_team_sentiment') if entry else None
        combined[f'{label}_home-away_sentiment'] = sentiment_difference(entry)
    return combined


# __name__ is "__main__" both in the notebook and when run as a script, so the
# merge below still executes there; the guard only keeps the helpers importable
# without touching the dataset files.
if __name__ == "__main__":
    for year in years_to_combine:

        # The four per-model result files for this season
        deepseek_file = f"../dataset/epl_sentiment_analysis/sentiment_analysis_results_deepseek/deepseek_sentiment_{year}_output.json"
        openai_file = f"../dataset/epl_sentiment_analysis/sentiment_analysis_results_openai/openai_sentiment_{year}_output.json"
        qwen_file = f"../dataset/epl_sentiment_analysis/sentiment_analysis_results_qwen/qwen_sentiment_{year}_output.json"
        openai41_file = f"../dataset/epl_sentiment_analysis/sentiment_analysis_results_openai_gpt4.1/openai4.1_sentiment_{year}_output.json"

        with open(deepseek_file, 'r') as file:
            deepseek_matches = json.load(file)

        with open(openai_file, 'r') as file:
            openai_matches = json.load(file)

        with open(qwen_file, 'r') as file:
            qwen_matches = json.load(file)

        with open(openai41_file, 'r') as file:
            openai41_matches = json.load(file)

        # Index the other models' entries once instead of scanning each list per match
        openai_by_id = index_by_id(openai_matches)
        qwen_by_id = index_by_id(qwen_matches)
        openai41_by_id = index_by_id(openai41_matches)

        # One row per DeepSeek entry; the other models are joined on match id
        combined_output = [
            combine_match(
                match,
                openai_by_id.get(match['id']),
                qwen_by_id.get(match['id']),
                openai41_by_id.get(match['id']),
            )
            for match in deepseek_matches
        ]

        # Write the combined rows for this season
        combined_output_file = f"../dataset/epl_sentiment_analysis/sentiment_analysis_results_combined/combined_sentiment_{year}_output.json"
        with open(combined_output_file, 'w') as file:
            json.dump(combined_output, file, indent=4)
