# Import Packages and Dependencies

In [118]:
# !pip install requests
# !pip install pandas
# !pip install beautifulsoup4
# !pip install --upgrade yfinance
# !pip install nltk
# !pip install spacy
# !pip install matplotlib
# !pip install wordcloud
# !pip install vaderSentiment
# !pip install -U textblob

In [119]:
import pandas as pd
import time
import random
import requests
import json
import re

In [120]:
# Configure pandas display for the whole notebook: no limit on the number of
# rows or columns shown, auto-detected width, and no wrapping of wide frames
# onto multiple lines.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)

# Load Dataset into a Pandas DataFrame

In [121]:
# https://www.kaggle.com/code/gpreda/bbc-news-rss-feeds?scriptVersionId=211092399

In [122]:
def save_dataframe(
    df: pd.DataFrame,
    file_path: str = 'bbc_news_updated.csv',
) -> None:
    """
    Write the news DataFrame to a CSV file, leaving out the index column.

    A confirmation line is printed on success. Any failure is caught and
    reported with a printed message instead of being raised.

    Args:
        df (pd.DataFrame): The frame to write.
        file_path (str): Destination path of the CSV file.
    """
    try:
        df.to_csv(file_path, index=False)
        print(f"News DataFrame saved to {file_path}")
    except Exception as err:
        print(f"Error: Failed to save news DataFrame: {err}")

### Load SpaCy Model

In [123]:
import spacy

# Load the small English spaCy pipeline; `nlp` is used by get_company_names
# for named-entity recognition.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    print("Spacy model 'en_core_web_sm' not found. Please run:")
    print("python -m spacy download en_core_web_sm")
    # Re-raise instead of calling exit(): exit() ends the interpreter (and a
    # notebook kernel) with a success status and no traceback, while raising
    # stops execution at this cell with the original error visible.
    raise

### Load Data

In [124]:
# Read the news dataset that every later cell works on.
try:
    df = pd.read_csv('bbc_news_updated.csv')
except FileNotFoundError:
    print("The file 'bbc_news_updated.csv' was not found.")
    # Re-raise instead of calling exit(): exit() ends the interpreter (and a
    # notebook kernel) with a success status and no traceback, while raising
    # stops execution at this cell with the original error visible.
    raise

### Extract Company Names

In [125]:
def get_company_names(text):
    """
    Extract organisation names (entities labelled 'ORG') from a text.

    Args:
        text: The text to analyse. Anything that is not a string, or a string
            containing only whitespace, produces an empty result without
            running the spaCy pipeline.

    Returns:
        str: The distinct 'ORG' entity texts joined by ", ", in order of
        first appearance; an empty string when there are none.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    # Process the text with the module-level spaCy pipeline.
    doc = nlp(text)
    org_names = (ent.text for ent in doc.ents if ent.label_ == 'ORG')

    # dict.fromkeys removes duplicates while keeping first-seen order. A set
    # has no defined order, so the joined string could differ between runs.
    return ", ".join(dict.fromkeys(org_names))

### Load / Save Company Data into JSON

In [126]:
def load_company_search_history(file_path: str = 'company_date.json') -> dict:
    """
    Load the company search history from a JSON file.

    Args:
        file_path (str): The path to the JSON file.

    Returns:
        dict: The decoded history. An empty dict is returned when the file
        does not exist yet, cannot be read or parsed, or does not contain a
        JSON object at the top level. A message is printed in each case.
    """
    try:
        # JSON text is read as UTF-8 regardless of the platform default.
        with open(file_path, 'r', encoding='utf-8') as json_file:
            history = json.load(json_file)
    except FileNotFoundError:
        # Expected on the first run, before any history has been saved.
        print(f"No company search history found at {file_path}; starting with an empty history.")
        return {}
    except Exception as e:
        print(f"Error: Failed to load company search history: {e}")
        return {}

    # Callers use the result as a mapping, so reject any other JSON value.
    if not isinstance(history, dict):
        print(
            "Error: Failed to load company search history: "
            f"expected a JSON object in {file_path}, got {type(history).__name__}"
        )
        return {}

    return history

In [127]:
def save_company_search_history(file_path: str = 'company_date.json') -> None:
    """
    Write the in-memory company search history to disk as JSON.

    The module-level ``company_search_history`` is serialised with a
    four-space indent. A confirmation line is printed on success. Any failure
    is caught and reported with a printed message instead of being raised.

    Args:
        file_path (str): Destination path of the JSON file.
    """
    try:
        with open(file_path, 'w') as out_file:
            json.dump(company_search_history, out_file, indent=4)
        print(f"Company search history saved to {file_path}")
    except Exception as err:
        print(f"Error: Failed to save company search history: {err}")

In [128]:
# Module-level cache mapping a company name to its ticker symbol; read and
# filled in by get_ticker. Starts empty when no history could be loaded.
company_search_history = load_company_search_history()

Error: Failed to load company search history: [Errno 2] No such file or directory: 'company_date.json'


### Get Stock Symbol

In [129]:
def generate_random_user_agent() -> str:
    """
    Build a randomised browser User-Agent header value.

    One platform token and one browser family are drawn at random, random
    version numbers are filled into that browser's template, and the result
    is appended to a fixed
    "Mozilla/5.0 (<platform>) AppleWebKit/537.36 (KHTML, like Gecko)" prefix.

    Returns:
        str: The assembled User-Agent string.
    """
    # Platform tokens that go inside the parentheses of the User-Agent.
    platform_tokens = [
        "Windows NT 10.0; Win64; x64",
        "Macintosh; Intel Mac OS X 10_15_7",
        "X11; Linux x86_64",
        "X11; Ubuntu; Linux x86_64",
        "X11; CrOS x86_64 14541.0.0",
        "iPhone; CPU iPhone OS 16_0 like Mac OS X",
        "Android 10; Mobile",
    ]

    # (browser family, product-token template) pairs.
    browser_templates = [
        ("Chrome", "Chrome/{major_version}.{minor_version}.{build_version}.{patch_version} Safari/537.36"),
        ("Firefox", "Firefox/{major_version}.{minor_version}"),
        ("Safari", "Version/{major_version}.0 Safari/605.1.15"),
        ("Edge", "Edg/{major_version}.{minor_version}.{build_version}.{patch_version}"),
        ("Opera", "Opera/{major_version}.0 (Windows NT 10.0; Win64; x64) Presto/2.12.388 Version/12.16"),
    ]

    platform = random.choice(platform_tokens)
    family, template = random.choice(browser_templates)

    # All four numbers are always drawn, in this order, even though some
    # templates use only a subset; str.format ignores the unused fields.
    version_fields = {
        "major_version": random.randint(70, 115),
        "minor_version": random.randint(0, 9999),
        "build_version": random.randint(0, 9999),
        "patch_version": random.randint(0, 999),
    }

    # Safari takes its major version from its own, smaller range.
    if family == "Safari":
        version_fields["major_version"] = random.randint(10, 16)

    product = template.format(**version_fields)
    return f"Mozilla/5.0 ({platform}) AppleWebKit/537.36 (KHTML, like Gecko) {product}"

In [130]:
# Counts successful Yahoo Finance search responses; used for logging only.
api_count = 0

# Number of retries allowed per lookup after the first attempt fails.
# get_ticker only reads this value; it no longer counts it down.
max_try = 3

# Timing settings, in seconds.
REQUEST_TIMEOUT_SECONDS = 30  # give up on a request that gets no response
RETRY_DELAY_SECONDS = 10      # pause before retrying a failed request
REQUEST_DELAY_SECONDS = 5     # pause after a successful request to limit the call rate

def get_ticker(company_name: str) -> str:
    """
    Retrieves the ticker symbol for a given company name.

    Results, including "not found", are stored in the module-level
    ``company_search_history`` so each name is searched at most once. A
    lookup that fails on every attempt is not stored and will be tried again
    on a later call.

    A failed attempt (non-200 status, network error, or a response body that
    is not valid JSON) is retried up to ``max_try`` times, with a pause of
    ``RETRY_DELAY_SECONDS`` between attempts. The number of attempts is
    bounded, so a persistent failure ends with an empty result instead of
    retrying forever.

    Args:
        company_name (str): The name of the company.

    Returns:
        str: The ticker symbol for the company, or an empty string if not
        found or if every attempt failed.
    """
    global api_count

    # Check if the company name is already in the search history
    if company_name in company_search_history:
        return company_search_history[company_name]

    # Set up the API request
    yfinance_url = "https://query2.finance.yahoo.com/v1/finance/search"
    # NOTE(review): parameter names are sent exactly as before; confirm the
    # endpoint honours "quotes_count" and "country".
    params = {
        "q": company_name,
        "quotes_count": 1,
        "country": "United Kingdom"
    }

    retries = max(int(max_try), 0)

    for attempt in range(retries + 1):
        is_last_attempt = attempt == retries
        print(f"Searching for: {params}")

        try:
            # A fresh random User-Agent is used for every attempt.
            response = requests.get(
                url=yfinance_url,
                params=params,
                headers={"User-Agent": generate_random_user_agent()},
                timeout=REQUEST_TIMEOUT_SECONDS,
            )

            # Check if the response was successful
            if response.status_code != 200:
                if is_last_attempt:
                    print(f"Max Try Reached: {params}")
                    return ""
                print(f"Failed to retrieve data: {response.status_code}")
                time.sleep(RETRY_DELAY_SECONDS)
                continue

            # Parse the response data; a malformed body raises ValueError
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"An exception occurred: {e}")
            if is_last_attempt:
                print(f"Max Try Reached: {params}")
                return ""
            time.sleep(RETRY_DELAY_SECONDS)
            continue

        # Increment the API count
        api_count += 1

        # Add a delay to avoid overwhelming the API, and timeout error from api
        time.sleep(REQUEST_DELAY_SECONDS)

        # Read the first quote defensively: a missing "quotes" or "symbol"
        # entry is treated as "not found" instead of raising.
        quotes = data.get("quotes") if isinstance(data, dict) else None
        first_quote = quotes[0] if quotes else None
        company_code = ""
        if isinstance(first_quote, dict):
            company_code = first_quote.get("symbol") or ""

        company_search_history[company_name] = company_code
        return company_code

    # Not reached (the last attempt always returns); kept as a safe default.
    return ""

### Validate Company Name

In [131]:
def find_and_validate_companies(row):
    """
    Keep only the companies in a row for which a ticker symbol is found.

    The row's 'company_name' value is converted to a string and split on
    ", ". Each non-empty name is looked up with get_ticker, and names without
    a symbol are dropped.

    Returns:
        tuple: Two ", "-joined strings, the kept company names and their
        ticker symbols, in matching order.
    """
    global max_try

    kept_names = []
    kept_symbols = []

    for raw_name in str(row['company_name']).split(', '):
        candidate = raw_name.strip()
        if not candidate:
            continue

        # Restore the shared retry budget before each lookup.
        max_try = 3
        ticker = get_ticker(candidate)
        if not ticker:
            continue

        kept_names.append(candidate)
        kept_symbols.append(ticker)

    # Return as comma-separated strings
    return ", ".join(kept_names), ", ".join(kept_symbols)

### Get Sentiment

In [132]:
from textblob import TextBlob

def get_sentiment(text):
    """
    Classify the sentiment of a text from its TextBlob polarity score.

    Returns:
        tuple: ``(label, polarity)``. The label is 'positive' when the
        polarity is above zero, 'negative' when it is below zero, and
        'neutral' otherwise. Input that is not a string yields
        ``('neutral', 0.0)``.
    """
    # Nothing to analyse unless the input is a string.
    if not isinstance(text, str):
        return 'neutral', 0.0

    score = TextBlob(text).sentiment.polarity
    label = 'positive' if score > 0 else 'negative' if score < 0 else 'neutral'
    return label, score

### Apply Function To Data Frame

In [133]:
# Work on an independent copy of the first 10 rows, so the column assignments
# below do not write to a slice of the loaded frame.
df = df.iloc[:10, :].copy()

if 'full_description' not in df.columns:
    print("The 'full_description' column was not found in the CSV file.")
    # Raise instead of calling exit(): exit() ends the interpreter (and a
    # notebook kernel) with a success status and no traceback.
    raise KeyError('full_description')

# Ensure all values are strings; missing descriptions become empty text
# instead of the literal string "nan".
df['full_description'] = df['full_description'].fillna('').astype(str)
df['company_name'] = df['full_description'].apply(get_company_names)

try:
    df[['validated_companies', 'stock_symbols']] = df.apply(
        find_and_validate_companies, axis=1, result_type='expand'
    )
finally:
    # Persist the ticker symbols looked up so far, even if this slow step
    # fails or is interrupted, so the next run can reuse them.
    save_company_search_history()

# get_sentiment returns a (label, score) pair for each description.
sentiment_results = df['full_description'].apply(get_sentiment)
df['news_sentiment'] = sentiment_results.apply(lambda x: x[0])
df['sentiment_score'] = sentiment_results.apply(lambda x: x[1])

Searching for: {'q': 'Yara', 'quotes_count': 1, 'country': 'United Kingdom'}
Failed to retrieve data: 429
Searching for: {'q': 'Yara', 'quotes_count': 1, 'country': 'United Kingdom'}
Failed to retrieve data: 429
Searching for: {'q': 'Yara', 'quotes_count': 1, 'country': 'United Kingdom'}
Failed to retrieve data: 429
Searching for: {'q': 'Yara', 'quotes_count': 1, 'country': 'United Kingdom'}
Max Try Reached: {'q': 'Yara', 'quotes_count': 1, 'country': 'United Kingdom'}
Searching for: {'q': 'EXPLAINED', 'quotes_count': 1, 'country': 'United Kingdom'}
Failed to retrieve data: 429
Searching for: {'q': 'EXPLAINED', 'quotes_count': 1, 'country': 'United Kingdom'}
Searching for: {'q': 'Yara International', 'quotes_count': 1, 'country': 'United Kingdom'}
Failed to retrieve data: 429
Searching for: {'q': 'Yara International', 'quotes_count': 1, 'country': 'United Kingdom'}
Failed to retrieve data: 429
Searching for: {'q': 'Yara International', 'quotes_count': 1, 'country': 'United Kingdom'}
Fa

KeyboardInterrupt: 

### Save CSV

In [36]:
# Save the processed DataFrame to a new CSV file. save_dataframe prints a
# confirmation on success and an error message on failure.
output_filename = 'bbc_news_with_sentiment_and_companies.csv'
save_dataframe(df, output_filename)

News DataFrame saved to bbc_news_with_sentiment_and_companies.csv


In [37]:
# Report where the output was written and preview the first five rows.
# Only the title, extracted company names and sentiment label are shown.
print(f"Successfully processed the data and saved it to '{output_filename}'")
print("\nHere are the first 5 rows of the updated data with company names and sentiment:")
print(df[['title', 'company_name', 'news_sentiment']].head())

Successfully processed the data and saved it to 'bbc_news_with_sentiment_and_companies.csv'

Here are the first 5 rows of the updated data with company names and sentiment:
                                               title                                       company_name news_sentiment
0         Ukraine war 'catastrophic for global food'  Yara, EXPLAINED, Yara International, BBC, Nutr...       positive
1  Ukraine conflict: Oil price soars to highest l...  EXPLAINED, the White House, EU, BBC, FTSE, Dow...       positive
2  TikTok limits services as Netflix pulls out of...  American Express, EXPLAINED, Meta, BBC, Kremli...       negative
3     Five ways the Ukraine war could push up prices  Volkswagen, Toyota, the US Federal Reserve, EU...       negative
4  The sophisticated tech predicting if an advert...                                                           neutral
