In [0]:
# Import necessary libraries
import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json
import pyspark.sql.functions as F
from pyspark.sql.functions import explode

In [0]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("NewsSentimentAnalysis")
    .getOrCreate()
)

If the current NewsAPI key (sent in the `X-Api-Key` request header) reaches its daily request limit, we'll need to obtain a new API key and use it in place of the old one to keep the notebook working.

In [0]:
# Define a function to fetch financial news titles from NewsAPI
def fetch_financial_news_titles(api_key=None, timeout=10):
    """Fetch the current US business headlines from NewsAPI.

    Args:
        api_key: NewsAPI key sent in the ``X-Api-Key`` header. When omitted, the
            ``NEWSAPI_KEY`` environment variable is used, falling back to the
            key that was originally embedded in this notebook.
        timeout: Seconds to wait for NewsAPI before giving up, so that a stalled
            connection cannot block the caller forever.

    Returns:
        A list of headline strings (articles without a title are skipped), or
        ``None`` when the request fails, the status code is not 200, the body
        is not valid JSON, or the payload has no ``"articles"`` entry.
        The titles are returned rather than printed, because printing up to
        100 headlines on every call floods the logs when this runs as a UDF.
    """
    import os  # local import so this cell does not depend on the import cell

    if api_key is None:
        # NOTE(review): the fallback key is committed in plain text; rotate it and
        # supply the replacement through NEWSAPI_KEY instead of editing the code.
        api_key = os.environ.get("NEWSAPI_KEY") or "fcd47f104ff24845a960fe6371ef3f31"

    url = "https://newsapi.org/v2/top-headlines"
    headers = {"X-Api-Key": api_key}
    params = {
        "category": "business",  # Filter by business category for financial news
        "country": "us",         # Adjust country as needed
        "pageSize": 100          # Increase page size to get more results
    }

    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        if response.status_code != 200:
            return None
        news_data = response.json()
    except (requests.RequestException, ValueError):
        # Network failures and undecodable bodies follow the same
        # "None on failure" contract as a non-200 status.
        return None

    articles = news_data.get("articles") if isinstance(news_data, dict) else None
    if articles is None:
        return None

    # Skip entries whose title is missing, null or empty.
    return [article["title"] for article in articles if article.get("title")]


In [0]:
# Call the fetcher once to confirm that NewsAPI responds (each call is a live HTTP request).
fetch_financial_news_titles()

200
['Why has hiring plunged for white-collar jobs? - CBS News', 'Apple iPhone weakness in China is overdone, says Wells Fargo analyst - CNBC Television', 'Read the wild email Tesla is sending to suppliers amid Supercharger chaos - Electrek.co', 'Have the wheels come off for Tesla? - BBC.com', 'Chances Of Skydance Takeover Of Paramount Fades; Competing Sony Deal Reportedly Unlikely Too - TrekMovie', "Here's how much money you'd have lost if you invested $1,000 in Peloton when it went public - CNBC", 'Trump Media audit firm charged with fraud - Salon', 'Microsoft ties executive pay to security following multiple failures and breaches - Ars Technica', 'Fed-Obsessed Traders Need the Economy to Get Its Story Straight - Yahoo Finance', 'Cybertruck owner crushes finger with car door to prove odd point - SFGATE', 'Carvana stock rebound boosts fortunes of CEO and his dad by billions - Fox Business', "Apple remains Buffett's biggest public stock holding, but his thesis about its moat faces ques

In [0]:
# Wrap the fetcher as a Spark UDF that returns an array of headline strings.
# The UDF itself does not schedule anything: it performs one HTTP request each
# time Spark evaluates it. Because the result depends on a live response, it is
# marked non-deterministic; otherwise Spark is free to eliminate or duplicate
# invocations while optimising the query plan.
fetch_news_udf = F.udf(fetch_financial_news_titles, "array<string>").asNondeterministic()

In [0]:
# Schedule the function to run once per day at a specific time
# NOTE(review): `schedule` is not imported in any cell shown here, so on a fresh
# kernel this line raises NameError — confirm where it is supposed to come from.
# NOTE(review): no cell shown here calls schedule.run_pending(); presumably the
# registered job never fires without such a loop — verify against the library docs.
schedule.every().day.at("08:00").do(fetch_financial_news_titles)

In [0]:
# Display the UDF object to confirm that it was created.
fetch_news_udf

Out[24]: <function __main__.fetch_financial_news_titles()>

In [0]:
# The "rate" source emits one row per second, and every row that reaches the
# UDF triggers a NewsAPI request. Keep only one row per interval so the stream
# polls the API once a day (the first row, value 0, fires immediately) instead
# of roughly 86,400 times a day, which exhausts the daily request quota.
NEWS_FETCH_INTERVAL_SECONDS = 24 * 60 * 60

news_df = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 1)
    .load()
    # `value` is the rate source's row counter; at 1 row/s it counts seconds.
    .where(F.col("value") % NEWS_FETCH_INTERVAL_SECONDS == 0)
    .withColumn("news", fetch_news_udf())
)

In [0]:
# One output row per headline: unpack the "news" array, keeping the row timestamp.
exploded_df = news_df.select(
    F.col("timestamp"),
    F.explode(F.col("news")).alias("news"),
)

In [0]:
# Configure an append-mode Parquet sink for the exploded headlines, with the
# output path and checkpoint location set explicitly, then start the query.
parquet_writer = (
    exploded_df.writeStream
    .format("parquet")
    .outputMode("append")
    .option("path", "/FileStore/finalproject/parquet_files")
    .option("checkpointLocation", "/FileStore/finalproject/checkpoint_location")
)
query = parquet_writer.start()

# Block this cell until the streaming query terminates.
query.awaitTermination()