## Importing Necessary Libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

from transformers import pipeline

## **Apple News Dataset - 1**

> Fetching News from Apple Website for the period 2000-2025

In [2]:
BASE_URL = "https://www.apple.com"

    """Fetch a single page of Apple's newsroom archive for a given year."""

In [3]:
def fetch_year_page(year, page=1):
    url = f"{BASE_URL}/newsroom/archive/{year}/"
    if page>1:
        url += f"?page={page}"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        print(f"✅ Successfully fetched page-{page} for {year} - {url}")
        return resp.text
    else:
        print(f"❌ Failed to fetch first page for {year} (Status {resp.status_code})")
        return None

    """Find total pages from the pagination element (like '1 of 9')."""

In [4]:
def get_total_pages(soup):
    total_span = soup.find("span", class_="pagination-total")
    if total_span:
        try:
            total_pages = int(total_span.get_text(strip=True))
            return total_pages
        except:
            pass
    return 1

    """Extract headlines, dates, and links from a single page's HTML."""

In [5]:
def parse_page(html, year):
    soup = BeautifulSoup(html, "html.parser")
    results = []

    # Extract news items
    for item in soup.find_all("a", class_="result__item"):
        headline = item.find("h3", class_="item__headline").get_text(strip=True)
        date = item.find("p", class_="item__date").get_text(strip=True)
        category = item.find("p", class_="item__category").get_text(strip=True) if item.find("p", class_="item__category") else None
        link = BASE_URL + item["href"]

        results.append({"date": date, "category": category, "headline": headline, "url": link})

        return results, soup

In [6]:
def scrape_apple_newsroom(start_year=2000, end_year=2001):
    all_results = []
    for year in range(start_year, end_year + 1):
        print(f"\n🔍 Processing year: {year}")
        
        # 1️⃣ Fetch first page
        html = fetch_year_page(year, page=1)
        if not html:
            print(f"⚠️ Failed to fetch year {year}, skipping.")
            continue
        
        page_results, soup = parse_page(html, year)
        all_results.extend(page_results)
        
        # 2️⃣ Find total number of pages from "1 of N"
        total_pages = get_total_pages(soup)
        print(f"📄 Total pages for {year}: {total_pages}")
        
        # 3️⃣ Iterate remaining pages
        for page in range(2, total_pages + 1):
            time.sleep(1)
            html = fetch_year_page(year, page)
            if not html:
                print(f"⚠️ Failed to fetch page {page} for {year}, stopping.")
                break
            page_results, _ = parse_page(html, year)
            if not page_results:
                break
            all_results.extend(page_results)
            
    print(f"\n🎯 Total news items collected: {len(all_results)}")
    return pd.DataFrame(all_results)

In [7]:
df = scrape_apple_newsroom(2000, 2025)
df.to_csv("news/apple_news_2000_2025.csv", index=False)


🔍 Processing year: 2000
✅ Successfully fetched page-1 for 2000 - https://www.apple.com/newsroom/archive/2000/
📄 Total pages for 2000: 9
✅ Successfully fetched page-2 for 2000 - https://www.apple.com/newsroom/archive/2000/?page=2
✅ Successfully fetched page-3 for 2000 - https://www.apple.com/newsroom/archive/2000/?page=3
✅ Successfully fetched page-4 for 2000 - https://www.apple.com/newsroom/archive/2000/?page=4
✅ Successfully fetched page-5 for 2000 - https://www.apple.com/newsroom/archive/2000/?page=5
✅ Successfully fetched page-6 for 2000 - https://www.apple.com/newsroom/archive/2000/?page=6
✅ Successfully fetched page-7 for 2000 - https://www.apple.com/newsroom/archive/2000/?page=7
✅ Successfully fetched page-8 for 2000 - https://www.apple.com/newsroom/archive/2000/?page=8
✅ Successfully fetched page-9 for 2000 - https://www.apple.com/newsroom/archive/2000/?page=9

🔍 Processing year: 2001
✅ Successfully fetched page-1 for 2001 - https://www.apple.com/newsroom/archive/2001/
📄 Total 

In [8]:
news = pd.read_csv("news/apple_news_2000_2025.csv")

In [9]:
news.head()

Unnamed: 0,date,category,headline,url
0,"December 19, 2000",MEDIA ALERT,Apple at Macworld Expo San Francisco 2001,https://www.apple.com/newsroom/2000/12/19Apple...
1,"October 9, 2000",PRESS RELEASE,Apple Announces Mandich’s Retirement,https://www.apple.com/newsroom/2000/10/09Apple...
2,"August 10, 2000",PRESS RELEASE,Lucasfilm and Apple Bring Star Wars: Episode I...,https://www.apple.com/newsroom/2000/08/10Lucas...
3,"July 19, 2000",PRESS RELEASE,Apple Unveils All New Family of Displays to Co...,https://www.apple.com/newsroom/2000/07/19Apple...
4,"June 6, 2000",PRESS RELEASE,IDC Confirms Apple is Number One in U.S. & Wor...,https://www.apple.com/newsroom/2000/06/06IDC-C...


In [10]:
news['date'] = pd.to_datetime(news['date'], format="%B %d, %Y")

In [11]:
news = news.sort_values(by='date', ascending=True).reset_index(drop=True)
news['date'].is_monotonic_increasing

True

In [12]:
news.head()

Unnamed: 0,date,category,headline,url
0,2000-01-19,PRESS RELEASE,Apple Reports First Quarter Profit of $183 Mil...,https://www.apple.com/newsroom/2000/01/19Apple...
1,2000-02-29,PRESS RELEASE,Apple Introduces New Apple Learning Solutions ...,https://www.apple.com/newsroom/2000/02/29Apple...
2,2000-04-05,PRESS RELEASE,Apple Teams with Kanisa for e-Service,https://www.apple.com/newsroom/2000/04/05Apple...
3,2000-04-28,PRESS RELEASE,Apple Offers iMovie as Free Download for Power...,https://www.apple.com/newsroom/2000/04/28Apple...
4,2000-06-06,PRESS RELEASE,IDC Confirms Apple is Number One in U.S. & Wor...,https://www.apple.com/newsroom/2000/06/06IDC-C...


In [13]:
news.tail()

Unnamed: 0,date,category,headline,url
249,2025-06-09,PRESS RELEASE,Apple Intelligence gets even more powerful wit...,https://www.apple.com/newsroom/2025/06/apple-i...
250,2025-07-15,UPDATE,Apple expands U.S. supply chain with $500 mill...,https://www.apple.com/newsroom/2025/07/apple-e...
251,2025-08-06,PRESS RELEASE,"Apple, Corning to manufacture all iPhone, Appl...",https://www.apple.com/newsroom/2025/08/apple-c...
252,2025-09-09,PRESS RELEASE,"Apple debuts Apple Watch Series 11, featuring ...",https://www.apple.com/newsroom/2025/09/apple-d...
253,2025-09-19,PHOTOS,"The latest iPhone, Apple Watch, and AirPods Pr...",https://www.apple.com/newsroom/2025/09/the-lat...


In [14]:
news.drop(columns=["category", "url"], inplace=True)

## **Apple news dataset - 2**

[Kaggle Link](https://www.kaggle.com/datasets/frankossai/apple-stock-aapl-historical-financial-news-data)

In [15]:
apple_ds = pd.read_csv("news/apple_news_data.csv")
apple_ds.head()

Unnamed: 0,date,title,content,link,symbols,tags,sentiment_polarity,sentiment_neg,sentiment_neu,sentiment_pos
0,2024-11-27T16:39:00+00:00,Berkshire Stock Hits Record Even as Company Re...,"Warren Buffett’s caution, his advancing age, a...",https://finance.yahoo.com/m/f5df3aa4-364b-31d6...,"0R2V.IL, AAPL.BA, AAPL.MX, AAPL.NEO, AAPL.SN, ...",,0.0,0.0,1.0,0.0
1,2024-11-26T00:00:00+00:00,What Is a Stock Market Index?,What Is a Stock Market Index?,https://www.fool.com/investing/stock-market/in...,"AAPL.US, AMZN.US, MSFT.US",,0.0,0.0,1.0,0.0
2,2024-11-26T00:00:00+00:00,"Could Investing $1,000 in Apple Make You a Mil...","Could Investing $1,000 in Apple Make You a Mil...",https://www.fool.com/investing/2024/11/26/coul...,AAPL.US,,0.0,0.0,1.0,0.0
3,2024-11-26T00:00:00+00:00,Dow Jones Industrial Average,Dow Jones Industrial Average,https://www.fool.com/investing/stock-market/in...,"AAPL.US, AMGN.US, AMZN.US, CSCO.US, GOOG.US, G...",,0.0,0.0,1.0,0.0
4,2024-11-26T00:00:00+00:00,What Is the S&P 500 Index?,What Is the S&P 500 Index?,https://www.fool.com/investing/stock-market/in...,"AAPL.US, AMZN.US, GOOG.US, GOOGL.US, META.US, ...",,0.0,0.0,1.0,0.0


In [16]:
apple_ds['date'] = pd.to_datetime(apple_ds['date']).dt.tz_localize(None)

In [17]:
apple_ds['headline'] = apple_ds['title']
apple_ds['url'] = apple_ds['link']

In [18]:
apple_ds.drop(columns=["title", "link", "tags", "symbols", "content", "sentiment_polarity", "sentiment_neg", "sentiment_neu", "sentiment_pos"], inplace=True)

In [19]:
apple_ds = apple_ds[apple_ds['date'].dt.year <= 2019]

In [20]:
apple_ds = apple_ds.reset_index(drop=True)

In [21]:
apple_ds.head(2)

Unnamed: 0,date,headline,url
0,2019-06-16 23:10:00,MONDAY DEADLINE REMINDER: The Schall Law Firm ...,https://www.globenewswire.com/news-release/201...
1,2019-06-14 19:00:00,"Bronstein, Gewirtz & Grossman, LLC Class Actio...",https://www.globenewswire.com/news-release/201...


In [23]:
apple_ds = apple_ds[["date", "headline"]]

## **Apple Acquisition News Dataset - 3**

[Kaggle Link](https://www.kaggle.com/datasets/joebeachcapital/technology-mergers-and-acquisitions?select=Acquisitions.csv)

In [24]:
acq = pd.read_csv("news/Acquisitions.csv")

In [25]:
acq.head(2)

Unnamed: 0,Acquisitions ID,Acquired Company,Acquiring Company,Year of acquisition announcement,Deal announced on,Price,Status,Terms,Acquisition Profile,News,News Link
0,[24]7 acquired Tellme in 2012,Tellme,[24]7,2012,1/02/2012,Undisclosed amount,Undisclosed,Undisclosed,http://www.crunchbase.com/acquisition/a9e7a5ac...,Microsoft and 24/7 Inc. Join Forces to Deliver...,http://www.microsoft.com/en-us/news/press/2012...
1,3Com acquired Palm in 1997,Palm,3Com,1997,1/06/1997,Undisclosed amount,Undisclosed,Undisclosed,http://www.crunchbase.com/acquisition/65869a9a...,Investors bless 3Com-USR merger - CNET News,http://news.cnet.com/Investors-bless-3Com-USR-...


In [26]:
# Fetching only Apple Data

acq = acq[acq["Acquiring Company"] == "Apple"]

In [27]:
# Fetching only news and date

acq = acq[["Deal announced on", "News", "Acquisitions ID"]]

In [28]:
acq.shape

(51, 3)

In [29]:
acq.isnull().sum()

Deal announced on    0
News                 2
Acquisitions ID      0
dtype: int64

In [30]:
# Filling missing news with headlines

acq["News"] = acq["News"].fillna(acq["Acquisitions ID"])

In [31]:
acq.rename(columns={"Deal announced on" : "date", "News" : "headline"}, inplace=True)

In [32]:
acq = acq[["date", "headline"]]

In [33]:
acq['date'] = pd.to_datetime(acq['date'], format="%d/%m/%Y")

In [34]:
acq.head(2)

Unnamed: 0,date,headline
148,2013-08-28,Apple Acquires Swedish Firm AlgoTrim A Company...
149,2011-12-13,Apple Reportedly Buying Flash Memory Company A...


## **Apple News Dataset 4**

[Kaggle Link](https://www.kaggle.com/datasets/BidecInnovations/stock-price-and-news-realted-to-it?select=AppleNewsStock.csv)

In [35]:
df4 = pd.read_csv("news/AppleNewsStock.csv")

In [36]:
df4.head(2)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,News
0,2006-12-01,13.114285,13.19,12.871428,91.32,13.045714,198769900,WHAT'S ON TONIGHT : 8 P.M. (TLC) ASHLEY JUDD A...
1,2006-12-04,13.125714,13.15,12.928572,91.120003,13.017143,177384200,More on Housing Prices : The broadest governme...


In [37]:
df4 = df4[["Date", "News"]]

In [38]:
df4.rename(columns={"Date" : "date", "News" : "headline"}, inplace=True)

In [39]:
df4.head()

Unnamed: 0,date,headline
0,2006-12-01,WHAT'S ON TONIGHT : 8 P.M. (TLC) ASHLEY JUDD A...
1,2006-12-04,More on Housing Prices : The broadest governme...
2,2006-12-05,
3,2006-12-06,Honoring R.W. Apple in Words and Food : About ...
4,2006-12-07,"Homebuilders, and Worries Over Jobs, Lead a De..."


In [40]:
df4.isnull().sum()

date          0
headline    194
dtype: int64

In [41]:
df4.dropna(inplace=True)

In [42]:
df4 = df4.reset_index(drop=True)

df4['date'] = pd.to_datetime(df['date'].reset_index(drop=True))

In [43]:
df4.head()

Unnamed: 0,date,headline
0,2000-12-19,WHAT'S ON TONIGHT : 8 P.M. (TLC) ASHLEY JUDD A...
1,2000-10-09,More on Housing Prices : The broadest governme...
2,2000-08-10,Honoring R.W. Apple in Words and Food : About ...
3,2000-07-19,"Homebuilders, and Worries Over Jobs, Lead a De..."
4,2000-06-06,"Homebuilders, and Worries Over Jobs, Lead a De..."


## **Apple News Dataset - 5**

[Kaggle Link](https://www.kaggle.com/datasets/abdulkkhayyum519/nyt-apple-related-news-articles-dataset)

In [44]:
df5 = pd.read_csv("news/nyt_apple_related_news.csv")

In [45]:
df5.head(2)

Unnamed: 0,search_term,pub_date,headline,snippet
0,apple sales spike,2008-01-22,Can the Touch Revive Apple’s iPod Sales?,"Despite a 10 percent drop in Apple’s shares, t..."
1,apple sales spike,2008-07-16,Apple Sues Psystar to Block Macintosh Clone Sales,"Apple has sued Psystar, a Florida-based maker ..."


In [46]:
df5['pub_date'] = pd.to_datetime(df5['pub_date'])

In [47]:
df5.rename(columns={"pub_date" : "date"}, inplace=True)

In [48]:
df5 = df5[["date", "headline"]]

In [49]:
df5.tail(3)

Unnamed: 0,date,headline
1078,2025-01-17,Supreme Court Backs Law Requiring TikTok to Be...
1079,2025-03-21,"‘Severance’ Asks, What if We’re Not Paranoid E..."
1080,2025-04-04,"The Worst Stock Market Drop in Years, and Dr. ..."


> ## **Merging Apple Web and Dataset News**

In [50]:
df = pd.concat([news, apple_ds, acq, df4, df5])

In [51]:
df = df.sort_values(by='date', ascending=True).reset_index(drop=True)

In [52]:
df.shape

(3761, 2)

In [53]:
df.head()

Unnamed: 0,date,headline
0,1988-03-02,Apple acquired Network Innovations
1,1988-06-07,Apple acquired Orion Network Systems
2,1988-06-27,Apple acquired Styleware
3,1988-07-11,Apple acquired Nashoba Systems
4,1989-01-03,Apple acquired Coral Software


In [54]:
df['date'].dt.year.unique()

array([1988., 1989., 1996., 1997., 1999., 2000., 2001., 2002., 2003.,
       2004., 2005., 2006., 2007., 2008., 2009., 2010., 2011., 2012.,
       2013., 2014., 2015., 2016., 2017., 2018., 2019., 2020., 2021.,
       2022., 2023., 2024., 2025.,   nan])

> # **FinBERT Transformer Sentiment Analysis**

In [55]:
# Load Pre-trained FinBERT Model

finbert = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone", tokenizer="yiyanghkust/finbert-tone")




Device set to use cuda:0


In [56]:
# Function to get sentiment label
def get_finbert_sentiment(text):
    result = finbert(str(text), truncation=True, max_length=512)[0]  # returns list with dict
    return result['label'], result['score']

In [57]:
df[['sentiment_label', 'news_sentiment']] = df['headline'].apply(lambda x: pd.Series(get_finbert_sentiment(x)))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [59]:
df = df[df['date'].dt.year <= 2019]

In [73]:
df.shape

(746, 4)

In [60]:
df['sentiment_label'].value_counts()

sentiment_label
Neutral     598
Negative     82
Positive     66
Name: count, dtype: int64

In [61]:
df.tail()

Unnamed: 0,date,headline,sentiment_label,news_sentiment
741,2019-12-12,How Much Watching Time Do You Have This Weekend?,Neutral,0.999779
742,2019-12-13,How the Big Apple Circus Ringmaster Spends Her...,Neutral,0.999989
743,2019-12-18,"For Many Campaigns, the Little i’s Have It : L...",Neutral,0.999359
744,2019-12-18,"Amazon, Apple, Google, and the Zigbee Alliance...",Neutral,0.99938
745,2019-12-26,"The Watch Is Smart, but It Can’t Replace Your ...",Neutral,0.974359


In [62]:
df['date'].duplicated().sum()

np.int64(258)

In [71]:
df.to_csv("Datasets/finbert_news_extracted_sentiment.csv", index=False)

In [63]:
daily_avg_news = df.groupby('date')['news_sentiment'].mean().reset_index()

In [64]:
daily_avg_news.rename(columns={'date' : 'Date'}, inplace=True)
daily_avg_news['Date'].duplicated().sum()

np.int64(0)

In [65]:
daily_avg_news.shape

(488, 2)

In [66]:
daily_avg_news.head()

Unnamed: 0,Date,news_sentiment
0,1988-03-02,0.998569
1,1988-06-07,0.999573
2,1988-06-27,0.999008
3,1988-07-11,0.99906
4,1989-01-03,0.998938


In [67]:
# Converting to datetime index

daily_avg_news.set_index(daily_avg_news['Date'], inplace=True)

daily_avg_news.drop(columns=["Date"], inplace=True)

In [68]:
daily_avg_news.tail(2)

Unnamed: 0_level_0,news_sentiment
Date,Unnamed: 1_level_1
2019-12-18,0.99937
2019-12-26,0.974359


In [69]:
daily_avg_news.to_csv("Datasets/finbert_news_score.csv")