In [52]:
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import download
import re
from collections import defaultdict
from datetime import datetime
download('vader_lexicon')  # Download the VADER lexicon for sentiment analysis

# Initialize the SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()


# Initialize the offset
start_date = datetime.strptime('2022-11-21', '%Y-%m-%d')
end_date = datetime.strptime('2023-01-08', '%Y-%m-%d')
offset = 500
def analyze_sentiment(link):
    driver.get(link)
    article_html = driver.page_source
    soup = BeautifulSoup(article_html, 'html.parser')
    
    # Find all paragraphs in the article.
    paragraphs = soup.find_all('p', attrs={"data-testid": re.compile(r"paragraph-\d+")})
    article_text = " ".join([para.get_text() for para in paragraphs])
    # Get the sentiment score for the article text.
    sentiment_score = sia.polarity_scores(article_text)
    return sentiment_score
def average_sentiment(sentiments):
    averages = {k: sum(v)/len(v) if v else 0 for k, v in sentiments.items()}
    return averages
sentiment_by_date = defaultdict(lambda: {'neg': [], 'neu': [], 'pos': []})
while True:
    # Construct the URL with the current offset
    driver = webdriver.Safari()
    url = f'https://www.reuters.com/site-search/?query=tech+news&date=any_time&offset={offset}'
    
    # Navigate to the page
    driver.get(url)
    
    # Now parse the page with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Find all the news items
    news_items = soup.find_all('div', attrs={"data-testid": "MediaStoryCard"})
    
    # Check if news_items is empty, if so, break the loop
    if not news_items:
        print(url)
        print("No more articles found. Ending scrape.")
        break
    for item in news_items:
        # Extract the news item's title, link, and date
        title_tag = item.find('a', attrs={"data-testid": "Link"})
        date_tag = item.find('time', attrs={"data-testid": "Body"})
        date = date_tag.get_text(strip=True) if date_tag else 'No Date'
        article_date = datetime.strptime(date, '%B %d, %Y')
        if article_date<=end_date:
            title = title_tag.get_text(strip=True) if title_tag else 'No Title'
            link = title_tag['href'] if title_tag else 'No Link'
            sentiment = analyze_sentiment('https://www.reuters.com'+link)

            for key in ['neg', 'neu', 'pos']:
                sentiment_by_date[date][key].append(sentiment[key])
            print(date)
            print(sentiment)
        if article_date<=start_date:
            break
    offset+=20
    driver.quit()
    if article_date<=start_date:
        break


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/zengboyuan/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


January 8, 2023
{'neg': 0.094, 'neu': 0.814, 'pos': 0.092, 'compound': 0.2617}
January 8, 2023
{'neg': 0.039, 'neu': 0.889, 'pos': 0.073, 'compound': 0.9443}
January 6, 2023
{'neg': 0.051, 'neu': 0.803, 'pos': 0.146, 'compound': 0.9995}
January 5, 2023
{'neg': 0.053, 'neu': 0.88, 'pos': 0.067, 'compound': -0.2382}
January 5, 2023
{'neg': 0.053, 'neu': 0.816, 'pos': 0.131, 'compound': 0.9912}
January 5, 2023
{'neg': 0.074, 'neu': 0.871, 'pos': 0.055, 'compound': -0.8554}
January 5, 2023
{'neg': 0.095, 'neu': 0.852, 'pos': 0.052, 'compound': -0.875}
December 29, 2022
{'neg': 0.101, 'neu': 0.777, 'pos': 0.122, 'compound': 0.8097}
December 28, 2022
{'neg': 0.068, 'neu': 0.826, 'pos': 0.106, 'compound': 0.9409}
December 23, 2022
{'neg': 0.074, 'neu': 0.819, 'pos': 0.107, 'compound': 0.9737}
December 22, 2022
{'neg': 0.049, 'neu': 0.882, 'pos': 0.07, 'compound': 0.8519}
December 22, 2022
{'neg': 0.061, 'neu': 0.911, 'pos': 0.028, 'compound': -0.9849}
December 21, 2022
{'neg': 0.02, 'neu': 0.

In [53]:
sentiment_by_date

defaultdict(<function __main__.<lambda>()>,
            {'January 8, 2023': {'neg': [0.094, 0.039],
              'neu': [0.814, 0.889],
              'pos': [0.092, 0.073]},
             'January 6, 2023': {'neg': [0.051],
              'neu': [0.803],
              'pos': [0.146]},
             'January 5, 2023': {'neg': [0.053, 0.053, 0.074, 0.095],
              'neu': [0.88, 0.816, 0.871, 0.852],
              'pos': [0.067, 0.131, 0.055, 0.052]},
             'December 29, 2022': {'neg': [0.101],
              'neu': [0.777],
              'pos': [0.122]},
             'December 28, 2022': {'neg': [0.068],
              'neu': [0.826],
              'pos': [0.106]},
             'December 23, 2022': {'neg': [0.074],
              'neu': [0.819],
              'pos': [0.107]},
             'December 22, 2022': {'neg': [0.049, 0.061],
              'neu': [0.882, 0.911],
              'pos': [0.07, 0.028]},
             'December 21, 2022': {'neg': [0.02, 0.094, 0.08],
            

In [49]:
first_half

defaultdict(<function __main__.<lambda>()>,
            {'November 8, 2023': {'neg': [0.084,
               0.06,
               0.0,
               0.034,
               0.009,
               0.012,
               0.032],
              'neu': [0.827, 0.802, 0.919, 0.846, 0.939, 0.915, 0.883],
              'pos': [0.089, 0.138, 0.081, 0.12, 0.052, 0.073, 0.086]},
             'November 7, 2023': {'neg': [0.006],
              'neu': [0.885],
              'pos': [0.109]},
             'November 6, 2023': {'neg': [0.099, 0.025],
              'neu': [0.786, 0.869],
              'pos': [0.115, 0.105]},
             'November 3, 2023': {'neg': [0.052, 0.07],
              'neu': [0.783, 0.912],
              'pos': [0.165, 0.018]},
             'November 2, 2023': {'neg': [0.066, 0.056],
              'neu': [0.866, 0.881],
              'pos': [0.068, 0.063]},
             'November 1, 2023': {'neg': [0.094, 0.0, 0.117, 0.0],
              'neu': [0.865, 0.928, 0.827, 0.936],
         

In [60]:
import pandas as pd
dic = {'date': [], 'neg':[], 'neu':[], 'pos':[]}
for date, sentiments in sentiment_by_date.items():
    averages = average_sentiment(sentiments)
    dic['date'].append(date)
    dic['neg'].append(averages['neg'])
    dic['neu'].append(averages['neu'])
    dic['pos'].append(averages['pos'])
df = pd.DataFrame(dic)
print(df)

                 date       neg       neu       pos
0     January 8, 2023  0.066500  0.851500  0.082500
1     January 6, 2023  0.051000  0.803000  0.146000
2     January 5, 2023  0.068750  0.854750  0.076250
3   December 29, 2022  0.101000  0.777000  0.122000
4   December 28, 2022  0.068000  0.826000  0.106000
5   December 23, 2022  0.074000  0.819000  0.107000
6   December 22, 2022  0.055000  0.896500  0.049000
7   December 21, 2022  0.064667  0.868000  0.067333
8   December 19, 2022  0.079000  0.827000  0.095000
9   December 18, 2022  0.040000  0.903000  0.056000
10  December 16, 2022  0.099000  0.823000  0.078000
11  December 15, 2022  0.049667  0.844000  0.106000
12  December 14, 2022  0.024000  0.876500  0.099000
13  December 13, 2022  0.030750  0.902500  0.066750
14  December 12, 2022  0.036200  0.891200  0.072400
15  December 10, 2022  0.020000  0.905000  0.075000
16   December 9, 2022  0.020000  0.900000  0.080000
17   December 8, 2022  0.045000  0.877000  0.078250
18   Decembe

In [61]:
dic = {'date': [], 'neg':[], 'neu':[], 'pos':[]}
for date, sentiments in first_half.items():
    averages = average_sentiment(sentiments)
    dic['date'].append(date)
    dic['neg'].append(averages['neg'])
    dic['neu'].append(averages['neu'])
    dic['pos'].append(averages['pos'])
df1 = pd.DataFrame(dic)
print(df1)

                 date      neg       neu       pos
0    November 8, 2023  0.03300  0.875857  0.091286
1    November 7, 2023  0.00600  0.885000  0.109000
2    November 6, 2023  0.06200  0.827500  0.110000
3    November 3, 2023  0.06100  0.847500  0.091500
4    November 2, 2023  0.06100  0.873500  0.065500
..                ...      ...       ...       ...
218  January 13, 2023  0.03540  0.866600  0.098000
219  January 12, 2023  0.05725  0.838000  0.104750
220  January 10, 2023  0.04075  0.876250  0.083000
221   January 9, 2023  0.02500  0.913000  0.062000
222   January 8, 2023  0.09400  0.814000  0.092000

[223 rows x 4 columns]


In [66]:
final_df = df1[:222].append(df)
final_df.to_csv()

  final_df = df1[:222].append(df)


',date,neg,neu,pos\n0,"November 8, 2023",0.03300000000000001,0.8758571428571429,0.09128571428571428\n1,"November 7, 2023",0.006,0.885,0.109\n2,"November 6, 2023",0.062,0.8275,0.11\n3,"November 3, 2023",0.061,0.8475,0.0915\n4,"November 2, 2023",0.061,0.8734999999999999,0.0655\n5,"November 1, 2023",0.052750000000000005,0.889,0.058249999999999996\n6,"October 30, 2023",0.059,0.902,0.039\n7,"October 29, 2023",0.036,0.851,0.113\n8,"October 27, 2023",0.05875,0.8280000000000001,0.113\n9,"October 26, 2023",0.041333333333333326,0.8700000000000001,0.08866666666666667\n10,"October 25, 2023",0.0865,0.813,0.101\n11,"October 24, 2023",0.05600000000000001,0.85275,0.09125\n12,"October 23, 2023",0.01,0.86,0.129\n13,"October 20, 2023",0.034,0.934,0.031\n14,"October 19, 2023",0.0545,0.8545,0.0905\n15,"October 18, 2023",0.044,0.89,0.066\n16,"October 17, 2023",0.114,0.758,0.127\n17,"October 16, 2023",0.025,0.872,0.103\n18,"October 13, 2023",0.058,0.87275,0.069\n19,"October 12, 2023",0.11850000000000001,0.79