In [1]:
import yfinance as yf
from newsapi import NewsApiClient
import pandas as pd
import numpy as np
from datetime import datetime, timezone
import pytz
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch



In [2]:
# Init
newsapi = NewsApiClient(api_key='2308235029104f24b8b883942519253a')
newsdata = newsapi.get_everything(q='microsoft',
                                    #   sources='bbc-news,the-verge',
                                    #   domains='bbc.co.uk,techcrunch.com',
                                      from_param='2025-06-04',
                                      to='2025-07-04',
                                      language='en',
                                    #   sort_by='relevancy',
                                    )

In [3]:
newsdata

{'status': 'ok',
 'totalResults': 9481,
 'articles': [{'source': {'id': 'wired', 'name': 'Wired'},
   'author': 'Kylie Robison',
   'title': 'OpenAI’s Unreleased AGI Paper Could Complicate Microsoft Negotiations',
   'description': 'The partnership between OpenAI and Microsoft in many ways hinges on the definition of artificial general intelligence, creating a tension that has spilled over into OpenAI research that has not been made public.',
   'url': 'https://www.wired.com/story/openai-five-levels-agi-paper-microsoft-negotiations/',
   'urlToImage': 'https://media.wired.com/photos/6837480c7f38ebcc93957426/191:100/w_1280,c_limit/AI-Labs-Promote-Bottom-Line-Business-2213399161.jpg',
   'publishedAt': '2025-06-27T19:35:29Z',
   'content': 'A small clause inside OpenAIs contract with Microsoft, once considered a distant hypothetical, has now become a flashpoint in one of the biggest partnerships in tech.\r\nThe clause states that if OpenA… [+3289 chars]'},
  {'source': {'id': 'the-verge'

In [4]:
msft = pd.read_csv("msftdataHOURLY.csv")
msft.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2025-06-04 13:30:00+00:00,464.920105,465.640015,463.020111,464.0,3207738
1,2025-06-04 14:30:00+00:00,464.631897,465.149994,463.769989,464.920013,1513356
2,2025-06-04 15:30:00+00:00,463.76001,465.089996,463.73999,464.660004,948127
3,2025-06-04 16:30:00+00:00,464.700012,464.820007,463.665008,463.73999,1090878
4,2025-06-04 17:30:00+00:00,464.76001,465.100006,464.170013,464.690002,767012


In [5]:
# Initialize FinBERT for sentiment analysis
def initialize_finbert():
    """Initialize FinBERT model for financial sentiment analysis"""
    model_name = "ProsusAI/finbert"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    
    # Create sentiment pipeline
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model=model,
        tokenizer=tokenizer,
        return_all_scores=True
    )
    
    return sentiment_pipeline

def get_sentiment_score(text, sentiment_pipeline):
    """
    Get sentiment score from FinBERT (-1 to +1 scale)
    FinBERT outputs: positive, negative, neutral
    We'll convert to: positive=+1, negative=-1, neutral=0
    """
    try:
        # Truncate text to avoid token limit issues
        text = text[:512] if len(text) > 512 else text
        
        results = sentiment_pipeline(text)
        
        # FinBERT returns scores for all labels
        sentiment_scores = {result['label'].lower(): result['score'] for result in results[0]}
        
        # Convert to -1 to +1 scale
        if 'positive' in sentiment_scores and 'negative' in sentiment_scores:
            pos_score = sentiment_scores.get('positive', 0)
            neg_score = sentiment_scores.get('negative', 0)
            neutral_score = sentiment_scores.get('neutral', 0)
            
            # Weighted score: positive contributes +1, negative contributes -1
            weighted_score = (pos_score * 1) + (neg_score * -1) + (neutral_score * 0)
            return weighted_score
        else:
            return 0.0
            
    except Exception as e:
        print(f"Error processing sentiment: {e}")
        return 0.0

def process_news_data(newsdata):
    """
    Extract articles from newsdata dict and process them
    """
    articles = []
    
    if 'articles' in newsdata:
        for article in newsdata['articles']:
            # Extract relevant information
            article_data = {
                'title': article.get('title', ''),
                'description': article.get('description', ''),
                'content': article.get('content', ''),
                'publishedAt': article.get('publishedAt', ''),
                'source': article.get('source', {}).get('name', ''),
                'url': article.get('url', '')
            }
            
            # Combine title, description, and content for sentiment analysis
            full_text = f"{article_data['title']} {article_data['description']} {article_data['content']}"
            article_data['full_text'] = full_text.strip()
            
            # Convert publishedAt to datetime
            try:
                article_data['datetime'] = pd.to_datetime(article_data['publishedAt'])
            except:
                article_data['datetime'] = None
                
            articles.append(article_data)
    
    return pd.DataFrame(articles)

def create_hourly_sentiment_data(news_df, msft_datetimes, sentiment_pipeline):
    """
    Create hourly sentiment data aligned with MSFT datetime index
    """
    # Initialize sentiment data with zeros
    sentiment_data = pd.DataFrame({
        'datetime': msft_datetimes,
        'sentiment_score': 0.0
    })
    
    # Process each news article
    for idx, row in news_df.iterrows():
        if row['datetime'] is not None:
            # Get sentiment score
            sentiment_score = get_sentiment_score(row['full_text'], sentiment_pipeline)
            
            # Find the corresponding hour in MSFT data
            article_hour = row['datetime'].floor('H')
            
            # Find matching datetime in sentiment_data
            matching_rows = sentiment_data[sentiment_data['datetime'].dt.floor('H') == article_hour]
            
            if not matching_rows.empty:
                # If multiple articles in same hour, average the sentiment
                current_sentiment = sentiment_data.loc[matching_rows.index[0], 'sentiment_score']
                if current_sentiment == 0:
                    sentiment_data.loc[matching_rows.index[0], 'sentiment_score'] = sentiment_score
                else:
                    # Average with existing sentiment
                    sentiment_data.loc[matching_rows.index[0], 'sentiment_score'] = (current_sentiment + sentiment_score) / 2
    
    return sentiment_data

def process_msft_data(msft_df):
    """
    Process MSFT dataframe to have proper datetime and only Close prices
    """
    # Create a copy to avoid modifying original
    msft_processed = msft_df.copy()
    
    # Reset index to make datetime a column if it's currently the index
    if msft_processed.index.name is not None or isinstance(msft_processed.index, pd.DatetimeIndex):
        msft_processed = msft_processed.reset_index()
    
    # Find datetime column (could be index or a column)
    datetime_col = None
    for col in msft_processed.columns:
        if 'datetime' in col.lower() or 'time' in col.lower() or 'date' in col.lower():
            datetime_col = col
            break
    
    if datetime_col is None and msft_processed.index.name:
        # Datetime might be in index
        msft_processed = msft_processed.reset_index()
        datetime_col = msft_processed.columns[0]
    
    # Convert datetime column to proper datetime
    if datetime_col:
        msft_processed['datetime'] = pd.to_datetime(msft_processed[datetime_col])
    else:
        # If no datetime column found, assume index is datetime
        msft_processed['datetime'] = pd.to_datetime(msft_processed.index)
    
    # Keep only datetime and Close columns
    if 'Close' in msft_processed.columns:
        msft_clean = msft_processed[['datetime', 'Close']].copy()
    elif 'PriceClose' in msft_processed.columns:
        msft_clean = msft_processed[['datetime', 'PriceClose']].copy()
        msft_clean.rename(columns={'PriceClose': 'Close'}, inplace=True)
    else:
        # Try to find close price column
        close_cols = [col for col in msft_processed.columns if 'close' in col.lower()]
        if close_cols:
            msft_clean = msft_processed[['datetime', close_cols[0]]].copy()
            msft_clean.rename(columns={close_cols[0]: 'Close'}, inplace=True)
        else:
            raise ValueError("Could not find Close price column in MSFT data")
    
    return msft_clean

def create_final_dataset(newsdata, msft_df):
    """
    Main function to create the final dataset with MSFT prices and sentiment scores
    """
    print("Step 1: Initializing FinBERT model...")
    sentiment_pipeline = initialize_finbert()
    
    print("Step 2: Processing news data...")
    news_df = process_news_data(newsdata)
    print(f"Found {len(news_df)} news articles")
    
    print("Step 3: Processing MSFT data...")
    msft_clean = process_msft_data(msft_df)
    print(f"MSFT data shape: {msft_clean.shape}")
    
    print("Step 4: Creating sentiment data aligned with MSFT timestamps...")
    sentiment_data = create_hourly_sentiment_data(news_df, msft_clean['datetime'], sentiment_pipeline)
    
    print("Step 5: Merging MSFT data with sentiment data...")
    final_df = pd.merge(msft_clean, sentiment_data, on='datetime', how='left')
    
    # Fill any missing sentiment scores with 0
    final_df['sentiment_score'] = final_df['sentiment_score'].fillna(0)
    
    print("Step 6: Final dataset created!")
    print(f"Final dataset shape: {final_df.shape}")
    print(f"Non-zero sentiment scores: {(final_df['sentiment_score'] != 0).sum()}")
    
    return final_df, msft_clean, sentiment_data


In [6]:
final_dataset, msft_processed, sentiment_df = create_final_dataset(newsdata, msft)
print(final_dataset.head(10))
print("\nDataset Info:")
print(final_dataset.info())
print("\nSentiment Score Statistics:")
print(final_dataset['sentiment_score'].describe())


Step 1: Initializing FinBERT model...


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Device set to use cuda:0
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Step 2: Processing news data...
Found 100 news articles
Step 3: Processing MSFT data...
MSFT data shape: (143, 2)
Step 4: Creating sentiment data aligned with MSFT timestamps...


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

  article_hour = row['datetime'].floor('H')
  matching_rows = sentiment_data[sentiment_data['datetime'].dt.floor('H') == article_hour]
  article_hour = row['datetime'].floor('H')
  matching_rows = sentiment_data[sentiment_data['datetime'].dt.floor('H') == article_hour]
  article_hour = row['datetime'].floor('H')
  matching_rows = sentiment_data[sentiment_data['datetime'].dt.floor('H') == article_hour]
  article_hour = row['datetime'].floor('H')
  matching_rows = sentiment_data[sentiment_data['datetime'].dt.floor('H') == article_hour]
  article_hour = row['datetime'].floor('H')
  matching_rows = sentiment_data[sentiment_data['datetime'].dt.floor('H') == article_hour]
  article_hour = row['datetime'].floor('H')
  matching_rows = sentiment_data[sentiment_data['datetime'].dt.floor('H') == article_hour]
  article_hour = row['datetime'].floor('H')
  matching_rows = sentiment_data[sentiment_data['datetime'].dt.floor('H') == article_hour]
  article_hour = row['datetime'].floor('H')
  matching_

Step 5: Merging MSFT data with sentiment data...
Step 6: Final dataset created!
Final dataset shape: (143, 3)
Non-zero sentiment scores: 41
                   datetime       Close  sentiment_score
0 2025-06-04 13:30:00+00:00  464.920105         0.000000
1 2025-06-04 14:30:00+00:00  464.631897         0.000000
2 2025-06-04 15:30:00+00:00  463.760010         0.865551
3 2025-06-04 16:30:00+00:00  464.700012         0.000000
4 2025-06-04 17:30:00+00:00  464.760010         0.000000
5 2025-06-04 18:30:00+00:00  464.359985         0.000000
6 2025-06-04 19:30:00+00:00  463.889008         0.000000
7 2025-06-05 13:30:00+00:00  466.404999         0.000000
8 2025-06-05 14:30:00+00:00  467.640015         0.000000
9 2025-06-05 15:30:00+00:00  468.184998         0.000000

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----             

  article_hour = row['datetime'].floor('H')
  matching_rows = sentiment_data[sentiment_data['datetime'].dt.floor('H') == article_hour]
  article_hour = row['datetime'].floor('H')
  matching_rows = sentiment_data[sentiment_data['datetime'].dt.floor('H') == article_hour]


In [7]:
final_dataset.to_csv("final_dataset_with_sentiment.csv", index=False)