In [4]:
import yfinance as yf
import pandas as pd
import requests
from datetime import datetime, timedelta
import re
from transformers import BertTokenizer, BertForSequenceClassification
import torch


# Data Collection

## Stock Data

In [2]:
# Define your tech stocks list
tech_stocks = ["AAPL", "MSFT", "NVDA", "AMZN", "GOOGL", "META", "TSLA", "AMD", "NFLX", "AVGO"]

# Function to fetch historical stock data
def fetch_stock_data(stocks, period="1y"):
    stock_data = {}
    for stock in stocks:
        df = yf.download(stock, period=period, interval="1d")  # Daily data
        stock_data[stock] = df
        print(f"Downloaded {stock} data.")
    return stock_data

# Fetch stock data
stock_data = fetch_stock_data(tech_stocks)

# Save each stock's data as CSV
for stock, df in stock_data.items():
    df.to_csv(f"{stock}_historical_data.csv")

print("Stock data collection complete and saved to CSV files.")


YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded AAPL data.
Downloaded MSFT data.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded NVDA data.
Downloaded AMZN data.
Downloaded GOOGL data.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Downloaded META data.
Downloaded TSLA data.
Downloaded AMD data.



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Downloaded NFLX data.
Downloaded AVGO data.
Stock data collection complete and saved to CSV files.





## Check Missing Value

In [3]:
df = pd.read_csv("AAPL_historical_data.csv")  # Replace with any stock CSV file
print(df.info())  # Check data types and missing values
print(df.head())  # Preview first few rows


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Price   252 non-null    object
 1   Close   251 non-null    object
 2   High    251 non-null    object
 3   Low     251 non-null    object
 4   Open    251 non-null    object
 5   Volume  251 non-null    object
dtypes: object(6)
memory usage: 11.9+ KB
None
        Price               Close                High                 Low  \
0      Ticker                AAPL                AAPL                AAPL   
1        Date                 NaN                 NaN                 NaN   
2  2024-02-20   180.7067108154297  181.57261717237213   179.1540448566047   
3  2024-02-21   181.4631805419922  182.03049388623003  179.81097819372195   
4  2024-02-22  183.50350952148438  184.09074823448796  181.60249751065342   

                 Open    Volume  
0                AAPL      AAPL  
1                 NaN       NaN

## News Data

In [5]:
# Your NewsAPI Key
api_key = "31dbe344e851496e950dc899ab1d0e93"

# List of tech stocks to search for
tech_stocks = ["Apple", "Microsoft", "Nvidia", "Amazon", "Google", "Meta", "Tesla", "AMD", "Netflix", "Broadcom"]

# Date range for news (last 7 days)
end_date = datetime.today().strftime('%Y-%m-%d')
start_date = (datetime.today() - timedelta(days=7)).strftime('%Y-%m-%d')

# Function to fetch news for each stock
def fetch_news(stock):
    url = f"https://newsapi.org/v2/everything?q={stock}&language=en&from={start_date}&to={end_date}&sortBy=publishedAt&apiKey={api_key}"
    response = requests.get(url)
    return response.json()

# Collect news for all stocks
news_data = []
for stock in tech_stocks:
    data = fetch_news(stock)
    if "articles" in data:
        for article in data["articles"]:
            news_data.append({
                "Stock": stock,
                "Title": article["title"],
                "Source": article["source"]["name"],
                "Published At": article["publishedAt"],
                "URL": article["url"]
            })

# Convert to DataFrame
df_news = pd.DataFrame(news_data)

# Save to CSV
df_news.to_csv("tech_stock_news.csv", index=False)

print("News data collected and saved to 'tech_stock_news.csv'.")


News data collected and saved to 'tech_stock_news.csv'.


In [6]:
# Load the collected news data
df_news = pd.read_csv("tech_stock_news.csv")

# Display first few rows
print(df_news.head())


   Stock                                              Title          Source  \
0  Apple  Tudor Financial Inc. Grows Stock Holdings in A...  ETF Daily News   
1  Apple  iPhone SE 4 live: latest news and rumors ahead...    Slashdot.org   
2  Apple  Apple’s February 2025 Event. 6 Biggest Predict...   Geeky Gadgets   
3  Apple             How Eric Adams Got His Charges Dropped  Slate Magazine   
4  Apple  Apple’s political turn is leading it down a sk...        Macworld   

           Published At                                                URL  
0  2025-02-18T10:32:53Z  https://www.etfdailynews.com/2025/02/18/tudor-...  
1  2025-02-18T10:32:40Z  https://slashdot.org/firehose.pl?op=view&amp;i...  
2  2025-02-18T10:30:56Z  https://www.geeky-gadgets.com/apples-february-...  
3  2025-02-18T10:30:00Z  https://slate.com/podcasts/what-next/2025/02/w...  
4  2025-02-18T10:30:00Z  https://www.macworld.com/article/2611168/apple...  


In [7]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.strip()  # Remove leading/trailing spaces
    return text

# Apply cleaning
df_news["Cleaned_Title"] = df_news["Title"].apply(clean_text)

# Save cleaned data
df_news.to_csv("cleaned_tech_stock_news.csv", index=False)

print("Data cleaned and saved.")


Data cleaned and saved.


# Sentiment Analysis

## FinBERT model

In [1]:
# Load FinBERT tokenizer & model
tokenizer = BertTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = BertForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")


In [2]:
import torch.nn.functional as F

def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    probs = F.softmax(outputs.logits, dim=-1)

    # Labels: 0 = Negative, 1 = Neutral, 2 = Positive
    labels = ["Negative", "Neutral", "Positive"]
    sentiment = labels[torch.argmax(probs).item()]
    
    return sentiment

# Test on an example headline
example_headline = "Apple stock soars as earnings beat expectations"
print(f"Sentiment: {predict_sentiment(example_headline)}")


Sentiment: Neutral


In [5]:
df_news = pd.read_csv("cleaned_tech_stock_news.csv")

# Apply sentiment analysis to each headline
df_news["Sentiment"] = df_news["Cleaned_Title"].apply(predict_sentiment)

# Save results
df_news.to_csv("news_sentiment_analysis.csv", index=False)

print("Sentiment analysis complete. Results saved.")


Sentiment analysis complete. Results saved.


In [6]:
# Load the dataset with sentiment analysis results
df_news = pd.read_csv("news_sentiment_analysis.csv")

# Count the occurrences of each sentiment category
sentiment_counts = df_news["Sentiment"].value_counts()

# Display results
print(sentiment_counts)


Sentiment
Negative    773
Positive    105
Neutral      91
Name: count, dtype: int64


# Chart Analysis

In [1]:
import pandas as pd
import mplfinance as mpf

# Load stock price data (Replace with actual filename)
df = pd.read_csv("AAPL_historical_data.csv", parse_dates=["Date"], index_col="Date")

# Plot candlestick chart
mpf.plot(df, type="candle", style="charles", volume=True, title="AAPL Candlestick Chart")


ValueError: Missing column provided to 'parse_dates': 'Date'