In [5]:
import yfinance as yf
import pandas as pd
from transformers import pipeline
import matplotlib.pyplot as plt

# Step 1: Fetch Stock Data (e.g., Apple)
symbol = 'AAPL'
data = yf.download(symbol, start="2022-01-01", end="2024-01-01")

# Step 2: Load Hugging Face Sentiment Analysis Pipeline
sentiment_analyzer = pipeline("sentiment-analysis")

# Step 3: Example Financial News Headlines (Replace with actual API calls or datasets)
# Here we are simulating the news headlines
headlines = [
    "Apple stock rises as quarterly earnings beat estimates",
    "Apple launches new iPhone model, boosting investor confidence",
    "Apple faces decline in quarterly revenue amid global chip shortage",
    "Apple stock drops as market reacts to CEO's controversial comments",
    "Apple sees strong growth in services, offsetting hardware sales decline"
]

# Step 4: Analyze Sentiment of the News Headlines
sentiments = [sentiment_analyzer(headline)[0]['label'] for headline in headlines]
sentiments

[*********************100%***********************]  1 of 1 completed
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


['POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE']

In [None]:
# !pip install torch


<!-- !pip install tensorflow -->


In [7]:

# Map sentiment labels to numerical values (Positive: 1, Negative: -1, Neutral: 0)
sentiment_map = {'POSITIVE': 1, 'NEGATIVE': -1, 'NEUTRAL': 0}
sentiment_scores = [sentiment_map[sentiment] for sentiment in sentiments]

# Step 5: Simulate Sentiment-Based Stock Trading Strategy
# Here, we'll use sentiment as a simple indicator to buy (1), sell (-1), or hold (0)
# For simplicity, we assume the sentiment on the 5th day after the news impact stock price.

data['Sentiment'] = pd.Series(sentiment_scores * (len(data) // len(sentiment_scores)), index=data.index[:len(data)])

# Step 6: Apply a Moving Average (SMA) strategy to simulate a quant strategy
data['SMA50'] = data['Close'].rolling(window=50).mean()
data['SMA200'] = data['Close'].rolling(window=200).mean()

# Define Buy (1) or Sell (-1) based on the crossing of SMA50 and SMA200
data['Signal'] = 0
data.loc[data['SMA50'] > data['SMA200'], 'Signal'] = 1  # Buy signal
data.loc[data['SMA50'] < data['SMA200'], 'Signal'] = -1  # Sell signal

# Step 7: Combine Sentiment and SMA Signal for trading strategy
data['Strategy Signal'] = data['Signal'] * data['Sentiment']

# Step 8: Calculate the daily returns based on the strategy signals
data['Daily Return'] = data['Close'].pct_change()
data['Strategy Return'] = data['Strategy Signal'].shift(1) * data['Daily Return']  # Shift the signal

# Step 9: Plot the Cumulative Returns for Stock vs Strategy
data['Cumulative Stock Return'] = (1 + data['Daily Return']).cumprod()
data['Cumulative Strategy Return'] = (1 + data['Strategy Return']).cumprod()

plt.figure(figsize=(12, 6))
plt.plot(data['Cumulative Stock Return'], label='Stock Cumulative Return', color='blue')
plt.plot(data['Cumulative Strategy Return'], label='Strategy Cumulative Return', color='red')
plt.title(f'{symbol} Cumulative Returns: Stock vs Strategy with Sentiment and SMA')
plt.xlabel('Date')
plt.ylabel('Cumulative Return')
plt.legend()
plt.show()

# Print final cumulative returns
print(f"Final Stock Cumulative Return: {data['Cumulative Stock Return'].iloc[-1]:.2f}")
print(f"Final Strategy Cumulative Return: {data['Cumulative Strategy Return'].iloc[-1]:.2f}")


ValueError: Length of values (500) does not match length of index (501)

In [12]:
from transformers import BertTokenizer, BertModel
import torch
import faiss
import numpy as np

# Initialize the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings for a text
def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the mean of all token embeddings (pooling)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Example documents to index
documents = [
    "Artificial intelligence is the new electricity.",
    "Machine learning is a subset of AI.",
    "BERT is a transformer-based model for NLP tasks.",
    "Deep learning powers most AI applications today."
]

# Step 1: Get embeddings for the documents
doc_embeddings = np.array([get_bert_embeddings(doc) for doc in documents])

# Check the dimensions of doc_embeddings to ensure the length matches
print("Document embeddings shape:", doc_embeddings.shape)

# Ensure there are no discrepancies in the embeddings length
if len(doc_embeddings) != len(documents):
    raise ValueError(f"Embedding length mismatch: {len(doc_embeddings)} embeddings, but {len(documents)} documents.")

# Step 2: Index the embeddings using FAISS
index = faiss.IndexFlatL2(doc_embeddings.shape[1])  # Create the FAISS index
index.add(doc_embeddings)  # Add embeddings to the index

# Step 3: Query processing
query = "What is AI?"

# Get the query embedding
query_embedding = get_bert_embeddings(query).reshape(1, -1)

# Step 4: Perform similarity search
k = 3  # Number of results to retrieve
D, I = index.search(query_embedding, k)  # D is the distances, I is the indices of the closest documents

# Step 5: Display the results
print("Query:", query)
print("Top 3 most relevant documents:")
for i in I[0]:
    print(f"- {documents[i]} (Similarity score: {D[0][i]:.4f})")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Document embeddings shape: (4, 768)
Query: What is AI?
Top 3 most relevant documents:
- Artificial intelligence is the new electricity. (Similarity score: 62.5785)


IndexError: index 3 is out of bounds for axis 0 with size 3