In [None]:
# Import libraries
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [None]:
# Read the news data (SP500_all.csv from the 'masterthesis-00-datagathering' Kaggle input).
# Earlier DJIA inputs, kept for reference only:
# df_news = pd.read_csv('/kaggle/input/financialnews/Combined_News_DJIA.csv')
# df_stock = pd.read_csv('/kaggle/input/financialnews/upload_DJIA_table.csv')
df_news = pd.read_csv('/kaggle/input/masterthesis-00-datagathering/SP500_all.csv')

In [None]:
# check data
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None".
df_news.info()

In [None]:
# Keep only the two columns used in the rest of the notebook: the date
# (grouping key later on) and the raw headline text. Every other column of the
# loaded file is discarded here.
df_news = df_news[['date','headline']]

In [None]:
# Adjust display settings

# Show full cell contents instead of truncating long headlines
pd.set_option('display.max_colwidth', None)
# Do not wrap wide frames across multiple printed blocks
pd.set_option('display.expand_frame_repr', False)

# Get 5 random samples; the fixed random_state makes the sample reproducible
display(df_news.sample(n=5, random_state=42))

In [None]:
# Replace missing values with empty strings so every headline is a str
df_news = df_news.fillna('')

In [None]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources used below: the stop-word list and the WordNet data.
# NOTE(review): 'omw-1.4' is presumably required by the WordNet lemmatizer in
# recent NLTK versions - confirm before removing it.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
import re
import html

# Initialize the lemmatizer (used by preprocess_text in "advanced" mode)
lemmatizer = WordNetLemmatizer()

# Define the stop words: NLTK's English list, stored as a set for fast lookups
stop_words = set(stopwords.words('english'))

# Define a function to preprocess the headlines
def preprocess_text(text, mode="simple"):
    """
    Clean a single headline before sentiment scoring.

    Applied in every mode (letter case is preserved):
      * HTML entities are decoded (e.g. ``&amp;`` -> ``&``).
      * A leading byte-string prefix (``b'`` or ``b"``) is stripped.
      * Line breaks become a single space so adjacent words stay separated.
      * Backslashes, URLs, @mentions and digits are removed.

    Additionally, in "advanced" mode punctuation is removed, English stop
    words are dropped (compared case-insensitively) and the remaining tokens
    are lemmatized and re-joined with single spaces.

    Args:
        text (str): The raw headline. Non-string values (e.g. None or NaN)
            yield an empty string instead of raising.
        mode (str): "advanced" for the full pipeline; any other value runs
            only the basic cleaning steps.

    Returns:
        str: The cleaned headline.
    """
    # Missing values reach this function as None / float NaN; treat them as empty text.
    if not isinstance(text, str):
        return ""

    # Remove html entities
    text = html.unescape(text)

    # Strip the byte-string prefix only at the very start of the text; an
    # unanchored pattern would also eat the b' inside words such as "Bob's".
    text = re.sub(r"^b['\"]", "", text)

    # Turn line breaks into a space (deleting them would glue words together)
    text = re.sub(r"[\r\n]+", " ", text)

    # Drop stray backslashes left over from escaped sequences
    text = text.replace("\\", "")

    # Remove URLs ("http\S+" already covers https links)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove mentions
    text = re.sub(r'@\w+', '', text)

    # Remove numeric values
    text = re.sub(r'\d+', '', text)

    if mode == "advanced":
        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)

        # Remove stop words and lemmatize the rest. The stop-word list is
        # lower-case while the text keeps its case, so compare on word.lower().
        tokens = [
            lemmatizer.lemmatize(word)
            for word in text.split()
            if word.lower() not in stop_words
        ]

        # Join the words back into a single string
        text = ' '.join(tokens)

    return text

In [None]:
# Clean every headline in "simple" mode and store the result next to the raw
# text in a new 'headline_preprocessed' column.
df_news['headline_preprocessed'] = df_news['headline'].apply(
    lambda headline: preprocess_text(headline, mode="simple")
)

In [None]:
# Print column names, dtypes and non-null counts to confirm that the
# 'headline_preprocessed' column is now present
df_news.info()

In [None]:
# from transformers import BertTokenizer, BertForSequenceClassification
# from transformers import pipeline

# finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
# tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer, top_k = 1)

# sentences = df_news['headline_preprocessed'][:10].tolist()
# results = nlp(sentences)
# print(results)  #LABEL_0: neutral; LABEL_1: positive; LABEL_2: negative

# sentences = df_news['headline'][:10].tolist()
# results = nlp(sentences)
# print(results)  #LABEL_0: neutral; LABEL_1: positive; LABEL_2: negative

In [None]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax

# Load the pre-trained FinBERT tone model and its matching tokenizer from Hugging Face
model_name = 'yiyanghkust/finbert-tone'  
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

# Use GPU if available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the chosen device (as the last expression of the cell,
# the notebook also displays the returned model)
model.to(device)

def predict_sentiment_finbert(data, batch_size=32):
    """
    Predicts the sentiment of text data in a Series using FinBERT, with batching.

    Each batch is tokenized, padded to the longest text of that batch only and
    moved to the device on its own. Memory use is therefore bounded by
    ``batch_size`` instead of growing with the size of the whole dataset
    (tokenizing everything up front pads every row to the longest text overall
    and places the full tensor on the device at once).

    Args:
        data (pd.Series): Series containing the text data.
        batch_size (int): The number of samples to process in each batch.

    Returns:
        pd.DataFrame: A DataFrame indexed like ``data`` with the columns
            'headline' (the input text), 'finbert_score' (P(positive) minus
            P(negative), between -1 and 1), 'label' ('Neutral', 'Positive' or
            'Negative') and 'probs' (the three class probabilities).
            An empty ``data`` yields an empty DataFrame.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Class order of the model output - LABEL_0: neutral; LABEL_1: positive; LABEL_2: negative
    sentiment_labels = ['Neutral', 'Positive', 'Negative']

    texts = data.tolist()
    scores = []
    labels = []
    prob_score = []

    # Process text in batches
    for start in range(0, len(texts), batch_size):
        # Tokenize only the current batch; padding=True pads to the longest
        # sequence in this batch, truncation caps inputs at BERT's 512 tokens.
        encoded = tokenizer(texts[start : start + batch_size],
                            padding=True,
                            truncation=True,
                            return_tensors="pt",
                            max_length=512)
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        # Model inference (no gradients needed for prediction)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        # Get probabilities
        probs = softmax(outputs.logits, dim=1).cpu().numpy()

        # Convert to a score between -1 and 1: P(positive) - P(negative)
        scores.extend(probs[:, 1] - probs[:, 2])

        # The most probable class gives the label
        labels.extend(sentiment_labels[idx] for idx in probs.argmax(axis=1))
        prob_score.extend(probs)

    results = pd.DataFrame({
        'headline': data,
        'finbert_score': scores,
        'label': labels,
        'probs': prob_score
    }, index=data.index)

    return results

# Score the raw 'headline' column (not the preprocessed one) with FinBERT
df_results = predict_sentiment_finbert(df_news['headline'])

In [None]:
# Attach the FinBERT score to the news frame (rows are matched on the index).
# Assigning the column keeps this cell idempotent: re-running it overwrites
# 'finbert_score' instead of appending a second column with the same name,
# which is what a repeated pd.concat(..., axis=1) would do.
df_news['finbert_score'] = df_results['finbert_score']

In [None]:
# Preview the first 10 rows of the news frame
df_news.head(10)

In [None]:
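# NOTE: superseded draft of predict_sentiment_finbert, kept for reference only.
# It assumes the label order 0: negative, 1: neutral, 2: positive, which does not
# match the order used by the active implementation (0: neutral, 1: positive,
# 2: negative), and it does not pass an attention mask to the model.
# import torch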
# import pandas as pd
# from transformers import BertTokenizer, BertForSequenceClassification
# from torch.nn.functional import softmax

# # Load pre-trained model and tokenizer from Hugging Face
# model_name = 'yiyanghkust/finbert-tone'  # Example FinBERT model, adjust as needed
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForSequenceClassification.from_pretrained(model_name)

# # Use GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# def predict_sentiment_finbert(data, batch_size=32):
#     """
#     Predicts the sentiment of text data in a DataFrame using FinBERT, with batching.
    
#     Args:
#         data (pd.Series): Series containing the text data.
#         batch_size (int): The number of samples to process in each batch.
        
#     Returns:
#         pd.Series: A series of sentiment scores between -1 and 1.
#     """
#     scores = []

#     # Tokenize text and convert to input format for BERT
#     input_ids = tokenizer(data.tolist(), 
#                           padding=True, 
#                           truncation=True, 
#                           return_tensors="pt", 
#                           max_length=512)['input_ids']
    
#     # Move to device
#     input_ids = input_ids.to(device)

#     # Process text in batches
#     for i in range(0, len(input_ids), batch_size):
#         inputs = input_ids[i : i + batch_size]

#         # Model inference
#         with torch.no_grad():
#             outputs = model(inputs)
        
#         # Get probabilities
#         probs = softmax(outputs.logits, dim=1).cpu().numpy()

#         # Convert to a score between -1 and 1
#         sentiment_scores = probs[:, 2] - probs[:, 0]  # Assuming 0: negative, 1: neutral, 2: positive
        
#         scores.extend(sentiment_scores)

#     return pd.Series(scores, index=data.index)


# # Call the function and add the resulting scores to df_news
# df_news['finbert_score'] = predict_sentiment_finbert(df_news['headline'])

In [None]:
pip install vaderSentiment

In [None]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def predict_sentiment_vader(dataframe, text_column):
    """
    Score one text column of a DataFrame with VADER.

    Args:
        dataframe (pd.DataFrame): Frame that holds the texts to score.
        text_column (str): Name of the column containing those texts.

    Returns:
        pd.Series: VADER's 'compound' score for every row, indexed like
            the input column.
    """
    vader = SentimentIntensityAnalyzer()

    def compound_score(text):
        # polarity_scores returns a dict of scores; only 'compound' is kept
        return vader.polarity_scores(text)['compound']

    return dataframe[text_column].apply(compound_score)


# Score the preprocessed headlines with VADER and store the compound scores
# in a new 'vader_score' column of df_news
df_news['vader_score'] = predict_sentiment_vader(df_news, 'headline_preprocessed')


In [None]:
# Display the news frame, which now also holds the 'vader_score' column
df_news

In [None]:
# Save the per-headline sentiment scores. With to_csv's default index=True the
# row index is written as well, as an unnamed first column.
df_news.to_csv("sentiment.csv")

In [None]:
# Load the stock data and discard its 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv call - verify against the source file)
df_stock = pd.read_csv(
    '/kaggle/input/masterthesis-00-datagathering/SP500_stock.csv'
).drop(columns=['Unnamed: 0'])

# Collapse the per-headline scores to one mean value per day
df_news = df_news.groupby('date')[['finbert_score', 'vader_score']].mean()

# Attach the daily sentiment to the stock rows that share the same date
df = df_news.merge(df_stock, on="date")

df.to_csv("stock_sentiment.csv")