# Part 1-1: FinBERT Sentiment Analysis

In [None]:
import pandas as pd

# Load the CSV file
# NOTE(review): the path is relative, so this presumably expects the notebook
# to be run from the project root — confirm.
file_path = './data/original_datasets/NEWS_YAHOO_stock_prediction.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
data.head()

In [None]:
# Preview the date, category and article text of the first ten rows.
data[['Date', 'category', 'content']].head(10)

In [None]:
# Step 1: Remove the unnamed index column left over in the CSV.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

# Step 2: Remove duplicate texts
data = data.drop_duplicates(subset=['title', 'content'])

# Step 3: Remove rows whose 'title' or 'content' is missing, empty or whitespace-only.
# Missing values are dropped first: `.str` methods return NaN for them, and
# negating such a mask with `~` raises a TypeError.
data = data.dropna(subset=['title', 'content'])
has_title = data['title'].str.strip().ne('')
has_content = data['content'].str.strip().ne('')
# .copy() makes `data` an independent frame, so adding columns later does not
# trigger SettingWithCopyWarning.
data = data[has_title & has_content].copy()

# Check the dataframe after these preprocessing steps
data.info()

# Step 4: Check for invalid numeric data
numeric_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
data[numeric_columns].describe()


In [None]:
# (optional) set proxy
import subprocess
import os

# Source the local proxy script in bash and capture the proxy-related
# environment variables it defines.
result = subprocess.run(
    'bash -c "source ~/clash_dir/set && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
)
output = result.stdout

# Copy each captured NAME=value pair into this process's environment.
for env_line in output.splitlines():
    env_name, separator, env_value = env_line.partition('=')
    if separator:
        os.environ[env_name] = env_value

## Method 1: Truncation

In [None]:

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load the FinBERT model and tokenizer
checkpoint = 'yiyanghkust/finbert-tone'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

# Use the first GPU when CUDA is available; otherwise fall back to the CPU
# (device=-1) instead of failing on machines without a GPU.
import torch
device = 0 if torch.cuda.is_available() else -1

# Create a pipeline for sentiment analysis
# Method 1: each text is truncated to its first 512 tokens before classification
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, max_length=512, truncation=True, device=device)

In [None]:
# Sequential (one text at a time) sentiment labelling of a dataframe column
def apply_sentiment_analysis(df, nlp, text_column='content'):
    """
    Label every text in one dataframe column with its predicted sentiment.

    Texts are classified one by one. When classifying a text raises, the
    error is printed and that row is labelled 'Error', so a single bad row
    does not abort the run.

    Args:
    df (pd.DataFrame): Dataframe containing the text data. It is modified in place.
    nlp (pipeline): HuggingFace pipeline for sentiment analysis; called as
        nlp(text) and expected to return a list whose first item has a 'label' key.
    text_column (str): Name of the column containing text data.

    Returns:
    pd.DataFrame: The same dataframe, with a new column 'sentiment' containing the analysis results.
    """
    def classify(text):
        # Best-effort: any failure becomes an 'Error' label for this row.
        try:
            return nlp(text)[0]['label']
        except Exception as e:
            print(f"Error in processing text: {e}")
            return 'Error'

    df['sentiment'] = [classify(text) for text in df[text_column]]
    return df


from tqdm.auto import tqdm
import numpy as np
from concurrent.futures import ThreadPoolExecutor

def apply_sentiment_analysis_parallel(df, nlp, text_column='content', batch_size=10):
    """
    Apply sentiment analysis to a dataframe column using a thread pool.

    The column is cut into consecutive batches of `batch_size` texts and each
    batch is handed to the pool as one task. As in `apply_sentiment_analysis`,
    a text whose classification raises is reported and labelled 'Error', so a
    single bad row does not discard the labels already computed for the rest.

    Args:
    df (pd.DataFrame): Dataframe containing the text data. It is modified in place.
    nlp (pipeline): HuggingFace pipeline for sentiment analysis.
    text_column (str): Name of the column containing text data.
    batch_size (int): Number of texts handled by one worker task.

    Returns:
    pd.DataFrame: The same dataframe, with a new column 'sentiment' containing the analysis results.
    """
    def classify(text):
        # Same best-effort policy as the sequential version.
        try:
            return nlp(text)[0]['label']
        except Exception as e:
            print(f"Error in processing text: {e}")
            return 'Error'

    # Define a function to process a batch of texts
    def process_batch(texts):
        return [classify(text) for text in texts]

    # Break the texts into batches
    batches = [df[text_column][i:i + batch_size] for i in range(0, len(df), batch_size)]

    # Process batches in parallel. executor.map yields results in submission
    # order, so the labels stay aligned with the dataframe rows.
    sentiments = []
    with ThreadPoolExecutor() as executor:
        for batch_result in tqdm(executor.map(process_batch, batches), total=len(batches)):
            sentiments.extend(batch_result)

    # Add the sentiments as a new column in the dataframe
    df['sentiment'] = sentiments
    return df

In [None]:
# Step 1: Apply sentiment analysis to the dataset
data = apply_sentiment_analysis_parallel(data, nlp)

# Step 2: Prepare data for the prediction model
# We might want to convert sentiments to numerical values for model training.
# Labels that are not keys of the mapping become NaN in 'sentiment_numeric'.
sentiment_mapping = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
data['sentiment_numeric'] = data['sentiment'].map(sentiment_mapping)

In [None]:
# Save the dataframe labelled with the truncation method (Method 1).
# index=False: the row index is not written to the file.
data.to_csv('./data/FinBERT_sentiment/dataset_with_sentiment.csv', index=False)

## Method 2: Sliding Window

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load the FinBERT model and tokenizer
checkpoint = 'yiyanghkust/finbert-tone'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

# Use the first GPU when CUDA is available; otherwise fall back to the CPU
# (device=-1) instead of failing on machines without a GPU.
import torch
device = 0 if torch.cuda.is_available() else -1

# Create a pipeline for sentiment analysis
# Method 2: texts are cut into windows before they reach the pipeline, so the
# truncation settings below are only a safety net. A window that is decoded to
# a string and tokenized again can come out a few tokens longer, and an input
# above 512 tokens is presumably beyond what this BERT checkpoint accepts.
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, max_length=512, truncation=True, device=device)

In [None]:
# 滑动窗口方法
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from tqdm.auto import tqdm
# from tqdm import tqdm

# Split a text into chunks by token count
def sliding_window(text, max_len, overlap, tokenizer):
    """
    Split a text into overlapping chunks of at most `max_len` tokens.

    Consecutive chunks start `max_len - overlap` tokens apart. Splitting stops
    as soon as a chunk reaches the end of the text, so no trailing chunk made
    only of tokens already covered by the previous chunk is produced.

    Args:
    text (str): the text to be split into chunks
    max_len (int): the maximum length of each chunk, in tokens
    overlap (int): the number of overlapped tokens between chunks
    tokenizer: the tokenizer used to tokenize the text; must provide
        tokenize() and convert_tokens_to_string()

    Returns:
    list of str: the list of text chunks (empty when the text has no tokens)

    Raises:
    ValueError: if max_len is not positive or overlap is not in [0, max_len)
    """
    # A non-positive step would either fail inside range() or silently yield
    # no chunks; a negative overlap would silently skip tokens.
    if max_len <= 0 or not 0 <= overlap < max_len:
        raise ValueError(
            f"require max_len > 0 and 0 <= overlap < max_len, got max_len={max_len}, overlap={overlap}"
        )

    tokens = tokenizer.tokenize(text)
    step = max_len - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(tokenizer.convert_tokens_to_string(tokens[start:start + max_len]))
        # This window already reaches the last token: later windows would only
        # repeat tokens from the overlap region.
        if start + max_len >= len(tokens):
            break
    return chunks

def process_batch(texts, nlp, max_len, overlap, tokenizer):
    """
    Classify each text by weighted voting over its sliding-window chunks.

    Every chunk of a text is classified separately; each chunk votes for its
    label with a weight equal to the chunk's length in characters, and the
    label with the largest total weight becomes the text's sentiment.

    Args:
    texts (iterable of str): the texts to classify
    nlp (pipeline): HuggingFace pipeline for sentiment analysis
    max_len (int): the maximum length of each chunk, in tokens
    overlap (int): the number of overlapped tokens between chunks
    tokenizer: the tokenizer used to split the texts into chunks

    Returns:
    list of str: one sentiment label per input text, in input order
    """
    sentiments = []
    for text in texts:
        # Apply sliding window to the text. A text that yields no chunks
        # (e.g. an empty string) is classified as a whole; otherwise the vote
        # below would have no entries and most_common(1)[0] would raise IndexError.
        text_chunks = sliding_window(text, max_len, overlap, tokenizer) or [text]

        # Classify each chunk and accumulate its weighted vote
        sentiment_counter = Counter()
        for chunk in text_chunks:
            sentiment = nlp(chunk)[0]['label']
            # Use the length of the chunk (in characters) as the weight
            sentiment_counter[sentiment] += len(chunk)

        # The label with the highest total weight wins
        final_sentiment = sentiment_counter.most_common(1)[0][0]
        sentiments.append(final_sentiment)
    return sentiments


def apply_sentiment_analysis_parallel(df, nlp, tokenizer, text_column='content', max_len=500, overlap=50, num_workers=16, batch_size=10):
    """
    Label a dataframe column with sentiments using the sliding window method.

    The column is cut into consecutive batches, each batch is processed by
    `process_batch` on a thread pool, and the labels are collected in input order.

    Args:
    df (pd.DataFrame): Dataframe containing the text data. It is modified in place.
    nlp (pipeline): HuggingFace pipeline for sentiment analysis.
    tokenizer: the tokenizer used to tokenize the text
    text_column (str): Name of the column containing text data.
    max_len (int): The maximum length of each text chunk.
    overlap (int): The number of overlapped tokens between chunks.
    num_workers (int): The number of threads to use for parallel processing.
    batch_size (int): The number of texts to process in each batch.

    Returns:
    pd.DataFrame: The same dataframe, with a new column 'sentiment' containing the analysis results.
    """
    def run_batch(batch):
        # Bind the fixed arguments so the pool only has to supply the batch.
        return process_batch(batch, nlp, max_len, overlap, tokenizer)

    # Consecutive slices of the text column, batch_size rows each
    text_batches = [df[text_column][start:start + batch_size] for start in range(0, len(df), batch_size)]

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        progress = tqdm(executor.map(run_batch, text_batches), total=len(text_batches))
        # Flatten the per-batch label lists, keeping row order
        sentiments = [label for batch_result in progress for label in batch_result]

    # Store the labels as a new column in the dataframe
    df['sentiment'] = sentiments
    return df

In [None]:
# Step 1: Apply sentiment analysis to the dataset (sliding window method)
data = apply_sentiment_analysis_parallel(data, nlp, tokenizer)

# Step 2: Prepare data for the prediction model
# We might want to convert sentiments to numerical values for model training.
# Labels that are not keys of the mapping become NaN in 'sentiment_numeric'.
sentiment_mapping = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
data['sentiment_numeric'] = data['sentiment'].map(sentiment_mapping)

In [None]:
# Save the dataframe labelled with the sliding window method (Method 2).
# The file name matches the one the later reload cell reads
# ('dataset_with_sentiment_sliding_window.csv'); the previous name
# 'dataset_sliding_window.csv' was never read back.
data.to_csv('./data/FinBERT_sentiment/dataset_with_sentiment_sliding_window.csv', index=False)

**Group by days**
---

In [None]:
# Let pandas show up to 200 characters per cell when it renders a dataframe
pd.set_option('display.max_colwidth', 200)

# Draw ten random rows, keeping only the text and its predicted sentiment
sample_data = data.sample(n=10)[['content', 'sentiment']]

# Print one readable record per sampled row, followed by a blank line
for index, row in sample_data.iterrows():
    print(f"Sample {index}:\nContent: {row['content']}\nSentiment: {row['sentiment']}\n")


In [None]:
# Expects 'data' to have both a 'sentiment' and a 'label' column.
# Percentage share of every sentiment category and of every label value
sentiment_counts = data['sentiment'].value_counts(normalize=True).mul(100)
label_counts = data['label'].value_counts(normalize=True).mul(100)

# Row-normalised cross-tabulation: for each sentiment, the percentage of each label
crosstab = pd.crosstab(data['sentiment'], data['label'], normalize='index').mul(100)

# Print the three tables, each under its heading
for heading, table in (
    ("Sentiment Distribution (%):", sentiment_counts),
    ("\nLabel Distribution (%):", label_counts),
    ("\nCross-Tabulation of Sentiment and Label (%):", crosstab),
):
    print(heading)
    print(table)

In [None]:
# Reload a previously saved, sentiment-labelled dataset from CSV
import pandas as pd

# Choose which labelled dataset to aggregate. Previously both files were read
# one after the other and the first result was overwritten immediately, so the
# truncation file was loaded (and had to exist) for nothing.
USE_SLIDING_WINDOW = True
sentiment_csv = (
    './data/FinBERT_sentiment/dataset_with_sentiment_sliding_window.csv'  # sliding window
    if USE_SLIDING_WINDOW
    else './data/FinBERT_sentiment/dataset_with_sentiment.csv'  # truncation
)
data = pd.read_csv(sentiment_csv)

# Convert the 'Date' column to datetime format and sort the dataframe by 'Date'
data['Date'] = pd.to_datetime(data['Date'])
data_sorted = data.sort_values(by='Date')

In [None]:
# Split the articles by category once; both tables below are built from these subsets
news_rows = data_sorted[data_sorted['category'] == 'news']
opinion_rows = data_sorted[data_sorted['category'] == 'opinion']

# Group by 'Date' and 'sentiment' and count the articles of each sentiment per day,
# separately for the 'news' and the 'opinion' category (missing combinations count as 0)
category_news_per_day_sentiment = news_rows.groupby(['Date', 'sentiment']).size().unstack().fillna(0)
category_opinion_per_day_sentiment = opinion_rows.groupby(['Date', 'sentiment']).size().unstack().fillna(0)
# Total number of news and of opinion articles per day
category_news_total_per_day_sentiment = news_rows.groupby(['Date']).size()
category_opinion_total_per_day_sentiment = opinion_rows.groupby(['Date']).size()

# Index by date so each row can look up its day's counts, then attach
# count / total as the daily proportion for every (category, sentiment) pair
data_sorted = data_sorted.set_index('Date')
row_dates = data_sorted.index
for column, day_counts, day_totals, label in (
    ('P_news_pos', category_news_per_day_sentiment, category_news_total_per_day_sentiment, 'Positive'),
    ('P_news_neg', category_news_per_day_sentiment, category_news_total_per_day_sentiment, 'Negative'),
    ('P_op_pos', category_opinion_per_day_sentiment, category_opinion_total_per_day_sentiment, 'Positive'),
    ('P_op_neg', category_opinion_per_day_sentiment, category_opinion_total_per_day_sentiment, 'Negative'),
):
    data_sorted[column] = day_counts[label].reindex(row_dates) / day_totals.reindex(row_dates)
data_sorted = data_sorted.reset_index()

In [None]:
# One row per date (the last entry of each column within the day), plus the
# following row's 'Open' as 'Next_Open'. The final date has no following row,
# so it is dropped.
daily_data = (
    data_sorted.groupby('Date').last()
    .assign(Next_Open=lambda frame: frame['Open'].shift(-1))
    .iloc[:-1]
)

In [None]:
# Inspect the aggregated daily row for one date
# (raises KeyError if the date is not in daily_data's index).
date_to_query = pd.to_datetime('2016-10-28')
daily_data.loc[(date_to_query)]

In [None]:
# Show every article-level row of data_sorted for the same date.
date_to_query = pd.to_datetime('2016-10-28')
data_sorted.loc[data_sorted['Date'] == date_to_query]

In [None]:
# Inspect the aggregated daily row for another date
# (raises KeyError if the date is not in daily_data's index).
date_to_query = pd.to_datetime('2020-01-23')
daily_data.loc[(date_to_query)]

In [None]:
# Show the article-level rows of one category on one date.
date_to_query = pd.to_datetime('2018-05-06')
category_to_query = 'news'
data_sorted.loc[(data_sorted['Date'] == date_to_query) & (data_sorted['category'] == category_to_query)]

In [None]:
# A day without articles of a category has NaN proportions; treat them as 0.
# The filled frame is assigned back instead of calling
# `daily_data[col].fillna(..., inplace=True)`: that chained in-place form acts
# on a temporary column object, emits a warning on recent pandas versions and
# leaves daily_data unchanged when Copy-on-Write is enabled.
proportion_columns = ['P_news_neg', 'P_news_pos', 'P_op_neg', 'P_op_pos']
daily_data = daily_data.fillna(dict.fromkeys(proportion_columns, 0))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the AAPL price history and parse its 'Date' column as datetimes
historical_data = pd.read_csv('data/original_datasets/AAPL_Yahoo_Correct.csv')
historical_data['Date'] = pd.to_datetime(historical_data['Date'])

# Line chart of the 'Open' price against the 'Date'
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(historical_data['Date'], historical_data['Open'], label='AAPL Open Price')
ax.set_xlabel('Date')
ax.set_ylabel('Open Price ($)')
ax.set_title('AAPL Stock Open Price Over Time')
ax.legend()
plt.show()


In [None]:
# Attach the four daily sentiment proportions (the columns starting with 'P_')
# from daily_data to every row of historical_data, matching on Date. Dates that
# do not exist in daily_data get 0 in all four columns.
proportion_columns = ['P_news_neg', 'P_news_pos', 'P_op_neg', 'P_op_pos']
daily_data_merged = historical_data.merge(
    daily_data[proportion_columns],
    left_on='Date',
    right_index=True,
    how='left',
)
daily_data_merged[proportion_columns] = daily_data_merged[proportion_columns].fillna(0)

In [None]:
# Line chart of the 'Open' price from the merged dataset against the 'Date'
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(daily_data_merged['Date'], daily_data_merged['Open'], label='AAPL Open Price')
ax.set_xlabel('Date')
ax.set_ylabel('Open Price ($)')
ax.set_title('AAPL Stock Open Price Over Time')
ax.legend()
plt.show()

In [None]:
# Save the merged price + FinBERT sentiment dataset.
# Alternative output path, presumably for the truncation run — confirm:
# daily_data_merged.to_csv('./data/dataset_FinBERT.csv')
# NOTE: no index argument is passed, so the dataframe index is written as an
# extra unnamed first column.
daily_data_merged.to_csv('./data/dataset_FinBERT_sliding_window.csv')

# Part 1-2: VADER Sentiment Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the AAPL price history and parse its 'Date' column as datetimes
historical_data = pd.read_csv('data/original_datasets/AAPL_Yahoo_Correct.csv')
historical_data['Date'] = pd.to_datetime(historical_data['Date'])

# Line chart of the 'Open' price against the 'Date'
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(historical_data['Date'], historical_data['Open'], label='AAPL Open Price')
ax.set_xlabel('Date')
ax.set_ylabel('Open Price ($)')
ax.set_title('AAPL Stock Open Price Over Time')
ax.legend()
plt.show()

In [None]:
# Read the VADER results from CSV.
# NOTE(review): the file is presumably produced by a separate VADER step that
# is not part of this notebook section — confirm.
vader_daily_results = pd.read_csv('./data/VADER_sentiment/combined_data_mean5.csv')

In [None]:
# Parse the dates, keep only the four news/opinion score columns, rename them
# to the P_* names used for the FinBERT dataset, and index the table by date
vader_daily_results['Date'] = pd.to_datetime(vader_daily_results['Date'])
vader_daily_results = (
    vader_daily_results[['Date', 'news_neg', 'news_pos', 'opinion_neg', 'opinion_pos']]
    .rename(columns={
        'news_neg': 'P_news_neg',
        'news_pos': 'P_news_pos',
        'opinion_neg': 'P_op_neg',
        'opinion_pos': 'P_op_pos',
    })
    .set_index('Date')
)

# Attach the scores to every row of historical_data by Date; dates without
# VADER results get 0 in all four columns
proportion_columns = ['P_news_neg', 'P_news_pos', 'P_op_neg', 'P_op_pos']
daily_data_merged = historical_data.merge(
    vader_daily_results[proportion_columns],
    left_on='Date',
    right_index=True,
    how='left',
)
daily_data_merged[proportion_columns] = daily_data_merged[proportion_columns].fillna(0)

In [None]:
# Preview the first ten rows of the merged price + VADER dataset.
daily_data_merged.head(10)

In [None]:
# Save the merged price + VADER dataset.
# NOTE: no index argument is passed, so the dataframe index is written as an
# extra unnamed first column.
daily_data_merged.to_csv('./data/dataset_VADER.csv')