In [1]:
import pyarrow.parquet as pq
import pandas as pd
import re
from tqdm import tqdm  
from transformers import BertTokenizer, pipeline
import torch




In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'   

### Load dataset

In [4]:
filepath = './Dataset/bloomberg_news_quarter.csv'
df = pd.read_csv(filepath)
df = df.drop(df.columns[0], axis=1)
df.rename(columns={'0':'times', '1':'articles'}, inplace=True)
print(df)

             times Quarter_times  \
0       2006-10-20    2006-01-01   
1       2006-10-21    2006-01-01   
2       2006-10-23    2006-01-01   
3       2006-10-23    2006-01-01   
4       2006-10-24    2006-01-01   
...            ...           ...   
448390  2013-11-26    2013-01-01   
448391  2013-11-26    2013-01-01   
448392  2013-11-26    2013-01-01   
448393  2013-11-26    2013-01-01   
448394  2013-11-26    2013-01-01   

                                                 articles  
0       -- Inco's Net Soars on Higher Metal Prices, Br...  
1       -- Jim Cramer: Diageo, Anheuser-Busch, Monster...  
2       -- EU Energy Chief Backs German Plan for Price...  
3       -- Ex-Plant Worker Shuster Pleads Guilty in Tr...  
4       -- Jim Cramer: Bare Escentuals, Allergan, Medi...  
...                                                   ...  
448390  -- Rubber Drops to Two-Week Low After Forecast...  
448391  -- SNB’s Jordan Sees No Reason to Remove Cap o...  
448392  -- U.K. Lawmakers S

In [5]:
txt_length = df['articles'].apply(lambda x: len(x) if pd.notnull(x) else 0)
df['articles_len'] = txt_length

In [6]:
df['articles'].fillna('', inplace=True) 

In [18]:
from torch.utils.data import DataLoader

def Calculate_sentiment_score(text, max_length):
    # Initializing the BERT tokenizer and sentiment analysis model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    sentiment_analysis = pipeline("sentiment-analysis", model="ProsusAI/finbert", batch_size=8, device=device)

    # Split text
    words = text.split()
    subtexts = ['']
    for word in words:
        if len(tokenizer.encode(subtexts[-1] + word)) < max_length:
            subtexts[-1] += word + ' '
        else:
            subtexts.append(word + ' ')

    # Create a DataFrame to store the results
    Sentiment_df = pd.DataFrame(columns=['subtext', 'label', 'score', 'length', 'weighted_score'])

    # Perform sentiment analysis on each text segment
    for subtext in subtexts:
        if subtext.strip():  # Ignore empty text segments
            result = sentiment_analysis(subtext)[0]
            length = len(subtext.split())  # Calculate the length of the text segment (number of words)
            weighted_score = result['score'] * length  # Calculate the weighted score
            new_data = {'subtext': subtext, 'label': result['label'], 'score': result['score'], 'length': length, 'weighted_score': weighted_score}
            new_df = pd.DataFrame([new_data])  # Create a DataFrame containing the new data
            # Use the concat method to add the new DataFrame to the original DataFrame
            df_list = [Sentiment_df, new_df]
            Sentiment_df = pd.concat([df for df in df_list if not df.empty])

    return Sentiment_df

def AVG_Sentiment_score(df):
    total_length = df['length'].sum()
    total_score = 0
    
    for index, row in df.iterrows():
        if row['label'] == 'negative':
            total_score -= row['weighted_score']
        elif row['label'] == 'positive':
            total_score += row['weighted_score']
        
    weighted_average_score = total_score / total_length if total_length > 0 else 0
    
    return weighted_average_score


In [19]:
Sentiment_Scores = []
last_month = None

for times, group in df.groupby('times'):
    current_month = times[5:7]  # Get the current month
    score_perday = []
    if current_month != last_month:
        print("The current month is different from the previous month")
        # Save sentiment scores to a local CSV file by month
        file_name = f"./FinBERT/sentiment_scores_{pd.to_datetime(times).strftime('%Y-%m-%d')}.csv"
        Sentiment_Scores_df = pd.DataFrame(Sentiment_Scores, columns=['sentiment_scores','times'])
        Sentiment_Scores_df.to_csv(file_name)
        print("Saved")
        
        # Reset the Sentiment_Scores list to store scores for the next month
        Sentiment_Scores = []
        last_month = current_month  # Save the current month as the last month
        
    for article in group['articles']:        
        max_length = 500
        Sentiment_df = Calculate_sentiment_score(article, max_length)  # Calculate sentiment scores for each article
        print(Sentiment_df)
        weighted_average_score = AVG_Sentiment_score(Sentiment_df)  # Calculate the weighted average score for each article
        score_perday.append(weighted_average_score) 
        print(f"Time: {times}, Weighted Average Score: {weighted_average_score}")
    # Append the list of sentiment scores for a day
    Sentiment_Scores.append([score_perday,times])


当前月份与上一个月份不同
Saved
                                             subtext     label     score  \
0  -- Inco's Net Soars on Higher Metal Prices, Br...  positive  0.792133   
0  Vale. Inco's nickel production climbed to 125 ...  negative  0.960954   
0  Chicago at dcrofts@bloomberg.net . To contact ...   neutral  0.931295   

   length  weighted_score  
0     313      247.937658  
0     374      359.396688  
0      16       14.900726  
Time: 2006-10-20, Weighted Average Score: -0.15854769614478773
                                             subtext    label     score  \
0  -- Jim Cramer: Diageo, Anheuser-Busch, Monster...  neutral  0.756490   
0  trends are positive. He said to buy shares of ...  neutral  0.909033   

   length  weighted_score  
0     301      227.703631  
0     154      139.991156  
Time: 2006-10-21, Weighted Average Score: 0.0
                                             subtext     label     score  \
0  -- EU Energy Chief Backs German Plan for Price...  negative  0.817

KeyboardInterrupt: 