In [41]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification

In [42]:
# The classifier weights and the vocabulary are both loaded from this one checkpoint name.
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'

# Tokenizer used to encode the headlines for the model.
tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
# Sequence classifier configured with a three-way output (num_labels=3).
finbert = BertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT, num_labels=3)

In [43]:

# Load the headline/price table. Forward slashes keep this relative path valid
# on Windows, Linux and macOS alike; a backslash-separated path is only
# interpreted as nested directories on Windows.
df = pd.read_csv('../data/date_price_tick.csv')
# Keep a single row per distinct headline so every title is scored once.
df = df.drop_duplicates(subset='title')
# Plain Python list of headline strings, in the same order as the rows of df.
sentences = df['title'].tolist()


In [44]:
# Display the de-duplicated table (pandas renders a truncated head/tail preview).
df

Unnamed: 0,title,Adjusted Close,Date
0,Tech Stocks And FAANGS Strong Again To Start D...,86.856926,10-06-2020
1,10 Biggest Price Target Changes For Wednesday,86.856926,10-06-2020
2,"Benzinga Pro's Top 5 Stocks To Watch For Wed.,...",86.856926,10-06-2020
3,"Deutsche Bank Maintains Buy on Apple, Raises P...",86.856926,10-06-2020
4,Apple To Let Users Trade In Their Mac Computer...,86.856926,10-06-2020
...,...,...,...
464,"UBS Maintains Buy on Apple, Lowers Price Targe...",70.051155,10-03-2020
465,123 Biggest Movers From Yesterday,70.051155,10-03-2020
466,Crude Awakening: Energy Sector Takes A 20% Spi...,65.344910,09-03-2020
467,Investor Movement Index Summary: February 2020,65.344910,09-03-2020


In [45]:
# Tokenize every headline in one batch; padding=True pads to the longest
# headline so the encoded inputs stack into rectangular PyTorch tensors.
inputs = tokenizer(sentences, return_tensors="pt", padding=True)

# Inference only: disabling autograd avoids recording a gradient graph for the
# whole batch, which lowers peak memory. Element [0] of the model output is
# what the following code treats as the per-class scores for each headline.
with torch.no_grad():
    outputs = finbert(**inputs)[0]

# Define a softmax function
def softmax(x):
    """Convert raw scores into probabilities along the last axis.

    Args:
        x: Array-like of scores. A 1-D input is treated as one score vector;
            for higher-rank input every slice along the last axis is
            normalised independently, so a (batch, classes) array yields one
            probability distribution per row.

    Returns:
        numpy.ndarray with the same shape as ``x`` whose entries are
        non-negative and sum to 1 along the last axis.
    """
    scores = np.asarray(x)
    # Subtracting the per-row maximum does not change the result but keeps
    # np.exp from overflowing on large scores.
    shifted = scores - scores.max(axis=-1, keepdims=True)
    exp_x = np.exp(shifted)
    # keepdims=True lets the row sums broadcast back over each row.
    return exp_x / exp_x.sum(axis=-1, keepdims=True)

# Label names keyed by class index; the order is assumed to follow the
# checkpoint's output order (confirm against the model card).
labels = {0: 'neutral', 1: 'positive', 2: 'negative'}

# Detach the scores from the autograd graph and view them as a NumPy array
# once, instead of repeating the conversion for every headline.
score_matrix = outputs.detach().numpy()

# One record per headline: the title, then each class probability rendered
# as a percentage string with two decimals.
data = [
    [headline] + [f'{share * 100:.2f}%' for share in softmax(score_matrix[row])]
    for row, headline in enumerate(sentences)
]

# Header: the title column followed by one "<label> Probability" column per class
columns = ['title'] + [f'{name} Probability' for name in labels.values()]

# Assemble the per-headline sentiment table
new_df = pd.DataFrame(data, columns=columns)


In [46]:
# Display the per-headline probability table (title plus one percentage column per label).
new_df 

Unnamed: 0,title,neutral Probability,positive Probability,negative Probability
0,Tech Stocks And FAANGS Strong Again To Start D...,0.00%,100.00%,0.00%
1,10 Biggest Price Target Changes For Wednesday,100.00%,0.00%,0.00%
2,"Benzinga Pro's Top 5 Stocks To Watch For Wed.,...",100.00%,0.00%,0.00%
3,"Deutsche Bank Maintains Buy on Apple, Raises P...",0.00%,100.00%,0.00%
4,Apple To Let Users Trade In Their Mac Computer...,100.00%,0.00%,0.00%
...,...,...,...,...
458,"UBS Maintains Buy on Apple, Lowers Price Targe...",0.15%,18.16%,81.69%
459,123 Biggest Movers From Yesterday,30.92%,10.07%,59.01%
460,Crude Awakening: Energy Sector Takes A 20% Spi...,0.08%,0.00%,99.92%
461,Investor Movement Index Summary: February 2020,100.00%,0.00%,0.00%


In [47]:
# Re-display the source table. Its index still carries the row labels from
# before drop_duplicates (it runs up to 467), whereas new_df has a fresh
# 0-based index, so the two frames are not index-aligned.
df

Unnamed: 0,title,Adjusted Close,Date
0,Tech Stocks And FAANGS Strong Again To Start D...,86.856926,10-06-2020
1,10 Biggest Price Target Changes For Wednesday,86.856926,10-06-2020
2,"Benzinga Pro's Top 5 Stocks To Watch For Wed.,...",86.856926,10-06-2020
3,"Deutsche Bank Maintains Buy on Apple, Raises P...",86.856926,10-06-2020
4,Apple To Let Users Trade In Their Mac Computer...,86.856926,10-06-2020
...,...,...,...
464,"UBS Maintains Buy on Apple, Lowers Price Targe...",70.051155,10-03-2020
465,123 Biggest Movers From Yesterday,70.051155,10-03-2020
466,Crude Awakening: Energy Sector Takes A 20% Spi...,65.344910,09-03-2020
467,Investor Movement Index Summary: February 2020,65.344910,09-03-2020


In [48]:
# Attach the sentiment columns of new_df to the price rows of df, matching on
# the shared 'title' column and keeping every row of df (left join).
df_total = pd.merge(df, new_df, how='left', on='title')

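# No further de-duplication on 'title' is performed here: df was already de-duplicated on 'title' when it was loaded, and new_df holds one row per entry of that same title list.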



In [50]:
# Write df_total to CSV without the pandas index column. Forward slashes keep
# this relative path valid on Windows, Linux and macOS alike; with backslashes
# a non-Windows system would treat the whole string as a single file name.
df_total.to_csv('../data/date_price_tick_sentiment.csv', index=False)



ValueError: Can only compare identically-labeled Series objects