In [28]:
import pandas as pd 
# Never truncate long string cells when a frame is displayed (None = no width limit)
pd.set_option('max_colwidth', None)
# Let pandas render up to 10000 rows before it abbreviates the display
pd.options.display.max_rows = 10000
# BERT tokenizer / sequence-classification classes and the high-level pipeline helper
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
import torch
from torch.nn.functional import softmax

In [4]:
file = './BetterCleaned.csv'
# Read the CSV, parse created_at as a date, then order the rows by that date.
df = (
    pd.read_csv(file, low_memory=False)
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at'], format='%Y-%m-%d'))
    .sort_values('created_at')
)

In [9]:
# Pull the pre-trained tokenizer and sequence-classification model from one checkpoint id
finbert_checkpoint = 'ProsusAI/finbert'
tokenizer = BertTokenizer.from_pretrained(finbert_checkpoint)
model = BertForSequenceClassification.from_pretrained(finbert_checkpoint)

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# Wrap the already-loaded model and tokenizer in a sentiment-analysis pipeline
finbert_sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

In [22]:
# Quick check of the pipeline on a single sample sentence
sample_text = "long term bullish eth cvp bsc binance support bscdefi portfolio cvp bound moon"
sample_prediction = finbert_sentiment_pipeline(sample_text)
print(sample_prediction)

[{'label': 'positive', 'score': 0.7929679155349731}]


In [None]:
# Show the last five rows of the full_text and clean_text columns side by side
df.loc[:, ['full_text', 'clean_text']].tail(n=5)

**TESTING ON A SAMPLE FROM THE DATASET BEFORE APPLYING TO THE DATAFRAME**

In [23]:
# Encode the sample sentence; return_tensors="pt" yields PyTorch tensors for the model
sample_sentence = "long term bullish eth cvp bsc binance support bscdefi portfolio cvp bound moon"
inputs = tokenizer(sample_sentence, return_tensors="pt")

In [24]:
# Run the forward pass and keep the raw, un-normalised class scores (logits).
# This is inference only, so disable gradient tracking to avoid building an
# autograd graph that is never used.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

In [29]:
# Normalise the logits over the class dimension so each row becomes a probability distribution
probabilities = logits.softmax(dim=1)

In [30]:
# Take the first (and only) row of class probabilities as a plain Python list.
# The value is derived from `logits` rather than from `probabilities` itself:
# rebinding `probabilities` from a tensor to a list in place would make this
# cell fail with AttributeError when it is run a second time.
probabilities = softmax(logits, dim=1)[0].tolist()

In [31]:
# Pair every class probability with the label the model config assigns to its index
labels = model.config.id2label
probabilities_with_labels = []
for class_index, class_prob in enumerate(probabilities):
    probabilities_with_labels.append((labels[class_index], class_prob))

print(probabilities_with_labels)

[('positive', 0.7929679155349731), ('negative', 0.015914522111415863), ('neutral', 0.191117525100708)]


**APPLYING TO DF NOW**

In [33]:
def get_sentiment_probabilities(text):
    """Return the class probabilities the global ``model`` assigns to ``text``.

    Args:
        text: The string to score. It is tokenized with the global ``tokenizer``.

    Returns:
        dict: Maps each label name in ``model.config.id2label`` to its softmax
        probability for ``text``.
    """
    # Truncate to the model's position-embedding limit; without this, an input
    # that tokenizes to more positions than the model supports fails in the
    # forward pass.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    with torch.no_grad():  # Disabling gradient calculation for inference
        logits = model(**inputs).logits
    # Row 0 is the single input's distribution over the classes.
    probabilities = softmax(logits, dim=1)[0].tolist()
    labels = model.config.id2label
    return {labels[i]: prob for i, prob in enumerate(probabilities)}

In [36]:
from tqdm import tqdm

# Keep only rows whose clean_text is an actual string (drops NaN / non-text entries).
# .copy() makes the filtered frame independent, so adding a column below does
# not write into a slice of the previous frame (SettingWithCopyWarning).
df = df[df['clean_text'].apply(lambda x: isinstance(x, str))].copy()

batch_size = 32
results = []

# Pull the column out once as a plain list; batches are then simple positional
# list slices instead of a Series slice rebuilt on every iteration.
texts = df['clean_text'].tolist()

# Inference only: make sure dropout and similar layers are in eval mode.
model.eval()

for i in tqdm(range(0, len(texts), batch_size)):
    batch = texts[i:i+batch_size]
    # Pad to the longest text in the batch and truncate to the model's position
    # limit. max_length is passed explicitly because truncation=True alone only
    # truncates when the tokenizer config declares a maximum length.
    inputs = tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=model.config.max_position_embeddings,
        return_tensors="pt",
    )
    with torch.no_grad():
        outputs = model(**inputs)
    # One probability row per text, in the same order as the batch.
    results.extend(softmax(outputs.logits, dim=1).tolist())

# Every row must have exactly one result before the column is attached.
assert len(results) == len(df), "number of predictions does not match number of rows"

# Map probabilities to labels and add to DataFrame.
# Labels are looked up by class index so the order matches the logits columns.
id2label = model.config.id2label
label_names = [id2label[idx] for idx in range(len(id2label))]
df['sentiment_probabilities'] = [dict(zip(label_names, r)) for r in results]


100%|██████████| 8514/8514 [1:45:46<00:00,  1.34it/s]


In [37]:
# Display the first five rows, now including the sentiment_probabilities column
df.iloc[:5]

Unnamed: 0,created_at,like_count,quote_count,reply_count,retweet_count,full_text,coin_id,datasource,clean_text,sentiment_probabilities
0,2021-02-01,154,,18,23,#privacy is a human right. learn how to make your #bitcoin transactions more private in this clip: https://t.co/fnadsxffcu https://t.co/nznajw8g2m,btc,influtweets,privacy human right learn make bitcoin transaction private clip,"{'positive': 0.03522004932165146, 'negative': 0.027227938175201416, 'neutral': 0.9375520348548889}"
1,2021-02-01,65,,8,13,"attended any of the 30+ livestream events iâve held over the last year? together, weâve learned a lot about #bitcoin #ethereum &amp; #lightningnetwork. starting this month iâll be moving them to sundays. subscribe to be notified as they're announced: https://t.co/rgoqfv22mr https://t.co/ec4ugzrilq",amp,influtweets,attended 30 livestream event ive held last year together weve learned lot bitcoin ethereum lightningnetwork starting month ill moving sunday subscribe notified announced,"{'positive': 0.14518015086650848, 'negative': 0.01218145340681076, 'neutral': 0.842638373374939}"
2,2021-02-01,65,,8,13,"attended any of the 30+ livestream events iâve held over the last year? together, weâve learned a lot about #bitcoin #ethereum &amp; #lightningnetwork. starting this month iâll be moving them to sundays. subscribe to be notified as they're announced: https://t.co/rgoqfv22mr https://t.co/ec4ugzrilq",btc,influtweets,attended 30 livestream event ive held last year together weve learned lot bitcoin ethereum lightningnetwork starting month ill moving sunday subscribe notified announced,"{'positive': 0.14518015086650848, 'negative': 0.01218145340681076, 'neutral': 0.842638373374939}"
3,2021-02-01,65,,8,13,"attended any of the 30+ livestream events iâve held over the last year? together, weâve learned a lot about #bitcoin #ethereum &amp; #lightningnetwork. starting this month iâll be moving them to sundays. subscribe to be notified as they're announced: https://t.co/rgoqfv22mr https://t.co/ec4ugzrilq",eth,influtweets,attended 30 livestream event ive held last year together weve learned lot bitcoin ethereum lightningnetwork starting month ill moving sunday subscribe notified announced,"{'positive': 0.14518015086650848, 'negative': 0.012181459926068783, 'neutral': 0.842638373374939}"
4,2021-02-01,1,1.0,1,1,Own shares in both $STC and $WELL,stc,othertweets,share stc well,"{'positive': 0.04716747626662254, 'negative': 0.020153099671006203, 'neutral': 0.9326794147491455}"


In [38]:
# Write the labelled frame to disk, leaving out the index column
output_csv = 'clean_plus_labelled.csv'
df.to_csv(output_csv, index=False)