In [22]:
# Packages
import os
import numpy as np
import pandas as pd
import random
import time
import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns
from math import ceil
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import spearmanr
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler
from scipy.stats.mstats import winsorize

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModel
import torch


#pip install requests
import requests

# Omit Warnings
import warnings
warnings.filterwarnings("ignore")

In [18]:
# Location of the input CSV. The default is the original local path; set the
# TM_TEST_CSV environment variable to run the notebook on another machine
# without editing this cell.
TEST_CSV_PATH = os.environ.get(
    "TM_TEST_CSV",
    r"C:\Users\braul\OneDrive\Desktop\Academia\TO NEW BEGINNINGS\2nd Semester 2024-2025\Text Mining\TM Project\test.csv",
)
df = pd.read_csv(TEST_CSV_PATH)

# Plain list of the "text" column. Missing values are replaced with "" first
# so they do not turn into the literal string "nan" under astype(str); this
# matches the fillna("") handling used when computing the BERTweet embeddings.
texts = df["text"].fillna("").astype(str).tolist()

In [20]:
# Load FinBERT and wrap it in a sentiment-analysis pipeline
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Missing texts become "" rather than the literal string "nan", so a NaN row
# is not scored as if it were the word "nan".
finbert_inputs = df['text'].fillna("").astype(str).tolist()

# Call the pipeline once on the whole list; it returns one result dict per
# input. truncation=True is forwarded to the tokenizer so an over-length text
# is clipped to the model's maximum sequence length instead of raising.
# The guard avoids calling the pipeline with an empty list.
finbert_results = nlp(finbert_inputs, truncation=True) if finbert_inputs else []

# Keep only the predicted label for each row
df['sentiment'] = [result['label'] for result in finbert_results]

# Show sample
print(df.head())

Device set to use cpu


   id                                               text sentiment
0   0  ETF assets to surge tenfold in 10 years to $50...  positive
1   1  Here’s What Hedge Funds Think Evolution Petrol...   neutral
2   2  $PVH - Phillips-Van Heusen Q3 2020 Earnings Pr...   neutral
3   3  China is in the process of waiving retaliatory...  negative
4   4  Highlight: “When growth is scarce, investors s...   neutral


In [23]:
# BERTweet sentence embeddings
# Load the BERTweet tokenizer (slow/Python implementation) and the bare encoder
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
model = AutoModel.from_pretrained("vinai/bertweet-base")

# Put the model in evaluation mode. Gradient tracking is switched off
# separately by the torch.no_grad() context inside the loop.
model.eval()

# Number of sentences encoded per forward pass
EMBED_BATCH_SIZE = 32

# Missing texts are encoded as the empty string
bertweet_texts = df['text'].fillna("").astype(str).tolist()

# One 1-D embedding vector per input row, in row order
embeddings = []

for start in range(0, len(bertweet_texts), EMBED_BATCH_SIZE):
    batch = bertweet_texts[start:start + EMBED_BATCH_SIZE]
    # padding=True pads each batch to its longest member; the attention mask
    # returned by the tokenizer is passed to the model via **inputs.
    inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    # Take the hidden state of the first token of every sequence as the
    # sentence embedding. Batched, padded inputs can differ from one-by-one
    # encoding at floating-point rounding level.
    first_token_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
    embeddings.extend(first_token_embeddings)

# One row per sentence, one integer-named column per embedding dimension
embedding_df = pd.DataFrame(embeddings)

# Combine with the original data; the index is reset so rows align by position
df_with_embeddings = pd.concat([df.reset_index(drop=True), embedding_df], axis=1)

# Save the result (optional)
df_with_embeddings.to_csv("bertweet_embeddings.csv", index=False)

# Display a sample
print(df_with_embeddings.head())

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


   id                                               text sentiment         0  \
0   0  ETF assets to surge tenfold in 10 years to $50...  positive  0.025032   
1   1  Here’s What Hedge Funds Think Evolution Petrol...   neutral -0.125657   
2   2  $PVH - Phillips-Van Heusen Q3 2020 Earnings Pr...   neutral  0.003956   
3   3  China is in the process of waiving retaliatory...  negative  0.121045   
4   4  Highlight: “When growth is scarce, investors s...   neutral -0.045187   

          1         2         3         4         5         6  ...       758  \
0  0.124286  0.162430 -0.086517  0.069848 -0.056850  0.050488  ...  0.097884   
1  0.351593  0.167111 -0.254394  0.069134 -0.130759  0.069325  ... -0.030412   
2  0.238421  0.154213  0.121475 -0.053822 -0.110292  0.138608  ...  0.111118   
3  0.297020  0.324001 -0.098322 -0.111960 -0.158160  0.205593  ... -0.036597   
4  0.194236  0.091016 -0.081666  0.047738  0.051145  0.068032  ...  0.063630   

        759       760       761       

In [24]:
# Build the FinTwitBERT fill-mask pipeline on the PyTorch backend
pipe = pipeline(
    task="fill-mask",
    model="StephanAkkerman/FinTwitBERT",
    framework="pt",
)

def apply_mask_prediction(text):
    """Return the top fill-mask prediction for ``text`` using the global ``pipe``.

    Parameters
    ----------
    text : str
        Input sentence. Only texts containing the literal ``"[MASK]"`` are
        sent to the pipeline.

    Returns
    -------
    str or None
        * ``None`` when ``text`` is not a string or contains no ``"[MASK]"``.
        * The highest-ranked ``token_str`` when there is a single mask.
        * The highest-ranked ``token_str`` of every mask, joined by a single
          space in mask order, when there are several masks.
        * ``"Error: <message>"`` when the pipeline call or result parsing
          raises, so that one bad row does not abort a whole-column apply.
    """
    # Non-string values (None, NaN, ...) cannot contain a mask; skip them
    # instead of raising on the membership test.
    if not isinstance(text, str) or "[MASK]" not in text:
        return None  # Skip rows without a mask
    try:
        result = pipe(text)
        # With several masks the pipeline returns one candidate list per mask
        # (a list of lists) rather than a flat list of candidates.
        if result and isinstance(result[0], list):
            return " ".join(candidates[0]['token_str'] for candidates in result)
        # Single mask: flat list of candidates, best first
        return result[0]['token_str']
    except Exception as e:
        # Best-effort: report the failure in the cell value rather than raising
        return f"Error: {e}"

# Run the mask-prediction helper over every text (values coerced to str first).
# NOTE(review): the recorded sample output shows None for each of the first
# five rows, none of which contains "[MASK]" — confirm this step is intended.
mask_inputs = df['text'].astype(str)
df['finbert_prediction'] = mask_inputs.map(apply_mask_prediction)

# Persist the frame, new column included, without writing the index
df.to_csv("fintwitbert_predictions.csv", index=False)

# Preview each text next to its prediction
preview_columns = ['text', 'finbert_prediction']
print(df[preview_columns].head())


Device set to use cpu


                                                text finbert_prediction
0  ETF assets to surge tenfold in 10 years to $50...               None
1  Here’s What Hedge Funds Think Evolution Petrol...               None
2  $PVH - Phillips-Van Heusen Q3 2020 Earnings Pr...               None
3  China is in the process of waiving retaliatory...               None
4  Highlight: “When growth is scarce, investors s...               None
