In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch


In [2]:
# Step 2: Load the FINBERT model
model_name = "yiyanghkust/finbert-tone"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [4]:
import os

# Specify the current file name
current_file_name = "shadow_minutes 9_rajat.csv"

# Specify the new file name
new_file_name = "test.csv"

# Rename the file
os.rename(current_file_name, new_file_name)

In [5]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("test.csv")

# Print the column names
print("Column Names:")
print(df.columns)

Column Names:
Index(['Sentences', 'Dominant Sentiment', 'Reason'], dtype='object')


In [6]:
# Step 3: Load and preprocess your data
def preprocess_data(csv_file):
    df = pd.read_csv(csv_file)
    # Your preprocessing steps here
    return df

# Step 4: Perform sentiment analysis
def perform_sentiment_analysis(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1).detach().numpy()
    return probabilities

# Step 5: Evaluate the results

# Example usage
csv_file = "/content/test.csv"
data = preprocess_data(csv_file)
sentiments = []
positive_probs = []
negative_probs = []
neutral_probs = []
for text in data['Sentences']:
    probabilities = perform_sentiment_analysis(text)
    positive_prob = round(probabilities[0][1], 2)  # Round to two decimal places
    negative_prob = round(probabilities[0][0], 2)  # Round to two decimal places
    neutral_prob = round(probabilities[0][2], 2)

    if positive_prob > negative_prob and positive_prob > neutral_prob:
        sentiment = "positive"
    elif negative_prob > positive_prob and negative_prob > neutral_prob:
        sentiment = "negative"
    else:
        sentiment = "neutral"

    sentiments.append(sentiment)
    positive_probs.append(float(positive_prob))  # Convert numpy float32 to Python float
    negative_probs.append(float(negative_prob))  # Convert numpy float32 to Python float
    neutral_probs.append(float(neutral_prob))  # Convert numpy float32 to Python float

data['sentiment'] = sentiments
data['positive_prob'] = positive_probs
data['negative_prob'] = negative_probs
data['neutral_prob'] = neutral_probs

print(data)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


                                             Sentences Dominant Sentiment  \
0    There is an unprecedented shock to every econo...          Negative    
1    It practically immobilizes the workforce and t...          Negative    
2    As in the previous Financial Crisis, the Monet...            Neutral   
3    RBI must ensure that key Institutions (like Ba...            Neutral   
4    The last also requires close Monetary-Fiscal C...            Neutral   
..                                                 ...                ...   
158  While the current fiscal and monetary actions ...           Positive   
159  Going forward, more fiscal and monetary measur...            Neutral   
160  Right now, control and coordination to prevent...            Neutral   
161  RBI's initiatives are obviously in the right d...           Positive   
162  Only issue for discussion is if the measures a...            Neutral   

                                                Reason sentiment  \
0      

In [24]:
# Example usage
statement = input("Enter a statement: ")
sentiment, probabilities = perform_sentiment_analysis(statement)
print(f"Predicted Sentiment: {sentiment}")
print(f"Probability (Positive): {round(probabilities[0][1],2)}")
print(f"Probability (Negative): {round(probabilities[0][0],2)}")
print(f"Probability (Neutral): {round(probabilities[0][2],2)}")


Enter a statement: inflation is rising 
Predicted Sentiment: Negative
Probability (Positive): 0.07000000029802322
Probability (Negative): 0.6200000047683716
Probability (Neutral): 0.3100000023841858


In [26]:
import pandas as pd
from transformers import BertTokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

In [28]:
# Step 2: Load and Preprocess New Data
# Load your new dataset
new_dataset_path = "/content/Dataset_FT.csv"
new_df = pd.read_csv(new_dataset_path)


In [31]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv("Dataset_FT.csv")

# Get the column names
column_names = df.columns.tolist()

# Print the column names
print("Column Names:", column_names)

Column Names: ['sentences', 'Polarity', 'Reason']


In [33]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load your new dataset
new_dataset_path = "/content/Dataset_FT.csv"
new_df = pd.read_csv(new_dataset_path)

# Encode labels to numerical values
label_encoder = LabelEncoder()
new_df['Polarity'] = label_encoder.fit_transform(new_df['Polarity'])

# Preprocess the data
# Tokenize and encode the text data using the tokenizer
max_length = 128
encoded_data = tokenizer(new_df['sentences'].tolist(), padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
labels = torch.tensor(new_df['Polarity'].tolist())

# Split the data into training and validation sets
train_inputs, val_inputs, train_labels, val_labels = train_test_split(encoded_data.input_ids, labels, test_size=0.2, random_state=42)
train_masks, val_masks = train_test_split(encoded_data.attention_mask, test_size=0.2, random_state=42)



In [34]:
# Create PyTorch Dataset and DataLoader
class NewDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'label': self.labels[idx]
        }

batch_size = 32
train_dataset = NewDataset(train_inputs, train_masks, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = NewDataset(val_inputs, val_masks, val_labels)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [36]:
import torch

# Check if GPU is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [37]:
# Step 3: Fine-tune the Model
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

In [39]:
# Step 4: Evaluate the Model
model.eval()  # Set the model to evaluation mode

total_correct = 0
total_samples = 0
total_loss = 0.0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)

        # Compute loss
        loss = criterion(outputs.logits, labels)
        total_loss += loss.item() * input_ids.size(0)  # Multiply by batch size

        # Compute accuracy
        _, predicted = torch.max(outputs.logits, 1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

# Calculate average loss and accuracy
average_loss = total_loss / total_samples
accuracy = total_correct / total_samples

print(f"Validation Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}")

Validation Loss: 1.4958, Accuracy: 0.5882


In [38]:
# Step 5: Save the Fine-tuned Model
torch.save(model.state_dict(), 'finetuned_model_on_new_dataset.pth')


In [51]:
# Assume you have a fine-tuned model named 'model' and a tokenizer named 'tokenizer'

# Step 1: Preprocess the Statement
statement = " FDI is decreasing. "

# Tokenize and encode the statement
encoded_statement = tokenizer(statement, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')

# Step 2: Inference
with torch.no_grad():
    input_ids = encoded_statement['input_ids'].to(device)
    attention_mask = encoded_statement['attention_mask'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)

# Step 3: Interpret the Results
# Get the predicted class
_, predicted = torch.max(outputs.logits, 1)
predicted_class = predicted.item()

# Get the probabilities for each class
probabilities = torch.softmax(outputs.logits, dim=1).squeeze().tolist()
positive_prob = probabilities[1]
negative_prob = probabilities[0]
neutral_prob = probabilities[2]

# Determine sentiment based on probabilities
if predicted_class == 0:
    sentiment = "negative"
elif predicted_class == 1:
    sentiment = "positive"
else:
    sentiment = "neutral"

# Print results
print("Predicted Sentiment:", sentiment)
print("Probability (Positive):", positive_prob)
print("Probability (Negative):", negative_prob)
print("Probability (Neutral):", neutral_prob)

Predicted Sentiment: positive
Probability (Positive): 0.9429779052734375
Probability (Negative): 0.02000230737030506
Probability (Neutral): 0.03701980039477348
