In [None]:
import pandas as pd

# Read the full dataset. The file is parsed without a header row: the two
# columns are named "sentiment" and "text", and undecodable bytes are replaced.
df = pd.read_csv(
    'all-data.csv',
    names=["sentiment", "text"],
    encoding="utf-8",
    encoding_errors="replace",
)

# Draw a reproducible (random_state=42) sample of 1500 rows and store it,
# with a header line and without the index, as the evaluation file test.csv.
sampled_df = df.sample(n=1500, random_state=42)
sampled_df.to_csv('test.csv', index=False, encoding='utf-8')

# df = pd.read_csv('all-data.csv', names=["sentiment", "text"], encoding="utf-8", encoding_errors="replace")
# df1 = pd.read_csv('gpt_augment.csv', names=["sentiment", "text"], encoding="utf-8", encoding_errors="replace")
# distinct_df1 = df1[~df1.apply(tuple, 1).isin(df.apply(tuple, 1))]
# print(distinct_df1)
# distinct_df1.to_csv('gpt.csv', index=False, encoding='utf-8')

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Tokenizer and classifier are both loaded from the same Hugging Face checkpoint.
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'
finbert_tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
# Three output classes: positive, neutral, negative.
finbert_model = BertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT, num_labels=3)

# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
finbert_model.to(device)

# Load data from CSV for training
def load_data(csv_file):
    """Read a sentiment CSV and return it as a tokenized ``TensorDataset``.

    The first row of the file is consumed as a header (``header=0``) and the
    two columns are named ``sentiment`` and ``text``. Sentiment strings are
    mapped to class ids (neutral=0, positive=1, negative=2) and the texts are
    tokenized with the notebook-level ``finbert_tokenizer`` (padding and
    truncation enabled, ``max_length=512``).

    Args:
        csv_file: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        ``TensorDataset`` whose items are ``(input_ids, attention_mask, label)``.

    Raises:
        KeyError: If a sentiment value is not 'neutral', 'positive' or 'negative'.
    """
    frame = pd.read_csv(
        csv_file,
        names=["sentiment", "text"],
        encoding="utf-8",
        encoding_errors="replace",
        header=0,
    )

    # Translate every sentiment string into its integer class id.
    class_ids = {'neutral': 0, 'positive': 1, 'negative': 2}
    targets = []
    for sentiment in frame['sentiment'].tolist():
        targets.append(class_ids[sentiment])

    # Tokenize all texts in one call so they share a common padded length.
    encoded = finbert_tokenizer(
        frame['text'].tolist(),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    return TensorDataset(
        encoded['input_ids'],
        encoded['attention_mask'],
        torch.tensor(targets),
    )

# Create DataLoader for training and evaluation
def create_dataloader(dataset, batch_size, shuffle=True):
    """Wrap ``dataset`` in a ``DataLoader``.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the samples on every pass. Defaults to
            ``True`` (the previous hard-coded behaviour, appropriate for
            training); pass ``False`` for a deterministic batch order, e.g.
            when evaluating.

    Returns:
        A ``DataLoader`` over ``dataset``.
    """
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Training function
def train_model(model, train_dataloader, optimizer, epochs=3):
    """Train ``model`` on ``train_dataloader`` and print the mean loss of each epoch.

    Every batch is moved to the notebook-level ``device`` before the forward
    pass. The model is called with ``input_ids``, ``attention_mask`` and
    ``labels`` and must return an object exposing a ``loss`` attribute.

    Args:
        model: Model to optimise; switched to training mode once, up front.
        train_dataloader: Sized iterable yielding
            ``(input_ids, attention_mask, labels)`` tensor batches.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` drive the update.
        epochs: Number of full passes over ``train_dataloader`` (default 3).
    """
    model.train()

    for epoch_number in range(1, epochs + 1):
        running_loss = 0
        for batch in train_dataloader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            # Reset gradients left over from the previous step.
            optimizer.zero_grad()

            # Forward pass; the loss is read from the model output.
            loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
            running_loss += loss.item()

            # Backpropagate and apply the parameter update.
            loss.backward()
            optimizer.step()

        mean_loss = running_loss / len(train_dataloader)
        print(f'Epoch {epoch_number}/{epochs}, Loss: {mean_loss:.4f}')

# Evaluation function
def evaluate_model(model, dataloader):
    """Print accuracy and a per-class classification report for ``model``.

    Each batch is moved to the notebook-level ``device``, predictions are the
    arg-max of the model's logits, and the collected predictions are compared
    with the true labels using scikit-learn.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its
            output must expose ``logits``.
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)``
            tensor batches.

    Returns:
        None. The metrics are printed.
    """
    # Index in this list == class id used when the labels were encoded.
    class_names = ['neutral', 'positive', 'negative']

    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Get predictions
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    # Pass the class ids explicitly: without `labels`, scikit-learn infers the
    # classes from the data, so a class that is absent from both the true and
    # the predicted labels would no longer line up with the three target names.
    report = classification_report(
        true_labels,
        predictions,
        labels=list(range(len(class_names))),
        target_names=class_names,
    )

    print(f'Accuracy: {accuracy * 100:.2f}%')
    print("Classification Report:\n", report)

# Load data for training (batches of 16)
train_dataset = load_data('combined_synthetic_data.csv')
train_dataloader = create_dataloader(train_dataset, batch_size=16)

# Load data for evaluation (test.csv, batches of 16)
eval_dataset = load_data('test.csv')
eval_dataloader = create_dataloader(eval_dataset, batch_size=16)

# Define optimizer (you can adjust learning rate and parameters).
# PyTorch's own AdamW is used because the AdamW class exported by
# `transformers` is deprecated in favour of it. `eps` and `weight_decay` are
# set explicitly to the defaults of the transformers implementation (1e-6 and
# 0.0) so the training hyper-parameters stay the same.
# NOTE(review): the `AdamW` name in the `from transformers import ...` line of
# this cell is now unused and should be dropped; newer transformers releases
# no longer export it.
optimizer = torch.optim.AdamW(finbert_model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Train the model
train_model(finbert_model, train_dataloader, optimizer, epochs=3)

# Evaluate the model
evaluate_model(finbert_model, eval_dataloader)



model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Epoch 1/3, Loss: 0.8411
Epoch 2/3, Loss: 0.0536
Epoch 3/3, Loss: 0.0015
Accuracy: 75.67%
Classification Report:
               precision    recall  f1-score   support

     neutral       0.75      0.92      0.83       877
    positive       0.85      0.44      0.58       438
    negative       0.71      0.72      0.72       185

    accuracy                           0.76      1500
   macro avg       0.77      0.70      0.71      1500
weighted avg       0.77      0.76      0.74      1500



In [None]:
pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
pip install cleanlab

Collecting cleanlab
  Downloading cleanlab-2.7.0-py3-none-any.whl.metadata (60 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/60.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading cleanlab-2.7.0-py3-none-any.whl (347 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m347.8/347.8 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cleanlab
Successfully installed cleanlab-2.7.0


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd

# Load the FinBERT tokenizer and classifier from the same checkpoint
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'
finbert_tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
finbert_model = BertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

# Inference only: put the model in evaluation mode
finbert_model.eval()

# Read the CSV. header=0 consumes the file's first row as a header and the
# two columns are renamed to "sentiment" and "text".
df = pd.read_csv(
    'combined_synthetic_data.csv',
    names=["sentiment", "text"],
    encoding="utf-8",
    encoding_errors="replace",
    header=0,
)

# Pull the two columns out as plain Python lists
texts = df['text'].tolist()
sentiments = df['sentiment'].tolist()

# Sentiment strings ('neutral', 'positive', 'negative') -> integer class ids
label_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}
labels = list(map(label_mapping.__getitem__, sentiments))

# Tokenize the input texts with padding and truncation
inputs = finbert_tokenizer(
    texts,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)

# The tokenizer already returns tensors; only the labels need converting
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
labels = torch.tensor(labels)

# Bundle everything into a TensorDataset and iterate it in batches of 16
dataset = TensorDataset(input_ids, attention_mask, labels)
dataloader = DataLoader(dataset, batch_size=16)  # Adjust batch size as needed

# Define the evaluation function
def evaluate_model(model, dataloader):
    """Return the fraction of samples in ``dataloader`` that ``model`` classifies correctly.

    Batches are moved to the device that holds the model's parameters before
    the forward pass, so the function works whether the model lives on the CPU
    or has been moved to a GPU. A model without parameters is called with the
    batches exactly as they come from the loader.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its
            output must expose ``logits``.
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)``
            tensor batches.

    Returns:
        Accuracy as a float in ``[0, 1]``.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no samples.
    """
    model.eval()

    # Find the device of the model's weights (None when there are no weights).
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    model_device = first_param.device if first_param is not None else None

    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            if model_device is not None:
                # Inputs and labels must share the model's device for the
                # forward pass and for the comparison below.
                input_ids = input_ids.to(model_device)
                attention_mask = attention_mask.to(model_device)
                labels = labels.to(model_device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Compute predictions
            predictions = torch.argmax(logits, dim=-1)
            total_correct += (predictions == labels).sum().item()
            total_samples += labels.size(0)

    accuracy = total_correct / total_samples
    return accuracy

# Score the pretrained model on the loaded data and report the accuracy as a percentage
accuracy = evaluate_model(finbert_model, dataloader)
accuracy_percent = accuracy * 100
print(f'Accuracy: {accuracy_percent:.2f}%')

Accuracy: 82.78%


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
import numpy as np

# Load FinBERT tokenizer and model
finbert_tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
finbert_model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')

# Set the model to evaluation mode
finbert_model.eval()

# Load CSV file into pandas DataFrame
df = pd.read_csv('/content/all-data.csv', encoding='ISO-8859-1')

# The CSV must provide 'Text' and 'Sentiment' columns
texts = df['Text'].tolist()
sentiments = df['Sentiment'].tolist()

# Map sentiments to numeric labels (neutral: 0, positive: 1, negative: 2)
label_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}
labels = [label_mapping[sentiment] for sentiment in sentiments]

# Tokenize the input texts with padding and truncation
inputs = finbert_tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)

# The tokenizer already returns tensors; only the labels need converting
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
labels = torch.tensor(labels)

# Create a TensorDataset and DataLoader for evaluation
# (no shuffling, so row i of the outputs corresponds to row i of df)
dataset = TensorDataset(input_ids, attention_mask, labels)
dataloader = DataLoader(dataset, batch_size=16)  # Adjust batch size as needed

# Store predicted probabilities and true labels for Cleanlab
all_probs = []
all_labels = []

# Generate predictions and probabilities from FinBERT
with torch.no_grad():
    for batch in dataloader:
        input_ids, attention_mask, labels_batch = batch
        outputs = finbert_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Apply softmax to get probabilities
        probs = torch.softmax(logits, dim=1).cpu().numpy()

        # Store the probabilities and true labels
        all_probs.append(probs)
        all_labels.append(labels_batch.cpu().numpy())

# Concatenate the probabilities and labels across all batches
all_probs = np.concatenate(all_probs, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Self-confidence score: the probability the model assigns to each sample's
# GIVEN label. Using the given label's probability (rather than the maximum
# class probability) means a row whose label disagrees with a confident model
# prediction gets a LOW score, which is what flags a potential label issue.
self_confidence_scores = all_probs[np.arange(len(all_labels)), all_labels]

# Rank indices by confidence and select the bottom 5%
num_low_confidence = int(len(self_confidence_scores) * 0.05)
sorted_indices = np.argsort(self_confidence_scores)  # Ascending order (lowest confidence first)
low_confidence_indices = sorted_indices[:num_low_confidence]

# Output the ranking of indices by confidence and the 5% with the lowest confidence
ranked_confidence = list(zip(sorted_indices, self_confidence_scores[sorted_indices]))
print(f"Ranking of indices by confidence (ascending): {ranked_confidence}")
print(f"Indices with the lowest 5% confidence: {low_confidence_indices}")

# Optionally, get a DataFrame of the lowest-confidence samples for inspection
low_confidence_df = df.iloc[low_confidence_indices]
print(low_confidence_df)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [None]:
# Row indices flagged as potential label issues, kept in their original order
label_issues = [
    441, 368, 8, 96, 150, 336, 413, 158, 168, 328,
    341, 184, 402, 318, 160, 192, 316, 394, 231, 397,
    46, 449, 19, 116, 124, 140, 209, 109, 71, 330,
    445, 418, 263, 374, 235, 165, 382, 380, 407, 173,
    206, 240, 118, 176, 87, 342, 55, 213, 142, 147,
    221, 207, 200, 97, 218, 344, 360, 185, 6, 59,
    355, 435, 216, 37, 329, 311, 350, 183, 233, 44,
    388, 86, 248, 423, 384, 396, 411, 446, 389, 99,
    152, 259, 217, 323, 141, 204, 43, 33, 399, 188,
    282, 111, 27, 434, 103, 38, 189, 308, 31, 393,
    1, 227, 379, 290, 132, 16, 244, 133, 287, 370,
]

# How many indices were flagged
total_issues = len(label_issues)

# Report the count
print(f"Total number of potential label issues: {total_issues}")
