In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer
import numpy as np

class TextProbDataset(Dataset):
    """Dataset pairing raw texts with their target probability lists.

    Texts are tokenized lazily, one at a time, when an item is requested.

    Args:
        text_prob_map: Mapping whose ``items()`` yields ``(text, prob_list)`` pairs.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``; its
            result is indexed with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length forwarded to the tokenizer for padding/truncation.
    """

    def __init__(self, text_prob_map, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Materialize the pairs once so texts[i] and probs[i] always line up.
        pairs = list(text_prob_map.items())
        self.texts = [text for text, _ in pairs]
        self.probs = [prob_list for _, prob_list in pairs]

    def __len__(self):
        """Return the number of (text, probabilities) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it with its target probabilities.

        Returns:
            Dict with keys ``'input_ids'``, ``'attention_mask'`` (tokenizer
            outputs with the leading axis removed) and ``'probs'`` (float32 tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Remove only the leading axis (presumably the size-1 batch axis added by
        # return_tensors='pt' for a single text). A bare squeeze() would also
        # collapse the sequence axis when max_length == 1, yielding 0-d tensors.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'probs': torch.tensor(self.probs[idx], dtype=torch.float32)
        }

class BertProbClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward head ending in a softmax.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        num_classes: Number of outputs produced by the head.
        freeze_bert: When True, every encoder parameter gets
            ``requires_grad = False`` so only the head receives gradients.
        hidden_dim: Width of the head's intermediate layer.
        dropout: Dropout probability used inside the head.
    """

    def __init__(self, bert_model_name='bert-base-uncased', num_classes=3,
                 freeze_bert=True, hidden_dim=256, dropout=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)

        # Freeze BERT parameters (optional, on by default)
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

        # Take the encoder width from its config rather than assuming 768, so
        # checkpoints with a different hidden size work without code changes.
        encoder_dim = self.bert.config.hidden_size

        # Classification head; softmax runs over the last (class) axis.
        self.classifier = nn.Sequential(
            nn.Linear(encoder_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
            nn.Softmax(dim=-1)
        )

    def forward(self, input_ids, attention_mask):
        """Run the encoder and map its ``pooler_output`` through the head.

        Args:
            input_ids: Forwarded to the encoder as ``input_ids``.
            attention_mask: Forwarded to the encoder as ``attention_mask``.

        Returns:
            The head's softmax output for the encoder's pooled representation.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.classifier(pooled_output)

def train_model(model, train_loader, num_epochs=5, learning_rate=1e-4, device='cuda'):
    """Train ``model`` with MSE loss and Adam, printing the average loss per epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable of batches; each batch is indexed with
            ``'input_ids'``, ``'attention_mask'`` and ``'probs'``. It is iterated
            once per epoch and does not need to implement ``__len__``.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate passed to Adam.
        device: Device the model and each batch tensor are moved to.

    Returns:
        List holding the average batch loss of every epoch, in order.

    Raises:
        ValueError: If ``train_loader`` yields no batches during an epoch.
    """
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    epoch_losses = []

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        # Count batches ourselves instead of calling len(train_loader), so an
        # empty loader is detected explicitly and length-less iterables work.
        num_batches = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            true_probs = batch['probs'].to(device)

            optimizer.zero_grad()
            predicted_probs = model(input_ids, attention_mask)
            loss = criterion(predicted_probs, true_probs)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError('train_loader yielded no batches; nothing to train on')

        avg_loss = total_loss / num_batches
        epoch_losses.append(avg_loss)
        print(f'Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}')

    return epoch_losses

def predict(model, text, tokenizer, device='cuda', max_length=128):
    """Run the model on one text and return the first output row as a NumPy array.

    The model is switched to eval mode, the text is tokenized with padding and
    truncation to ``max_length``, and the forward pass runs with gradient
    tracking disabled. The result is moved to the CPU before conversion.
    """
    model.eval()

    tokenize_options = dict(
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    batch = tokenizer(text, **tokenize_options)

    # Move both tokenizer outputs to the requested device
    ids = batch['input_ids'].to(device)
    mask = batch['attention_mask'].to(device)

    with torch.no_grad():
        output = model(ids, mask)

    as_array = output.cpu().numpy()
    return as_array[0]

# Example usage
def main():
    """Train the classifier on three sample reviews and print a prediction for a new text."""
    # Sample data: each text maps to its list of target probabilities
    text_prob_map = {
        "This is a positive review": [0.8, 0.1, 0.1],
        "This is a negative review": [0.1, 0.8, 0.1],
        "This is a neutral review": [0.1, 0.1, 0.8]
    }

    # Use the GPU when one is available, otherwise fall back to the CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the pretrained tokenizer and build the classifier
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertProbClassifier()

    # Wrap the samples in a dataset and serve them in shuffled batches
    train_loader = DataLoader(
        TextProbDataset(text_prob_map, tokenizer),
        batch_size=8,
        shuffle=True,
    )

    # Fit the model
    train_model(model, train_loader, device=device)

    # Score a text that was not part of the training samples
    test_text = "This is a new review"
    predictions = predict(model, test_text, tokenizer, device)
    print(f"Predictions for '{test_text}': {predictions}")


if __name__ == "__main__":
    main()

Epoch 1/5, Average Loss: 0.1100
Epoch 2/5, Average Loss: 0.1011
Epoch 3/5, Average Loss: 0.1189
Epoch 4/5, Average Loss: 0.1062
Epoch 5/5, Average Loss: 0.1187
Predictions for 'This is an extremely positive review': [0.31238875 0.37285504 0.31475627]


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the model and the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
# NOTE(review): the load-time warning in this cell's output says the classifier
# weights are newly initialized, so the printed class is not meaningful until
# the model has been trained on a downstream task.
model.eval()  # make the inference mode explicit

# Prepare the input text
text = "El clima es hermoso hoy"
inputs = tokenizer(text, return_tensors="pt")

# Run inference; gradients are not needed for a prediction
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Get the prediction: highest-scoring index along the last (label) axis
predicted_class = torch.argmax(logits, dim=-1).item()
print(predicted_class)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1
