In [None]:
pip install transformers

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset

In [None]:
def load_data(file_path):
    """Read a CSV file and return its ``text`` and ``label`` columns.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``. The file must contain a ``text`` column and
            a ``label`` column.

    Returns:
        A ``(sentences, labels)`` tuple holding the ``.values`` of the
        ``text`` and ``label`` columns, in file order.

    Raises:
        KeyError: If either column is missing from the file.
    """
    frame = pd.read_csv(file_path)
    # Pull both columns out in one pass; "text" is read before "label".
    sentences, labels = (frame[column].values for column in ("text", "label"))
    return sentences, labels

def tokenize_inputs(tokenizer, sentences, labels):
    """Tokenize sentences into PyTorch tensors and attach their labels.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(sentences, padding=True, truncation=True,
            return_tensors="pt")``. Its result must support item assignment.
        sentences: Sentences to encode; forwarded to ``tokenizer`` unchanged.
        labels: Class ids, converted with ``torch.tensor``.

    Returns:
        The object returned by ``tokenizer`` with an extra ``'labels'`` entry
        holding the label tensor.
    """
    tokenizer_options = {"padding": True, "truncation": True, "return_tensors": "pt"}
    encoded = tokenizer(sentences, **tokenizer_options)
    encoded['labels'] = torch.tensor(labels)
    return encoded

def train_model(model, optimizer, loss_fn, train_inputs, train_labels, attention_mask_train, validation_inputs, validation_labels, attention_mask_val, device, epochs):
    """Train ``model`` with one full-batch optimizer step per epoch.

    Every epoch runs a single forward/backward pass over the whole training
    set, then a gradient-free forward pass over the whole validation set, and
    prints both losses.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            (for training) ``labels``; its output must expose ``.loss`` and
            ``.logits``.
        optimizer: Optimizer bound to the parameters of ``model``.
        loss_fn: Callable ``loss_fn(logits, labels)`` used for the
            validation loss. The training loss comes from ``model`` itself.
        train_inputs: Training token-id tensor.
        train_labels: Training label tensor.
        attention_mask_train: Attention mask for ``train_inputs``.
        validation_inputs: Validation token-id tensor.
        validation_labels: Validation label tensor.
        attention_mask_val: Attention mask for ``validation_inputs``.
        device: Device the model and all tensors are moved to.
        epochs: Number of epochs to run.

    Returns:
        None. The model is updated in place; losses are printed per epoch.
    """
    model.to(device)

    # Moving a tensor that already lives on `device` is a no-op, so placing
    # both splits once up front is equivalent to re-placing them every epoch.
    train_batch = {
        "input_ids": train_inputs.to(device),
        "attention_mask": attention_mask_train.to(device),
        "labels": train_labels.to(device),
    }
    val_batch = {
        "input_ids": validation_inputs.to(device),
        "attention_mask": attention_mask_val.to(device),
    }
    val_targets = validation_labels.to(device)

    for epoch_number in range(1, epochs + 1):
        # Training step over the full training set.
        model.train()
        optimizer.zero_grad()
        train_loss = model(**train_batch).loss
        train_loss.backward()
        optimizer.step()

        # Validation pass; labels are withheld so the loss comes from loss_fn.
        model.eval()
        with torch.no_grad():
            val_logits = model(**val_batch).logits
            val_loss = loss_fn(val_logits, val_targets)

        print(f"Epoch {epoch_number}: Train Loss = {train_loss.item()}, Val Loss = {val_loss.item()}")

def evaluate_model(model, test_inputs, test_labels, attention_mask_test, device, loss_fn=None):
    """Evaluate ``model`` on a labelled set and print its loss and accuracy.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        test_inputs: Token-id tensor.
        test_labels: Label tensor aligned with ``test_inputs``.
        attention_mask_test: Attention mask for ``test_inputs``.
        device: Device the model and all tensors are moved to.
        loss_fn: Optional callable ``loss_fn(logits, labels)``. Defaults to
            ``torch.nn.CrossEntropyLoss()`` so the function does not depend
            on a notebook-global ``loss_fn`` being defined.

    Returns:
        The category name when exactly one sentence was evaluated and its
        predicted class id is known; ``None`` for multi-sentence input or an
        unknown class id.
    """
    if loss_fn is None:
        loss_fn = torch.nn.CrossEntropyLoss()
    categories = {0: "Animals", 1: "Politics", 2: "Sports", 3: "Technology"}

    model.to(device)
    model.eval()
    with torch.no_grad():
        test_inputs = test_inputs.to(device)
        test_labels = test_labels.to(device)
        attention_mask_test = attention_mask_test.to(device)
        output = model(input_ids=test_inputs, attention_mask=attention_mask_test)
        loss = loss_fn(output.logits, test_labels)
        preds = torch.argmax(output.logits, dim=1)
        accuracy = torch.sum(preds == test_labels) / len(test_labels)
    print(f"Test Loss = {loss.item()}, Test Accuracy = {accuracy.item()}")

    # Only a single prediction maps to one category name.
    if preds.numel() != 1:
        return None
    # Look up the integer class id rather than the tensor's repr: the repr
    # of a CUDA tensor carries a device suffix, so a string lookup on it
    # would never match when running on GPU.
    return categories.get(int(preds.item()))

def write_output(model, test_sentences, test_inputs, attention_mask_test, device, test_labels=None, output_path="/output.txt"):
    """Predict a category for every sentence and write the results to a file.

    One ``Sentence: ..., Category: ...`` line is written per sentence. When
    ``test_labels`` is given, the accuracy is appended at the end.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        test_sentences: The raw sentences, as a NumPy array or any sequence.
        test_inputs: Token-id tensor aligned with ``test_sentences``.
        attention_mask_test: Attention mask for ``test_inputs``.
        device: Device the model and all tensors are moved to.
        test_labels: Optional labels aligned with ``test_inputs``.
        output_path: File to write; defaults to the original ``/output.txt``.

    Raises:
        ValueError: If the number of sentences differs from the number of
            predictions.
    """
    categories = {0: "Animals", 1: "Politics", 2: "Sports", 3: "Technology"}
    # Accept both NumPy arrays (which expose .tolist()) and plain sequences.
    sentences = test_sentences.tolist() if hasattr(test_sentences, "tolist") else list(test_sentences)

    model.to(device)
    model.eval()
    # Run inference before touching the file so that a failing forward pass
    # does not leave an empty or truncated output file behind.
    with torch.no_grad():
        output = model(input_ids=test_inputs.to(device), attention_mask=attention_mask_test.to(device))
        preds = torch.argmax(output.logits, dim=1)
    preds_list = preds.tolist()

    if len(sentences) != len(preds_list):
        raise ValueError(
            f"Got {len(sentences)} sentences but {len(preds_list)} predictions"
        )

    with open(output_path, "w", encoding="utf-8") as w:
        for sentence, pred in zip(sentences, preds_list):
            w.write(f"Sentence: {sentence}, Category: {categories.get(pred)}\n")
        if test_labels is not None:
            # The predictions live on `device`; the labels must be there too
            # or the comparison fails when running on GPU.
            labels_on_device = torch.as_tensor(test_labels, device=device)
            acc = torch.sum(preds == labels_on_device) / len(labels_on_device)
            w.write(f"\n\nAccuracy = {acc.item()}")

def sentence_category(model, device):
    """Classify one sentence typed by the user and print the result.

    Prompts for a sentence and for its true label, tokenizes them with the
    notebook-level ``tokenizer``, runs ``evaluate_model`` on that single
    example and prints the sentence together with the returned category.

    Args:
        model: Model handed to ``evaluate_model``.
        device: Device handed to ``evaluate_model``.

    Raises:
        ValueError: If the typed label cannot be parsed as an integer.
    """
    typed_sentence = str(input("Type the sentence you want to test the model: "))
    typed_label = int(input("Type the label of the previous sentence [0 (Animals), 1 (Politics), 2 (Sports), 3 (Technology)]: "))

    # tokenize_inputs works on batches, so wrap the single example in lists.
    encoded = tokenize_inputs(tokenizer, [typed_sentence], [typed_label])
    predicted_category = evaluate_model(
        model,
        encoded['input_ids'],
        encoded['labels'],
        encoded['attention_mask'],
        device,
    )
    print(f"Sentence: {typed_sentence}, Category: {predicted_category}")

In [None]:
# Load the pretrained DistilBERT tokenizer; the cells below pass it to
# tokenize_inputs, and sentence_category reads it as a notebook-level global.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Load the training data.
# Note: train_labels first holds the raw label column returned by load_data
# and is then rebound to the label tensor produced by tokenize_inputs.
train_sentences, train_labels = load_data("/train.csv")
train = tokenize_inputs(tokenizer, train_sentences.tolist(), train_labels)
train_inputs, train_labels = train['input_ids'], train['labels']
attention_mask_train = train['attention_mask']

In [None]:
# Load the validation data.
# Note: validate_labels keeps the raw label column returned by load_data;
# validation_labels is the label tensor produced by tokenize_inputs.
validate_sentences, validate_labels = load_data("/validate.csv")
validation = tokenize_inputs(tokenizer, validate_sentences.tolist(), validate_labels)
validation_inputs, validation_labels = validation['input_ids'], validation['labels']
attention_mask_val = validation['attention_mask']

In [None]:
# Load the test data.
# Note: test_labels first holds the raw label column returned by load_data
# and is then rebound to the label tensor produced by tokenize_inputs.
# test_sentences stays the raw sentence column and is reused by write_output.
test_sentences, test_labels = load_data("/test.csv")
test = tokenize_inputs(tokenizer, test_sentences.tolist(), test_labels)
test_inputs, test_labels = test['input_ids'], test['labels']
attention_mask_test = test['attention_mask']

In [None]:
# Define the model, the loss function and the optimizer.
# train_labels is a tensor here, so the distinct class ids are counted with
# torch.unique. Iterating a tensor yields 0-dim tensors that hash by identity,
# so len(set(train_labels)) would count one "class" per training example
# instead of one per category and build a far too wide classification head.
num_labels = int(torch.unique(train_labels).numel())
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, output_hidden_states=True)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
# Train the model
epochs = 10
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_model(model, optimizer, loss_fn, train_inputs, train_labels, attention_mask_train, validation_inputs, validation_labels, attention_mask_val, device, epochs)

Epoch 1: Train Loss = 4.690629482269287, Val Loss = 4.5714592933654785
Epoch 2: Train Loss = 4.573722839355469, Val Loss = 4.44631814956665
Epoch 3: Train Loss = 4.4427971839904785, Val Loss = 4.291373252868652
Epoch 4: Train Loss = 4.303201198577881, Val Loss = 4.098897933959961
Epoch 5: Train Loss = 4.122618198394775, Val Loss = 3.9237029552459717
Epoch 6: Train Loss = 3.9489762783050537, Val Loss = 3.783336639404297
Epoch 7: Train Loss = 3.8065996170043945, Val Loss = 3.6169421672821045
Epoch 8: Train Loss = 3.6368775367736816, Val Loss = 3.4404549598693848
Epoch 9: Train Loss = 3.4653029441833496, Val Loss = 3.2829253673553467
Epoch 10: Train Loss = 3.30116605758667, Val Loss = 3.1277737617492676


In [None]:
# Evaluate the model on the test split (prints the test loss and accuracy)
evaluate_model(model, test_inputs, test_labels, attention_mask_test, device)

Test Loss = 3.1580255031585693, Test Accuracy = 1.0


In [None]:
# Type a sentence and its label on the keyboard. Prints the loss and accuracy for that sentence (the accuracy shows whether the model got it right) and the category the model assigned.
sentence_category(model, device)    

In [None]:
# Assign a category to every test sentence and write the results to a text file. When the test labels are passed, the model accuracy is written as well.
write_output(model, test_sentences, test_inputs, attention_mask_test, device, test_labels)