In [1]:
import pandas as pd
import os
import torch
from transformers import BertForSequenceClassification, BertTokenizer


In [2]:
# Report whether PyTorch can see a CUDA device; BertClassifier uses the CPU otherwise.
print(torch.cuda.is_available())

True


In [3]:
class BertClassifier:
    """Inference wrapper around a BERT sequence-classification checkpoint.

    The model and its tokenizer are both loaded from the same ``model_name``.
    The model is moved to CUDA when available (CPU otherwise) and switched to
    evaluation mode.
    """

    def __init__(self, model_name):
        """Load the model and tokenizer and pick the execution device.

        Args:
            model_name: Identifier or path accepted by ``from_pretrained``.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)

        classifier = BertForSequenceClassification.from_pretrained(model_name)
        classifier.to(self.device)
        classifier.eval()  # inference only

        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = classifier

    def predict(self, text, threshold=0.5):
        """Classify a single text.

        Args:
            text: The text to classify. The input is truncated to 512 tokens.
            threshold: Minimum probability required to keep a class-1
                prediction. A class-1 prediction whose probability is at or
                below this value is reported as class 0. Other classes are
                never altered.

        Returns:
            A ``(predicted_class, probabilities)`` tuple: the class index as
            an int and the softmax probabilities as a list of floats.
        """
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        # The return value is intentionally unused, as in the original code;
        # this relies on the tokenizer output's ``to`` moving tensors in place.
        encoded.to(self.device)

        with torch.no_grad():
            logits = self.model(**encoded).logits

        class_probs = torch.softmax(logits, dim=1).squeeze().tolist()
        label = torch.argmax(logits, dim=1).item()

        # Only a class-1 prediction can be demoted to class 0.
        below_threshold = class_probs[label] <= threshold
        return (0 if below_threshold and label == 1 else label), class_probs


### Cargo modelos

In [4]:
# Model identifiers handed to BertClassifier (one per task: veracity and sentiment).
veracity_model_name = 'VerificadoProfesional/SaBERT-Spanish-Fake-News'
sentiment_model_name = 'VerificadoProfesional/SaBERT-Spanish-Sentiment-Analysis'

In [5]:
# Load the veracity model and tokenizer; the constructor prints the device it selected.
veracity_model = BertClassifier(veracity_model_name)

cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/778 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

In [6]:
# Load the sentiment model and tokenizer; the constructor prints the device it selected.
sentiment_model = BertClassifier(sentiment_model_name)

cuda


config.json:   0%|          | 0.00/785 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

### Cargo set de datos

In [None]:
# Load the two veracity datasets; predict_on_dataset reads their "texto" column.
# NOTE(review): the previews further down show an "Unnamed: 0" column, which
# suggests these CSVs were written with their index - consider index_col=0; confirm.
data_veracity = pd.read_csv("./veracity_global.csv")
data_veracity_arg = pd.read_csv("./veracity_arg.csv")

In [14]:
# Load the sentiment dataset; predict_on_dataset reads its "texto" column.
data_sentiment = pd.read_csv("./sentiment.csv")

In [11]:
def predict_on_text(data, model):
    """Run ``model.predict`` on every item of ``data``, in iteration order.

    Args:
        data: Iterable of inputs, each passed on its own to ``model.predict``.
        model: Object whose ``predict(item)`` returns a 2-item
            ``(class, probabilities)`` result.

    Returns:
        A ``(clases, predictions)`` tuple of two parallel lists holding the
        first and second element of each ``predict`` result.
    """
    results = [(clase, prediction) for clase, prediction in map(model.predict, data)]
    clases = [clase for clase, _ in results]
    predictions = [prediction for _, prediction in results]
    return clases, predictions


### Predicciones sobre los datasets

In [12]:
def predict_on_dataset(model, dataset, name, text_column="texto"):
    """Annotate ``dataset`` with a model's predictions for each row's text.

    Args:
        model: Classifier passed through to ``predict_on_text``.
        dataset: DataFrame holding the texts. It is modified in place.
        name: Prefix for the two columns that are written,
            ``f"{name}_label"`` and ``f"{name}_probabilities"``.
        text_column: Name of the column holding the text to classify.
            Defaults to ``"texto"``, the column this notebook's datasets use.

    Returns:
        The same ``dataset`` object, with the two prediction columns added or
        overwritten.
    """
    labels, probabilities = predict_on_text(dataset[text_column], model)
    # Columns are written onto the caller's frame (no copy), so running this
    # again with the same ``name`` overwrites the previous predictions.
    dataset[f"{name}_label"] = labels
    dataset[f"{name}_probabilities"] = probabilities
    return dataset


In [None]:
# Run both models over the same frame; each call adds "<name>_label" and
# "<name>_probabilities" columns.
data_veracity_arg = predict_on_dataset(sentiment_model,data_veracity_arg,"sentiment")
data_veracity_arg = predict_on_dataset(veracity_model,data_veracity_arg,"veracity")

In [None]:
# Preview the first five rows to check the added prediction columns.
data_veracity_arg.head(5)

Unnamed: 0,clasificacion,texto,sentiment_probabilities,sentiment_label,veracity_label,veracity_probabilities
0,1,No se divulgaron resultados oficiales previos ...,"[0.9999589920043945, 4.104002800886519e-05]",0,0,"[0.711641788482666, 0.2883581817150116]"
1,0,Se difundieron en redes sociales supuestos pri...,"[0.00012985366629436612, 0.9998701810836792]",1,0,"[0.6109246611595154, 0.389075368642807]"
2,1,"En el balotaje 2023, las boletas con el numero...","[0.5204170942306519, 0.47958290576934814]",0,1,"[0.30963167548179626, 0.6903683543205261]"
3,0,"En el balotaje 2023, las boletas con el numero...","[0.9994339346885681, 0.0005660828901454806]",0,1,"[0.3529888093471527, 0.6470111608505249]"
4,0,"Miles de usuarios denuncian que, durante la jo...","[0.9999581575393677, 4.186057412880473e-05]",0,1,"[0.3470920920372009, 0.6529079079627991]"


In [None]:
# Save the annotated frame; index=False keeps the row index out of the CSV.
data_veracity_arg.to_csv('veracity_arg_predicted.csv', index=False)

In [None]:
# Add the sentiment model's label and probability columns to data_veracity.
data_veracity= predict_on_dataset(sentiment_model,data_veracity,"sentiment")

In [None]:
# Add the veracity model's label and probability columns to the same frame.
data_veracity = predict_on_dataset(veracity_model,data_veracity,"veracity")

In [None]:
# Preview the first five rows to check the added prediction columns.
data_veracity.head(5)

Unnamed: 0,clasificacion,texto,sentiment_label,sentiment_probabilities,veracity_label,veracity_probabilities
0,1,El presidente abre la puerta a unos comicios e...,0,"[0.9996234178543091, 0.00037651995080523193]",1,"[0.40753498673439026, 0.5924650430679321]"
1,1,"En un escrito, la abogada del Estado Rosa Mari...",0,"[0.9999406337738037, 5.938944377703592e-05]",1,"[0.07654518634080887, 0.9234548211097717]"
2,0,Este lunes y martes la Asamblea de Madrid acog...,1,"[9.01715611689724e-05, 0.9999098777770996]",1,"[0.25899437069892883, 0.7410055994987488]"
3,1,Resulta evidente que la ley ha tenido algunos ...,0,"[0.9999659061431885, 3.413212834857404e-05]",1,"[0.44350090622901917, 0.5564991235733032]"
4,1,"El hispanista, que ya ocupo un puesto simbolic...",1,"[0.00020081509137526155, 0.9997991919517517]",1,"[0.37994152307510376, 0.6200584173202515]"


In [None]:
# Save the annotated frame; index=False keeps the row index out of the CSV.
data_veracity.to_csv('veracity_predicted.csv', index=False)

In [15]:
# Add the sentiment model's label and probability columns to data_sentiment.
data_sentiment = predict_on_dataset(sentiment_model,data_sentiment,"sentiment")

In [16]:
# Save the annotated frame; index=False keeps the row index out of the CSV.
data_sentiment.to_csv('sentiment_predicted.csv', index=False)