In [131]:
import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from collections import Counter
from scipy.special import softmax
import pandas as pd
import numpy as np

# Dataset location: folder (a relative path) and the CSV file name inside it.
datasets_folder, file_name = "../datasets", "tupi_binary.csv"

In [132]:
# Build the dataset path from the configured folder and file name.
csv_file_path = os.path.join(datasets_folder, file_name)

# Fail fast when the dataset is missing. Silently skipping the load would leave
# `df` undefined and surface later as a confusing NameError in another cell.
if not os.path.isfile(csv_file_path):
    raise FileNotFoundError(
        f"Dataset not found: {csv_file_path!r} "
        f"(resolved to {os.path.abspath(csv_file_path)!r})"
    )

# Load the CSV file into a DataFrame.
df = pd.read_csv(csv_file_path)

In [133]:

import re

# Name of the DataFrame column that contains the text.
text_column = 'text'

# Lower-cased tokens that are dropped from every text during preprocessing.
exclude_terms = {
    # Twitter / URL artefacts.
    'https', 'user', 'link', 'rt',
    # NOTE: the entries on the next line contain non-word characters (or a
    # space), so they can never equal a token produced by the \w+ tokenizer in
    # preprocess_text. Punctuation is already discarded by the tokenizer itself;
    # they are kept only so the set stays identical to the original list.
    '@', '#', '??', '!!', '_:', '.:', '!:', '? ?', '! !', '_ :', '! :', '? :', '[]', 'por que',
    # Informal abbreviations and fillers.
    'n', 'q', 'p', 'd', 'pq', 'vc', 'vcs', 'ta', 'tá', 'to', 'tô', 'pra', 'pro',
    'eh', 'ne', 'né', 'é', 'la', 'lá', 'ai', 'aí', 'nao',
    # Common function words and verbs.
    'desse', 'ter', 'todos', 'nunca', 'fala', 'ver', 'coisa', 'todo', 'quer', 'agora', 'faz',
    'fazer', 'ainda', 'dia', 'pode', 'tudo', 'nada', 'vai', 'porque',
    'sobre', 'fez', 'pois', 'onde', 'aqui', 'dar', 'ficar', 'fica',
}

# Compiled once: matches maximal runs of word characters (Unicode-aware for str).
_WORD_PATTERN = re.compile(r'\b\w+\b')


def preprocess_text(text):
    """Tokenize ``text`` into words and drop every word listed in ``exclude_terms``.

    Punctuation and any other non-word characters are discarded by the
    tokenization step; the surviving words keep their original casing and are
    joined with single spaces.

    Args:
        text: The text to clean. ``None`` and float ``NaN`` (what pandas uses
            for empty CSV cells) are treated as empty text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The filtered text, or ``''`` when nothing is left.
    """
    # Missing values would make re.findall raise TypeError; map them to ''.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Membership is tested on the lower-cased token, so filtering is
    # case-insensitive while the output preserves the original casing.
    kept_words = [
        token for token in _WORD_PATTERN.findall(text)
        if token.lower() not in exclude_terms
    ]
    return ' '.join(kept_words)

# Clean every entry of the 'text' column element-wise and store the result
# back in the same column.
df['text'] = df['text'].map(preprocess_text)

In [134]:
# Preview the first five rows of the preprocessed frame.
df.head(n=5)

Unnamed: 0,source,id,text,researcher,year,aggressive,hate
0,twitter,1.65848623693028e+18,quanto pagava na época da como diz canetada el...,oliveira et al,2023,1,1
1,twitter,1.65848623777333e+18,os árabes já vão lhes chutar do país,oliveira et al,2023,1,1
2,twitter,1.65848960585394e+18,tem que desenhar e explicar o desenho retardad...,oliveira et al,2023,1,1
3,twitter,1.65849012716374e+18,chola mais gado e se não quiser pagar mais bar...,oliveira et al,2023,1,1
4,twitter,1.65849018793945e+18,michele micheque tinha cartao do bolsonaro bol...,oliveira et al,2023,1,1


In [135]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. The split is stratified on the
# 'hate' column so both partitions keep roughly the same label proportions,
# and the fixed random_state makes the partition reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['hate'],
)

# Report how many rows ended up in each frame.
for label, frame in (("Original", df), ("Train", train_df), ("Test", test_df)):
    print(f"{label} DataFrame size: {len(frame)}")

Original DataFrame size: 43668
Train DataFrame size: 34934
Test DataFrame size: 8734


In [136]:
# HTML footer crediting the author of the models and linking to the research.
article_string = (
    'Author: <a href="https://huggingface.co/ruanchaves">Ruan Chaves Rodrigues</a>. '
    'Read more about our <a href="https://github.com/ruanchaves/eplm">'
    'research on the evaluation of Portuguese language models</a>.'
)

# Bilingual (English / Portuguese) title of the app.
app_title = "Offensive Language Detection (Detecção de Linguagem Ofensiva)"

# Bilingual description: a leading newline, one English line, one Portuguese line.
app_description = (
    '\n'
    'This app detects offensive language in Portuguese text using multiple models. '
    'You can either introduce your own sentences by filling in the "Text" field '
    'or click on one of the examples provided below.\n'
    '(Este aplicativo detecta linguagem ofensiva em texto em português usando vários modelos. '
    'Introduza suas próprias frases preenchendo o campo "Text", '
    'ou clique em um dos exemplos fornecidos abaixo.)\n'
)

# One single-element list per test-set text (a list of one-item rows).
app_examples = []
for example_text in test_df['text'].tolist():
    app_examples.append([example_text])

In [137]:
# Identifiers of the checkpoints that are loaded below.
model_list = [
    "ruanchaves/mdeberta-v3-base-hatebr",
    "ruanchaves/bert-base-portuguese-cased-hatebr",
    "ruanchaves/bert-large-portuguese-cased-hatebr",
]

# Checkpoint identifier -> human-readable label.
user_friendly_name = {
    "ruanchaves/mdeberta-v3-base-hatebr": "mDeBERTa-v3 (HateBR)",
    "ruanchaves/bert-base-portuguese-cased-hatebr": "BERTimbau base (HateBR)",
    "ruanchaves/bert-large-portuguese-cased-hatebr": "BERTimbau large (HateBR)",
}

# Human-readable label -> checkpoint identifier, plus the labels as a list.
reverse_user_friendly_name = dict(
    zip(user_friendly_name.values(), user_friendly_name.keys())
)
user_friendly_name_list = [*user_friendly_name.values()]

# Load every tokenizer/model pair up front; each entry is a dict with the keys
# "name", "tokenizer" and "model", which is the layout predict() looks up.
model_array = []
for model_name in model_list:
    model_array.append({
        "name": model_name,
        "tokenizer": AutoTokenizer.from_pretrained(model_name),
        "model": AutoModelForSequenceClassification.from_pretrained(model_name),
    })
       
def predict(s1, chosen_model):
    """Classify a single text with one of the preloaded models.

    Args:
        s1: The text to classify.
        chosen_model: Model identifier; it is matched against the "name" field
            of the entries in ``model_array``.

    Returns:
        tuple: ``(class_prediction, probability_offensive)`` where
        ``class_prediction`` is the index of the largest logit as a plain
        ``int`` and ``probability_offensive`` is the softmax probability of
        class index 1 as a ``float``.
        NOTE(review): treating index 1 as the "offensive" class assumes a
        two-label head with that label order -- confirm against the model config.

    Raises:
        ValueError: If ``chosen_model`` is not present in ``model_array``.
    """
    # Find the chosen model in the model_array.
    selected_model = next(
        (entry for entry in model_array if entry["name"] == chosen_model), None
    )
    if selected_model is None:
        raise ValueError(f"Model '{chosen_model}' not found in model_array.")

    tokenizer = selected_model["tokenizer"]
    model = selected_model["model"]

    # truncation=True caps the input at the tokenizer's maximum length; without
    # it an over-long text produces more positions than the model supports and
    # the forward pass fails.
    model_input = tokenizer(
        [s1], padding=True, truncation=True, return_tensors="pt"
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(**model_input)

    # output[0] holds the logits for the batch; [0] selects the only example.
    logits = output[0][0].detach().cpu().numpy()
    probabilities = softmax(logits).tolist()

    # Return plain Python scalars rather than numpy types.
    class_prediction = int(logits.argmax())
    probability_offensive = probabilities[1]
    return class_prediction, probability_offensive

In [138]:
#df_sample = df.head(50)

In [139]:
def apply_predict(x):
    """Run predict() on one text with the BERTimbau large (HateBR) model.

    Args:
        x: The text to classify.

    Returns:
        pd.Series: Two positional entries, the predicted class followed by the
        probability returned by predict().
    """
    return pd.Series(predict(x, "ruanchaves/bert-large-portuguese-cased-hatebr"))


# test_df comes out of train_test_split as a slice of df; work on an explicit
# copy so adding columns is unambiguous and cannot raise SettingWithCopyWarning.
test_df = test_df.copy()

# One row per text; column 0 is the predicted class, column 1 the probability.
prediction_frame = test_df['text'].apply(apply_predict)

# Packing both values into one Series upcasts the class label to float, so
# cast it back to int to match the integer label columns.
test_df['prediction'] = prediction_frame[0].astype(int)
test_df['probability'] = prediction_frame[1]

# Display the resulting DataFrame
test_df.head()

Unnamed: 0,source,id,text,researcher,year,aggressive,hate,prediction,probability
41795,twitter,-,Geraldo Alckmin vs Jair Bolsonaro A esquerda q...,fortuna et al,2019,0,0,1.0,0.999987
19473,twitter,-,vacila não buceta mas eu faço o mesmo só sempr...,leite et al,2020,1,0,1.0,0.999974
25328,twitter,-,um jovem passando vergonha na internet,leite et al,2020,0,0,1.0,0.999992
21857,twitter,-,eu com o chip da claro 馃憥馃従,leite et al,2020,0,0,0.0,0.031472
16749,instagram,-,o lixo veio a tona,vargas et al,2021,0,0,0.0,1.2e-05


In [140]:
# Write the test split, including the prediction columns, to a CSV file
# (the DataFrame index is not written).
test_df.to_csv(path_or_buf='BERTimbau_large_str_20.csv', index=False)