In [1]:
import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from collections import Counter
from scipy.special import softmax
import pandas as pd
import numpy as np

datasets_folder = "../datasets"
file_name = "tupi_binary.csv"

In [2]:
# Construct the full path to the CSV file
csv_file_path = os.path.join(datasets_folder, file_name)

# Check if the file exists before attempting to read it
if os.path.isfile(csv_file_path):
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_file_path)

In [3]:
article_string = "Author: <a href=\"https://huggingface.co/ruanchaves\">Ruan Chaves Rodrigues</a>. Read more about our <a href=\"https://github.com/ruanchaves/eplm\">research on the evaluation of Portuguese language models</a>."

app_title = "Offensive Language Detection (Detecção de Linguagem Ofensiva)"

app_description = """
This app detects offensive language in Portuguese text using multiple models. You can either introduce your own sentences by filling in the "Text" field or click on one of the examples provided below.
(Este aplicativo detecta linguagem ofensiva em texto em português usando vários modelos. Introduza suas próprias frases preenchendo o campo "Text", ou clique em um dos exemplos fornecidos abaixo.)
"""

app_examples = [[text] for text in df['text'].tolist()]

In [4]:
# output_textbox_component_description = """
# This box will display offensive language detection results based on the average score of multiple models.
# (Esta caixa exibirá resultados da detecção de linguagem ofensiva com base na pontuação média de vários modelos.)
# """

# output_json_component_description = { "breakdown": """
# This box presents a detailed breakdown of the evaluation for each model.
# """,
# "detalhamento": """
# (Esta caixa apresenta um detalhamento da avaliação para cada modelo.)
# """ }

# short_score_descriptions = {
#    0: "Not offensive",
#    1: "Offensive"
# }

# score_descriptions = {
#     0: "This text is not offensive.",
#     1: "This text is offensive.",
# }

# score_descriptions_pt = {
#     1: "(Este texto é ofensivo.)",
#     0: "(Este texto não é ofensivo.)",
# }

# model_list = [
#     "ruanchaves/mdeberta-v3-base-hatebr",
#     "ruanchaves/bert-base-portuguese-cased-hatebr",
#     "ruanchaves/bert-large-portuguese-cased-hatebr",
# ]

# user_friendly_name = {
#     "ruanchaves/mdeberta-v3-base-hatebr": "mDeBERTa-v3 (HateBR)",
#     "ruanchaves/bert-base-portuguese-cased-hatebr": "BERTimbau base (HateBR)",
#     "ruanchaves/bert-large-portuguese-cased-hatebr": "BERTimbau large (HateBR)",
# }

# reverse_user_friendly_name = { v:k for k,v in user_friendly_name.items() }

# user_friendly_name_list = list(user_friendly_name.values())

# model_array = []

# for model_name in model_list:
#     row = {}
#     row["name"] = model_name
#     row["tokenizer"] = AutoTokenizer.from_pretrained(model_name)
#     row["model"] = AutoModelForSequenceClassification.from_pretrained(model_name)
#     model_array.append(row)
 
# def most_frequent(array):
#     occurence_count = Counter(array)
#     return occurence_count.most_common(1)[0][0]


# def predict(s1, chosen_model):
#     if not chosen_model:
#         chosen_model = user_friendly_name_list[0]
#     scores = {}
#     full_chosen_model_name = reverse_user_friendly_name[chosen_model]
#     for row in model_array:
#         name = row["name"]
#         if name != full_chosen_model_name:
#             continue
#         else:
#             tokenizer = row["tokenizer"]
#             model = row["model"]
#             model_input = tokenizer(*([s1],), padding=True, return_tensors="pt")
#             with torch.no_grad():
#                 output = model(**model_input)
#                 logits = output[0][0].detach().numpy()
#                 logits = softmax(logits).tolist()
#                 break

#     def get_description(idx):
#         description = score_descriptions[idx]
#         description_pt = score_descriptions_pt[idx]
#         final_description = description + "\n \n" + description_pt
#         return final_description

#     max_pos = logits.index(max(logits))
#     markdown_description = get_description(max_pos)
#     scores = {short_score_descriptions[k]: v for k, v in enumerate(logits)}

#     # Create a Pandas DataFrame for the classification results
#     results_df = pd.DataFrame(scores.items(), columns=['Class', 'Probability'])

#     # Convert the DataFrame to an HTML table
#     results_html = results_df.to_html(index=False)

#     # Compute binary classification metrics
#     true_labels = [1]  # Assuming binary classification, adjust as needed
#     predicted_labels = [int(max_pos)]  # Assuming binary classification, adjust as needed

#     accuracy = accuracy_score(true_labels, predicted_labels)
#     precision = precision_score(true_labels, predicted_labels)
#     recall = recall_score(true_labels, predicted_labels)
#     f1 = f1_score(true_labels, predicted_labels)

#     # Create a Pandas DataFrame for metrics
#     metrics_df = pd.DataFrame({
#         'Accuracy': [accuracy],
#         'Precision': [precision],
#         'Recall': [recall],
#         'F1 Score': [f1]
#     })

#     # Convert the metrics DataFrame to an HTML table
#     metrics_html = metrics_df.to_html(index=False)

#     return results_html, metrics_html, markdown_description

# inputs = [
#     gr.Textbox(label="Text", value=app_examples[0][0]),
#     gr.Dropdown(label="Model", choices=user_friendly_name_list, value=user_friendly_name_list[0])
# ]

# outputs = [
#     gr.HTML(label="Classification Results"),
#     gr.HTML(label="Binary Classification Metrics"),
#     gr.Markdown(),
# ]

# # Launch the Gradio app
# gr.Interface(fn=predict, inputs=inputs, outputs=outputs, title=app_title,
#              description=app_description,
#              examples=app_examples,
#              article=article_string).launch()

In [5]:
model_list = [
    "ruanchaves/mdeberta-v3-base-hatebr",
    "ruanchaves/bert-base-portuguese-cased-hatebr",
    "ruanchaves/bert-large-portuguese-cased-hatebr",
]

user_friendly_name = {
    "ruanchaves/mdeberta-v3-base-hatebr": "mDeBERTa-v3 (HateBR)",
    "ruanchaves/bert-base-portuguese-cased-hatebr": "BERTimbau base (HateBR)",
    "ruanchaves/bert-large-portuguese-cased-hatebr": "BERTimbau large (HateBR)",
}

reverse_user_friendly_name = { v:k for k,v in user_friendly_name.items() }
user_friendly_name_list = list(user_friendly_name.values())
model_array = []

for model_name in model_list:
    row = {}
    row["name"] = model_name
    row["tokenizer"] = AutoTokenizer.from_pretrained(model_name)
    row["model"] = AutoModelForSequenceClassification.from_pretrained(model_name)
    model_array.append(row)
       
def predict(s1, chosen_model):
    tokenizer = model_array[0]["tokenizer"]
    model = model_array[0]["model"]
    model_input = tokenizer(*([s1],), padding=True, return_tensors="pt")
    
    with torch.no_grad():
        output = model(**model_input)
        logits = output[0][0].detach().numpy()
        probabilities = softmax(logits).tolist()

        # Return class prediction (0 or 1) and the probability associated with class 1
        class_prediction = logits.argmax()
        probability_offensive = probabilities[1]
        return class_prediction, probability_offensive

In [6]:
df_teste = df.head()

In [7]:
# Define a lambda function to apply predict to each row in 'text_column'
apply_predict = lambda x: pd.Series(predict(x, "mDeBERTa-v3 (HateBR)"))

# Apply the lambda function to the entire 'text_column' and create new columns for results
df_teste[['prediction','probability']] = df_teste['text'].apply(apply_predict)

# Display the resulting DataFrame
df_teste.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste[['prediction','probability']] = df_teste['text'].apply(apply_predict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste[['prediction','probability']] = df_teste['text'].apply(apply_predict)


Unnamed: 0,source,id,text,researcher,year,aggressive,hate,prediction,probability
0,twitter,1.65848623693028e+18,@user @user @user quanto vc pagava na época da...,oliveira et al,2023,1,1,0.0,0.001359
1,twitter,1.65848623777333e+18,@user os árabes já vão lhes chutar do país ??,oliveira et al,2023,1,1,1.0,0.995865
2,twitter,1.65848960585394e+18,@user @user @user @user @user tem que desenhar...,oliveira et al,2023,1,1,1.0,0.999969
3,twitter,1.65849012716374e+18,@user @user chola mais gado. e se não quiser p...,oliveira et al,2023,1,1,1.0,0.999975
4,twitter,1.65849018793945e+18,michele micheque nao tinha cartao do bolsonaro...,oliveira et al,2023,1,1,1.0,0.999946
