In [None]:
!pip install nltk
!pip install emoji
!pip install unidecode
!pip install xgboost
!pip install optuna



# Importando o Dataset de Treino

In [None]:
import pandas as pd
import numpy as np
url = 'https://raw.githubusercontent.com/VictorHNascimento/ML-Olympiad-Toxic-Language-PTBR-Detection/main/toxic_language_train.csv'
!wget {url} -O dataset.csv
df = pd.read_csv('dataset.csv')
df.head()

--2024-05-30 21:52:59--  https://raw.githubusercontent.com/VictorHNascimento/ML-Olympiad-Toxic-Language-PTBR-Detection/main/toxic_language_train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1573871 (1.5M) [text/plain]
Saving to: ‘dataset.csv’


2024-05-30 21:53:00 (7.41 MB/s) - ‘dataset.csv’ saved [1573871/1573871]



Unnamed: 0,text,label
0,"rt @user olha quem chegouuuuu, nossos queridin...",0
1,veio umas teorias muito loucas na minha cabeça...,1
2,@user @user 😂😂😂😂mais nao tinha falado ontem qu...,0
3,rt @user quer ser filha da puta logo comigo qu...,1
4,vai besta 😂😂😂😂 casquei com a ultima foto,1


In [None]:
df.shape[0]

16800

# Funções de Pré-Processamento

In [None]:
import re
import emoji
from unidecode import unidecode

def limpaTexto(text):
    """Clean a raw tweet before feature extraction.

    Steps, in order: drop emojis, drop ``@mentions``, drop http(s) links,
    drop the leftover tokens ``rt`` / ``user`` / ``https`` (case-insensitive),
    transliterate accented characters to ASCII, and finally normalise
    whitespace.

    Args:
        text: The tweet as a ``str``.

    Returns:
        The cleaned text with single spaces between tokens and no leading or
        trailing whitespace.
    """
    # Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # Remove user mentions
    text = re.sub(r'@\w+', ' ', text)

    # Remove links
    text = re.sub(r'https?://\S+', ' ', text)

    # Remove platform-specific filler tokens
    text = re.sub(r'\b(rt|user|https)\b', ' ', text, flags=re.IGNORECASE)

    # Strip accents / transliterate to ASCII
    text = unidecode(text)

    # Collapse whitespace LAST: every substitution above inserts a space, so
    # doing this earlier would leave runs of spaces inside the result.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

def defineListas(df):
  """Build the vocabularies of toxic and non-toxic tweets.

  Args:
    df: DataFrame with a ``text`` column (strings) and a ``label`` column
      where 1 marks a toxic tweet and 0 a non-toxic one.

  Returns:
    A tuple ``(toxic, nontoxic)`` of sets holding every whitespace-separated
    token that appears in at least one tweet of the respective class.

  NOTE(review): the vocabularies are derived from the labels of whatever
  frame is passed in. Features computed from them on the same rows encode
  the label, so build them from the training split only.
  """
  def _vocabulario(rotulo):
    # Null texts are skipped so a missing value cannot break the join.
    textos = df.loc[df['label'] == rotulo, 'text'].dropna().astype(str)
    # Join with a space: joining with '' would glue the last word of one
    # tweet to the first word of the next and create tokens that never
    # occurred in the data.
    return set(' '.join(textos).split())

  return _vocabulario(1), _vocabulario(0)

def contaToxicos(text, toxic_words):
  """Return how many whitespace-separated tokens of ``text`` are in ``toxic_words``.

  Repeated tokens are counted every time they occur.
  """
  return sum(1 for palavra in text.split() if palavra in toxic_words)

def contaNaotoxicos(text, nontoxic_words):
  """Return how many whitespace-separated tokens of ``text`` are in ``nontoxic_words``.

  Repeated tokens are counted every time they occur.
  """
  return sum(1 for palavra in text.split() if palavra in nontoxic_words)

def contaNeutros(text, toxic_words, nontoxic_words):
  """Count the tokens of ``text`` that belong to BOTH vocabularies.

  A token is "neutral" when it is found in ``nontoxic_words`` and in
  ``toxic_words``; each occurrence in the text counts once.
  """
  return sum(
    1
    for palavra in text.split()
    if palavra in nontoxic_words and palavra in toxic_words
  )

def contemToxicos(text, toxic_words):
  """Return 1 if any token of ``text`` is in ``toxic_words``, otherwise 0."""
  achou = any(palavra in toxic_words for palavra in text.split())
  return 1 if achou else 0

def contemNaotoxicos(text, nontoxic_words):
  """Return 1 if any token of ``text`` is in ``nontoxic_words``, otherwise 0."""
  achou = any(palavra in nontoxic_words for palavra in text.split())
  return 1 if achou else 0

def contemNeutros(text, toxic_words, nontoxic_words):
  """Return 1 if any token of ``text`` is in both vocabularies, otherwise 0."""
  achou = any(
    palavra in nontoxic_words and palavra in toxic_words
    for palavra in text.split()
  )
  return 1 if achou else 0

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

#Obter as stopwords em PT-BR
stop_words = set(stopwords.words('portuguese'))

def removerStopwords(text):
  """Drop every token of ``text`` whose lower-cased form is in ``stop_words``.

  Tokens are split on whitespace and the survivors are re-joined with single
  spaces; the original casing of kept tokens is preserved.
  """
  mantidas = []
  for palavra in text.split():
    if palavra.lower() in stop_words:
      continue
    mantidas.append(palavra)
  return ' '.join(mantidas)

# NOTE(review): this call lives in the helper-definition cell, so stop words
# are stripped from the raw text as soon as the helpers are defined, i.e.
# before limpaTexto is applied; the same call is repeated in the
# pre-processing section. Confirm the early pass is intentional.
df['text'] = df['text'].apply(removerStopwords)

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def lemmatizaTexto(text):
  """Tokenize ``text`` and lemmatize every token, returning a single string.

  Tokenization uses NLTK's ``word_tokenize`` with ``language='portuguese'``;
  each token is then passed through ``WordNetLemmatizer.lemmatize`` and the
  results are joined with single spaces.

  NOTE(review): WordNetLemmatizer is backed by the WordNet corpus, which is
  presumably English-only, so its effect on Portuguese tokens should be
  verified.
  """
  lematizador = WordNetLemmatizer()

  # Split the text into word / punctuation tokens
  palavras = word_tokenize(text, language='portuguese')

  # Lemmatize token by token and stitch the text back together
  return ' '.join(map(lematizador.lemmatize, palavras))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Pré-Processando os Dados de Treino

In [None]:
# Aplicar a função de limpeza do texto
df['text'] = df['text'].apply(limpaTexto)

# Visualizar os tweets processados
df[['text']].head()

Unnamed: 0,text
0,"olha chegouuuuu, queridinhos!!! vem direcao fe..."
1,"veio umas teorias loucas cabeca agora, pqp to ..."
2,mais nao falado ontem ia patrocinado nada pude...
3,quer filha puta logo comigo 50x pior kkkkkkkkk...
4,vai besta casquei ultima foto


In [None]:
#Removendo Stop Words
df['text'] = df['text'].apply(removerStopwords)
df['text'].head()

0    olha chegouuuuu, queridinhos!!! vem direcao fe...
1    veio umas teorias loucas cabeca agora, pqp to ...
2    nao falado ontem ia patrocinado nada pudesse "...
3    quer filha puta logo comigo 50x pior kkkkkkkkk...
4                        vai besta casquei ultima foto
Name: text, dtype: object

In [None]:
# Lematizando o Texto
df['text'] = df['text'].apply(lemmatizaTexto)

In [None]:
# Build the toxic / non-toxic word collections from the pre-processed texts.
# The two empty-dict placeholders are overwritten right away by the pair of
# sets that defineListas returns.
pal_toxicas = {}
pal_naotoxicas = {}

pal_toxicas, pal_naotoxicas = defineListas(df)

In [None]:
# Create the hand-crafted features from the pre-processed text.
# NOTE(review): pal_toxicas / pal_naotoxicas were built from the labels of
# this same DataFrame, before any train/test split is made, so the
# vocabulary-based columns below carry label information for every row
# (including rows later used for evaluation). Consider deriving the
# vocabularies from the training split only.
# Number of tokens found in the toxic / non-toxic / both vocabularies
df['cont_toxic_words'] = df['text'].apply(lambda x: contaToxicos(x, pal_toxicas))
df['cont_nontoxic_words'] = df['text'].apply(lambda x: contaNaotoxicos(x, pal_naotoxicas))
df['cont_neutros'] = df['text'].apply(lambda x: contaNeutros(x, pal_toxicas, pal_naotoxicas))
# 0/1 flags: does the text contain at least one such token?
df['toxic_words'] = df['text'].apply(lambda x: contemToxicos(x, pal_toxicas))
df['non_toxic_words'] = df['text'].apply(lambda x: contemNaotoxicos(x, pal_naotoxicas))
df['neutral_words'] = df['text'].apply(lambda x: contemNeutros(x, pal_toxicas, pal_naotoxicas))
# Length of the text in characters and in whitespace-separated tokens
df['char_cont'] = df['text'].apply(lambda x: len(x))
df['words_cont'] = df['text'].apply(lambda x: len(x.split()))
df.head()

Unnamed: 0,text,label,cont_toxic_words,cont_nontoxic_words,cont_neutros,toxic_words,non_toxic_words,neutral_words,char_cont,words_cont
0,"olha chegouuuuu , queridinhos ! ! ! vem direca...",0,11,16,11,1,1,1,79,17
1,"veio uma teorias loucas cabeca agora , pqp to ...",1,9,8,7,1,1,1,55,10
2,nao falado ontem ia patrocinado nada pudesse `...,0,10,13,10,1,1,1,87,13
3,quer filha puta logo comigo 50x pior kkkkkkkkk...,1,9,8,8,1,1,1,66,10
4,vai besta casquei ultima foto,1,5,5,5,1,1,1,29,5


In [None]:
df.to_csv('toxic_language_tratado.csv', index= False)

# Funções de Transformação/Modelagem

In [None]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

def TFIDF(df_tratado):
  """Turn the processed frame into a model-ready matrix and label vector.

  Columns are selected by position: column 0 is the text, column 1 the
  label, and every column from index 2 onwards is treated as a numeric
  feature. Rows containing any null value are dropped first.

  Side effect: the fitted vectorizer is written to ``vectorizer.pkl`` in the
  working directory so it can be reused for later predictions.

  Args:
    df_tratado: DataFrame laid out as described above.

  Returns:
    ``(x, y)`` where ``x`` is the sparse horizontal stack of the TF-IDF
    matrix and the feature columns, and ``y`` is the label column.
  """
  dados = df_tratado.dropna()  # discard rows with missing values

  # Positional split: text | label | hand-crafted features
  textos = dados.iloc[:, 0]
  rotulos = dados.iloc[:, 1]
  extras = dados.iloc[:, 2:]

  # Fit the TF-IDF vocabulary on the text and vectorize it
  vectorizer = TfidfVectorizer()
  matriz_texto = vectorizer.fit_transform(textos)

  # Append the feature columns to the right of the TF-IDF matrix
  x = hstack([matriz_texto, extras])

  # Persist the fitted vectorizer for future predictions
  joblib.dump(vectorizer, 'vectorizer.pkl')

  return x, rotulos

import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def objetivo(trial, x, y):
    """Optuna objective: validation accuracy of an XGBoost classifier.

    Samples ``learning_rate``, ``max_depth``, ``min_child_weight`` and
    ``gamma`` from the trial, fits the classifier on 80% of ``(x, y)`` and
    returns the accuracy on the remaining 20%. The split is a single
    hold-out with ``random_state=42`` (not k-fold cross-validation), so
    every trial is scored on the same validation rows.

    NOTE(review): ``eval_metric='mlogloss'`` is XGBoost's multiclass
    log-loss, while the labels displayed in this notebook are 0/1; confirm
    the metric choice.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.
        x: Feature matrix.
        y: Labels aligned with ``x``.

    Returns:
        Accuracy on the hold-out split, as a float.
    """
    # Sample the hyper-parameters for this trial
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.10)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 3)
    gamma = trial.suggest_float('gamma', 0, 0.2)

    classifier = XGBClassifier(
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        gamma=gamma,
        use_label_encoder=False,
        eval_metric='mlogloss',
    )

    # Fixed hold-out split, fit, then score on the held-out part
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
    classifier.fit(x_train, y_train)

    return accuracy_score(y_val, classifier.predict(x_val))

def defineHiperparametros(x, y):
    """Run a 50-trial Optuna search maximising ``objetivo`` and report the winner.

    Prints the best hyper-parameters and the best accuracy found.

    Args:
        x: Feature matrix forwarded to ``objetivo``.
        y: Labels forwarded to ``objetivo``.

    Returns:
        The dictionary of best hyper-parameters from the study.
    """
    def _avalia(trial):
        # Bind the data so Optuna only has to supply the trial
        return objetivo(trial, x, y)

    study = optuna.create_study(direction='maximize')
    study.optimize(_avalia, n_trials=50)

    melhores = study.best_params
    print("Melhores hiperparâmetros:")
    print(melhores)
    print("Melhor pontuação de acurácia:", study.best_value)

    return melhores

from xgboost import XGBClassifier
def treinaModelo(x_train, y_train, params):
  """Fit an ``XGBClassifier`` built from ``params`` and return the fitted model.

  Args:
    x_train: Training feature matrix.
    y_train: Training labels.
    params: Keyword arguments forwarded to the ``XGBClassifier`` constructor.

  Returns:
    The value returned by ``XGBClassifier.fit``.
  """
  return XGBClassifier(**params).fit(x_train, y_train)

# Modelagem

In [None]:
#Vetorizando o texto
df_train = pd.read_csv('toxic_language_tratado.csv')
x,y = TFIDF(df_train)

In [None]:
x.shape

(16795, 22578)

In [None]:
from sklearn.model_selection import train_test_split
# Dividindo o conjunto de dados em treino e teste
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# params = defineHiperparametros(x,y) - Escolhe os melhores Parametros
params = {'learning_rate': 0.08111785704552169, 'max_depth': 10, 'min_child_weight': 2, 'gamma': 0.19898942586155596} #Melhores Parametros Avaliados

In [None]:
model = treinaModelo(x_train, y_train, params)

# Avaliando o Modelo

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
#Predizendo os resultados
pred_train = model.predict(x_train)
pred_test = model.predict(x_test)

In [None]:
#Vendo o Classification Report
print("Classification Report do Treino:\n", classification_report(y_train, pred_train))
print()
print("Classification Report do Teste:\n", classification_report(y_test, pred_test))

Classification Report do Treino:
               precision    recall  f1-score   support

           0       0.87      0.91      0.89      7525
           1       0.88      0.83      0.86      5911

    accuracy                           0.88     13436
   macro avg       0.88      0.87      0.87     13436
weighted avg       0.88      0.88      0.88     13436


Classification Report do Teste:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87      1895
           1       0.84      0.79      0.82      1464

    accuracy                           0.85      3359
   macro avg       0.85      0.84      0.84      3359
weighted avg       0.85      0.85      0.85      3359



In [None]:
# Overfitting check: compare accuracy on the training split with accuracy on
# the test split, reusing the predictions computed earlier.
accuracy_train = accuracy_score(y_train, pred_train)
print(f"Acurácia (Treinamento) com as previsões já realizadas: {accuracy_train*100:.2f}")

accuracy_test = accuracy_score(y_test, pred_test)
print(f'Acurácia (Teste) com as previsões já realizadas: {accuracy_test*100:.2f}')

# Absolute gap between the two accuracies, scaled by 100, i.e. expressed in
# percentage points (not a relative percentage change).
percentage_difference= (accuracy_train-accuracy_test) * 100
print(f'Diferença Percentual: {percentage_difference:.2f}%')

Acurácia (Treinamento) com as previsões já realizadas: 87.70
Acurácia (Teste) com as previsões já realizadas: 84.61
Diferença Percentual: 3.09%


In [None]:
#Salvando o modelo para previsões futuras
joblib.dump(model, 'identifica_toxicos.pkl')

['identifica_toxicos.pkl']