In [84]:
!pip install transformers datasets scikit-learn torch




In [85]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd
import torch


In [86]:
# Substitua pelo caminho real do seu dataset CSV
df = pd.read_csv("sentiment_tweets3.csv")
df.columns=['Indice','Texto','IndicadorDepressao']  # Esperado: colunas 'text' e 'label'
df = df.drop('Indice', axis=1)


In [87]:
# Exemplo de como balancear
from sklearn.utils import resample

# Separar as classes
df_maioria = df[df['IndicadorDepressao'] == 0]
df_minoria = df[df['IndicadorDepressao'] == 1]

# Fazer upsampling da classe minoritária
df_minoria_upsampled = resample(df_minoria, 
                               replace=True,
                               n_samples=len(df_maioria),
                               random_state=42)

# Combinar os datasets
df = pd.concat([df_maioria, df_minoria_upsampled])

In [88]:
import re
# Remover URLs
def remove_URL(text):
    return re.sub(r'http\S+|www\S' , '',text)

df['Texto'] = df['Texto'].apply(remove_URL)
df.head(10)

Unnamed: 0,Texto,IndicadorDepressao
0,just had a real good moment. i missssssssss hi...,0
1,is reading manga,0
2,@comeagainjen -,0
3,@lapcat Need to send 'em to my accountant tomo...,0
4,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0
5,so sleepy. good times tonight though,0
6,"@SilkCharm re: #nbn as someone already said, d...",0
7,23 or 24ï¿½C possible today. Nice,0
8,nite twitterville workout in the am -ciao,0
9,"@daNanner Night, darlin'! Sweet dreams to you",0


In [89]:
#importa o dicionário de abreviações e gírias comuns de chat/mensagens
from chatWords import chat_words

#Substitui abreviações e gírias comuns de chat/mensagens por suas formas completas.
def chat_word(text):
    for a,b in chat_words.items():
        text=text.replace(a,b)
    return text

df['Texto']=df['Texto'].apply(chat_word)


In [90]:
from nltk.corpus import stopwords
#Remover stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

df['Texto']=df['Texto'].apply(remove_stopwords)
df.head(10)

Unnamed: 0,Texto,IndicadorDepressao
0,"real good moment. missssssssss much,",0
1,reading manga,0
2,@comeagainjen -,0
3,@lapcat Need send 'em accountant tomorrow. Odd...,0
4,ADD MYSPACE!!! myspace.com/LookThunder,0
5,sleepy. good times tonight though,0
6,"@SilkCharm re: #nbn someone already said, fibe...",0
7,23 24ï¿½C possible today. Nice,0
8,nite twitterville workout -ciao,0
9,"@daNanner Night, darlin'! Sweet dreams",0


In [91]:
import emoji
# Remover emojis
def remove_ej(text):
    return emoji.demojize(text)

df['Texto'] = df['Texto'].apply(remove_ej)
df.head(10)

Unnamed: 0,Texto,IndicadorDepressao
0,"real good moment. missssssssss much,",0
1,reading manga,0
2,@comeagainjen -,0
3,@lapcat Need send 'em accountant tomorrow. Odd...,0
4,ADD MYSPACE!!! myspace.com/LookThunder,0
5,sleepy. good times tonight though,0
6,"@SilkCharm re: #nbn someone already said, fibe...",0
7,23 24ï¿½C possible today. Nice,0
8,nite twitterville workout -ciao,0
9,"@daNanner Night, darlin'! Sweet dreams",0


In [92]:
# Remove todos os @ do texto
df["Texto"] = df["Texto"].str.replace(r"[@#/.com]", "", regex=True)
df.head(10)

Unnamed: 0,Texto,IndicadorDepressao
0,"real gd ent issssssssss uh,",0
1,reading anga,0
2,eagainjen -,0
3,"lapat Need send 'e auntant trrw Oddly, even re...",0
4,ADD MYSPACE!!! yspaeLkThunder,0
5,sleepy gd ties tnight thugh,0
6,"SilkChar re: nbn sene already said, fiber he e...",0
7,23 24ï¿½C pssible tday Nie,0
8,nite twitterville wrkut -ia,0
9,"daNanner Night, darlin'! Sweet dreas",0


In [93]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Texto'].tolist(), df['IndicadorDepressao'].tolist(), test_size=0.2, random_state=42
)

In [94]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)


In [95]:
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels
})


In [96]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [97]:
import os

os.makedirs("./logs", exist_ok=True)
os.makedirs("./meu_modelo", exist_ok=True) 

In [98]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./meu_modelo",         # saída do modelo
    logging_dir="./logs",              # pasta de logs
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=600,
    learning_rate=1e-5,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    class_weights=[1.0, 5.0],  # <-- IMPORTANTE: evita que tente usar TensorBoard se ele não estiver corretamente configurado
)


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'class_weights'

In [70]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)  # pega a classe com maior probabilidade
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
trainer.train()


In [None]:
trainer.evaluate()


In [None]:
model.save_pretrained("./meu_modelo_distilbert")
tokenizer.save_pretrained("./meu_modelo_distilbert")


In [None]:
from transformers import pipeline

# pipeline de classificação com modelo salvo
classificador = pipeline("text-classification", model="./meu_modelo_distilbert", tokenizer="./meu_modelo_distilbert")

# Exemplo de uso:
classificador("I want die")
