In [36]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd
import torch


In [37]:
# Substitua pelo caminho real do seu dataset CSV
df = pd.read_csv("sentiment_tweets3.csv")
df.columns=['Indice','Texto','IndicadorDepressao']  # Esperado: colunas 'text' e 'label'
df = df.drop('Indice', axis=1)


In [38]:
import re
# Remover URLs
def remove_URL(text):
    return re.sub(r'http\S+|www\S' , '',text)

df['Texto'] = df['Texto'].apply(remove_URL)
df.head(10)

Unnamed: 0,Texto,IndicadorDepressao
0,just had a real good moment. i missssssssss hi...,0
1,is reading manga,0
2,@comeagainjen -,0
3,@lapcat Need to send 'em to my accountant tomo...,0
4,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0
5,so sleepy. good times tonight though,0
6,"@SilkCharm re: #nbn as someone already said, d...",0
7,23 or 24ï¿½C possible today. Nice,0
8,nite twitterville workout in the am -ciao,0
9,"@daNanner Night, darlin'! Sweet dreams to you",0


In [39]:
#importa o dicionário de abreviações e gírias comuns de chat/mensagens
from chatWords import chat_words

#Substitui abreviações e gírias comuns de chat/mensagens por suas formas completas.
def chat_word(text):
    for a,b in chat_words.items():
        text=text.replace(a,b)
    return text

df['Texto']=df['Texto'].apply(chat_word)


In [40]:
from nltk.corpus import stopwords
#Remover stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

df['Texto']=df['Texto'].apply(remove_stopwords)
df.head(10)

Unnamed: 0,Texto,IndicadorDepressao
0,"real good moment. missssssssss much,",0
1,reading manga,0
2,@comeagainjen -,0
3,@lapcat Need send 'em accountant tomorrow. Odd...,0
4,ADD MYSPACE!!! myspace.com/LookThunder,0
5,sleepy. good times tonight though,0
6,"@SilkCharm re: #nbn someone already said, fibe...",0
7,23 24ï¿½C possible today. Nice,0
8,nite twitterville workout -ciao,0
9,"@daNanner Night, darlin'! Sweet dreams",0


In [41]:
import emoji
# Remover emojis
def remove_ej(text):
    return emoji.demojize(text)

df['Texto'] = df['Texto'].apply(remove_ej)
df.head(10)

Unnamed: 0,Texto,IndicadorDepressao
0,"real good moment. missssssssss much,",0
1,reading manga,0
2,@comeagainjen -,0
3,@lapcat Need send 'em accountant tomorrow. Odd...,0
4,ADD MYSPACE!!! myspace.com/LookThunder,0
5,sleepy. good times tonight though,0
6,"@SilkCharm re: #nbn someone already said, fibe...",0
7,23 24ï¿½C possible today. Nice,0
8,nite twitterville workout -ciao,0
9,"@daNanner Night, darlin'! Sweet dreams",0


In [42]:
# Remove todos os @ do texto
df["Texto"] = df["Texto"].str.replace(r"[@#/.com]", "", regex=True)
df.head(10)

Unnamed: 0,Texto,IndicadorDepressao
0,"real gd ent issssssssss uh,",0
1,reading anga,0
2,eagainjen -,0
3,"lapat Need send 'e auntant trrw Oddly, even re...",0
4,ADD MYSPACE!!! yspaeLkThunder,0
5,sleepy gd ties tnight thugh,0
6,"SilkChar re: nbn sene already said, fiber he e...",0
7,23 24ï¿½C pssible tday Nie,0
8,nite twitterville wrkut -ia,0
9,"daNanner Night, darlin'! Sweet dreas",0


In [43]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Texto'].tolist(), df['IndicadorDepressao'].tolist(), test_size=0.2, random_state=42
)

In [44]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)


In [45]:
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels
})


In [46]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
import os

os.makedirs("./logs", exist_ok=True)
os.makedirs("./meu_modelo", exist_ok=True) 

In [48]:
from transformers import Trainer
import torch

class CustomTrainer(Trainer):
    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Aplicar os pesos das classes
        if self.class_weights is not None:
            # Mover os pesos para o mesmo device do modelo
            class_weights = torch.tensor(self.class_weights).to(logits.device)
            loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        else:
            loss_fct = torch.nn.CrossEntropyLoss()
            
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

In [49]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)  # pega a classe com maior probabilidade
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [51]:
# Calcular os pesos das classes
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Assumindo que train_labels são seus rótulos de treinamento
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels
).astype(np.float32)

# Configurar os argumentos de treinamento normalmente
training_args = TrainingArguments(
    output_dir="./meu_modelo",
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=600,
    learning_rate=1e-5,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

# Usar o CustomTrainer com os pesos das classes
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    class_weights=class_weights  # Passar os pesos aqui
)

# Iniciar o treinamento
trainer.train()


  0%|          | 0/516 [01:59<?, ?it/s]
                                                
  2%|▏         | 10/516 [00:37<31:12,  3.70s/it]

{'loss': 0.696, 'grad_norm': 0.9285956025123596, 'learning_rate': 1.6666666666666668e-07, 'epoch': 0.04}


                                                
  4%|▍         | 20/516 [01:15<32:01,  3.87s/it]

{'loss': 0.7003, 'grad_norm': 1.1790043115615845, 'learning_rate': 3.3333333333333335e-07, 'epoch': 0.08}


                                                
  6%|▌         | 30/516 [01:54<31:12,  3.85s/it]

{'loss': 0.6915, 'grad_norm': 1.6939252614974976, 'learning_rate': 5.000000000000001e-07, 'epoch': 0.12}


                                                
  8%|▊         | 40/516 [02:33<31:06,  3.92s/it]

{'loss': 0.6879, 'grad_norm': 1.4140948057174683, 'learning_rate': 6.666666666666667e-07, 'epoch': 0.16}


                                                
 10%|▉         | 50/516 [03:13<31:24,  4.04s/it]

{'loss': 0.6904, 'grad_norm': 1.249079704284668, 'learning_rate': 8.333333333333333e-07, 'epoch': 0.19}


                                                
 12%|█▏        | 60/516 [03:52<29:55,  3.94s/it]

{'loss': 0.6818, 'grad_norm': 1.0928354263305664, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.23}


                                                
 14%|█▎        | 70/516 [04:31<29:21,  3.95s/it]

{'loss': 0.68, 'grad_norm': 1.9749432802200317, 'learning_rate': 1.1666666666666668e-06, 'epoch': 0.27}


                                                
 16%|█▌        | 80/516 [05:11<29:00,  3.99s/it]

{'loss': 0.6728, 'grad_norm': 1.2304162979125977, 'learning_rate': 1.3333333333333334e-06, 'epoch': 0.31}


                                                
 17%|█▋        | 90/516 [05:50<27:48,  3.92s/it]

{'loss': 0.6658, 'grad_norm': 1.322576642036438, 'learning_rate': 1.5e-06, 'epoch': 0.35}


                                                 
 19%|█▉        | 100/516 [06:28<26:00,  3.75s/it]

{'loss': 0.6521, 'grad_norm': 1.6024012565612793, 'learning_rate': 1.6666666666666667e-06, 'epoch': 0.39}


                                                 
 21%|██▏       | 110/516 [07:07<26:12,  3.87s/it]

{'loss': 0.6304, 'grad_norm': 1.75685715675354, 'learning_rate': 1.8333333333333333e-06, 'epoch': 0.43}


                                                 
 23%|██▎       | 120/516 [07:45<25:17,  3.83s/it]

{'loss': 0.5827, 'grad_norm': 2.2156336307525635, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.47}


                                                 
 25%|██▌       | 130/516 [08:24<24:42,  3.84s/it]

{'loss': 0.543, 'grad_norm': 2.146796226501465, 'learning_rate': 2.166666666666667e-06, 'epoch': 0.5}


                                                 
 27%|██▋       | 140/516 [09:02<24:09,  3.86s/it]

{'loss': 0.4775, 'grad_norm': 2.6948204040527344, 'learning_rate': 2.3333333333333336e-06, 'epoch': 0.54}


                                                 
 29%|██▉       | 150/516 [09:42<24:25,  4.00s/it]

{'loss': 0.4129, 'grad_norm': 2.5663273334503174, 'learning_rate': 2.5e-06, 'epoch': 0.58}


                                                 
 31%|███       | 160/516 [10:23<23:58,  4.04s/it]

{'loss': 0.3299, 'grad_norm': 1.7578293085098267, 'learning_rate': 2.666666666666667e-06, 'epoch': 0.62}


                                                 
 33%|███▎      | 170/516 [11:02<22:16,  3.86s/it]

{'loss': 0.2589, 'grad_norm': 2.3430092334747314, 'learning_rate': 2.8333333333333335e-06, 'epoch': 0.66}


                                                 
 35%|███▍      | 180/516 [11:40<21:32,  3.85s/it]

{'loss': 0.1857, 'grad_norm': 1.4979413747787476, 'learning_rate': 3e-06, 'epoch': 0.7}


                                                 
 37%|███▋      | 190/516 [12:19<20:57,  3.86s/it]

{'loss': 0.132, 'grad_norm': 0.9906134605407715, 'learning_rate': 3.1666666666666667e-06, 'epoch': 0.74}


                                                 
 39%|███▉      | 200/516 [12:59<21:14,  4.03s/it]

{'loss': 0.0978, 'grad_norm': 0.8272995352745056, 'learning_rate': 3.3333333333333333e-06, 'epoch': 0.78}


                                                 
 41%|████      | 210/516 [13:39<20:00,  3.92s/it]

{'loss': 0.062, 'grad_norm': 0.5514804720878601, 'learning_rate': 3.5e-06, 'epoch': 0.81}


                                                 
 43%|████▎     | 220/516 [14:18<19:13,  3.90s/it]

{'loss': 0.0984, 'grad_norm': 0.43939051032066345, 'learning_rate': 3.6666666666666666e-06, 'epoch': 0.85}


                                                 
 45%|████▍     | 230/516 [14:57<18:36,  3.90s/it]

{'loss': 0.063, 'grad_norm': 0.4405587911605835, 'learning_rate': 3.833333333333334e-06, 'epoch': 0.89}


                                                 
 47%|████▋     | 240/516 [15:36<17:49,  3.87s/it]

{'loss': 0.0975, 'grad_norm': 0.32395482063293457, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.93}


                                                 
 48%|████▊     | 250/516 [16:15<17:18,  3.90s/it]

{'loss': 0.0614, 'grad_norm': 0.2602318227291107, 'learning_rate': 4.166666666666667e-06, 'epoch': 0.97}


 50%|█████     | 258/516 [16:46<15:57,  3.71s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                    

 50%|█████     | 258/516 [18:08<15:57,  3.71s/it]
[A
[A

{'eval_loss': 0.061981040984392166, 'eval_accuracy': 0.9936984973339796, 'eval_precision': 0.997716894977169, 'eval_recall': 0.9732739420935412, 'eval_f1': 0.9853438556933484, 'eval_runtime': 82.0867, 'eval_samples_per_second': 25.132, 'eval_steps_per_second': 0.792, 'epoch': 1.0}


                                                   
 50%|█████     | 260/516 [18:17<1:30:50, 21.29s/it]

{'loss': 0.0859, 'grad_norm': 0.2798793613910675, 'learning_rate': 4.333333333333334e-06, 'epoch': 1.01}


                                                   
 52%|█████▏    | 270/516 [18:57<18:07,  4.42s/it]

{'loss': 0.0522, 'grad_norm': 0.2520969808101654, 'learning_rate': 4.5e-06, 'epoch': 1.05}


                                                 
 54%|█████▍    | 280/516 [19:35<15:14,  3.88s/it]

{'loss': 0.0343, 'grad_norm': 0.20952202379703522, 'learning_rate': 4.666666666666667e-06, 'epoch': 1.09}


                                                 
 56%|█████▌    | 290/516 [20:16<15:23,  4.09s/it]

{'loss': 0.0245, 'grad_norm': 0.19232112169265747, 'learning_rate': 4.833333333333333e-06, 'epoch': 1.12}


                                                 
 58%|█████▊    | 300/516 [20:55<13:55,  3.87s/it]

{'loss': 0.0263, 'grad_norm': 0.26510798931121826, 'learning_rate': 5e-06, 'epoch': 1.16}


                                                 
 60%|██████    | 310/516 [21:34<13:19,  3.88s/it]

{'loss': 0.1052, 'grad_norm': 3.4076545238494873, 'learning_rate': 5.1666666666666675e-06, 'epoch': 1.2}


                                                 
 62%|██████▏   | 320/516 [22:13<12:41,  3.89s/it]

{'loss': 0.0712, 'grad_norm': 0.1579582542181015, 'learning_rate': 5.333333333333334e-06, 'epoch': 1.24}


                                                 
 64%|██████▍   | 330/516 [22:51<11:59,  3.87s/it]

{'loss': 0.0587, 'grad_norm': 0.1565229296684265, 'learning_rate': 5.500000000000001e-06, 'epoch': 1.28}


                                                 
 66%|██████▌   | 340/516 [23:30<11:17,  3.85s/it]

{'loss': 0.0365, 'grad_norm': 0.13762985169887543, 'learning_rate': 5.666666666666667e-06, 'epoch': 1.32}


                                                 
 68%|██████▊   | 350/516 [24:09<10:42,  3.87s/it]

{'loss': 0.0148, 'grad_norm': 0.17830206453800201, 'learning_rate': 5.833333333333334e-06, 'epoch': 1.36}


                                                 
 70%|██████▉   | 360/516 [24:49<10:29,  4.04s/it]

{'loss': 0.023, 'grad_norm': 0.16296036541461945, 'learning_rate': 6e-06, 'epoch': 1.4}


                                                 
 72%|███████▏  | 370/516 [25:30<09:50,  4.04s/it]

{'loss': 0.0598, 'grad_norm': 2.6849329471588135, 'learning_rate': 6.166666666666667e-06, 'epoch': 1.43}


                                                 
 74%|███████▎  | 380/516 [26:11<09:34,  4.23s/it]

{'loss': 0.0332, 'grad_norm': 0.22871212661266327, 'learning_rate': 6.333333333333333e-06, 'epoch': 1.47}


                                                 
 76%|███████▌  | 390/516 [26:51<08:09,  3.88s/it]

{'loss': 0.0097, 'grad_norm': 0.14804527163505554, 'learning_rate': 6.5000000000000004e-06, 'epoch': 1.51}


                                                 
 78%|███████▊  | 400/516 [27:30<07:29,  3.87s/it]

{'loss': 0.0284, 'grad_norm': 0.17293603718280792, 'learning_rate': 6.666666666666667e-06, 'epoch': 1.55}


                                                 
 79%|███████▉  | 410/516 [28:09<06:50,  3.88s/it]

{'loss': 0.0744, 'grad_norm': 11.429027557373047, 'learning_rate': 6.833333333333334e-06, 'epoch': 1.59}


                                                 
 81%|████████▏ | 420/516 [28:48<06:16,  3.92s/it]

{'loss': 0.0487, 'grad_norm': 0.09175246953964233, 'learning_rate': 7e-06, 'epoch': 1.63}


                                                 
 83%|████████▎ | 430/516 [29:27<05:35,  3.90s/it]

{'loss': 0.0072, 'grad_norm': 0.8362261056900024, 'learning_rate': 7.166666666666667e-06, 'epoch': 1.67}


                                                 
 85%|████████▌ | 440/516 [30:06<04:53,  3.86s/it]

{'loss': 0.0057, 'grad_norm': 0.08230803161859512, 'learning_rate': 7.333333333333333e-06, 'epoch': 1.71}


                                                 
 87%|████████▋ | 450/516 [30:46<04:25,  4.03s/it]

{'loss': 0.0054, 'grad_norm': 0.06776374578475952, 'learning_rate': 7.500000000000001e-06, 'epoch': 1.74}


                                                 
 89%|████████▉ | 460/516 [31:26<03:46,  4.05s/it]

{'loss': 0.0047, 'grad_norm': 0.11350256204605103, 'learning_rate': 7.666666666666667e-06, 'epoch': 1.78}


                                                 
 91%|█████████ | 470/516 [32:05<02:58,  3.88s/it]

{'loss': 0.0382, 'grad_norm': 0.0592716820538044, 'learning_rate': 7.833333333333333e-06, 'epoch': 1.82}


                                                 
 93%|█████████▎| 480/516 [32:44<02:17,  3.83s/it]

{'loss': 0.0041, 'grad_norm': 0.11157822608947754, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.86}


                                                 
 95%|█████████▍| 490/516 [33:22<01:39,  3.84s/it]

{'loss': 0.0553, 'grad_norm': 0.06652989983558655, 'learning_rate': 8.166666666666668e-06, 'epoch': 1.9}


                                                 
 97%|█████████▋| 500/516 [34:01<01:02,  3.89s/it]

{'loss': 0.0067, 'grad_norm': 0.07307099550962448, 'learning_rate': 8.333333333333334e-06, 'epoch': 1.94}


                                                 
 99%|█████████▉| 510/516 [34:40<00:23,  3.86s/it]

{'loss': 0.0044, 'grad_norm': 0.07677517831325531, 'learning_rate': 8.5e-06, 'epoch': 1.98}


100%|██████████| 516/516 [35:02<00:00,  3.68s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

[A[A                                         
100%|██████████| 516/516 [36:20<00:00,  3.68s/it]
[A
[A

{'eval_loss': 0.013572542928159237, 'eval_accuracy': 0.9975763451284537, 'eval_precision': 0.9955357142857143, 'eval_recall': 0.9933184855233853, 'eval_f1': 0.9944258639910813, 'eval_runtime': 78.0052, 'eval_samples_per_second': 26.447, 'eval_steps_per_second': 0.833, 'epoch': 2.0}


                                                 
100%|██████████| 516/516 [36:23<00:00,  4.23s/it]

{'train_runtime': 2183.0522, 'train_samples_per_second': 7.559, 'train_steps_per_second': 0.236, 'train_loss': 0.22816169376362316, 'epoch': 2.0}





TrainOutput(global_step=516, training_loss=0.22816169376362316, metrics={'train_runtime': 2183.0522, 'train_samples_per_second': 7.559, 'train_steps_per_second': 0.236, 'total_flos': 546494253155328.0, 'train_loss': 0.22816169376362316, 'epoch': 2.0})

In [52]:
trainer.evaluate()


100%|██████████| 65/65 [01:24<00:00,  1.30s/it]


{'eval_loss': 0.013572542928159237,
 'eval_accuracy': 0.9975763451284537,
 'eval_precision': 0.9955357142857143,
 'eval_recall': 0.9933184855233853,
 'eval_f1': 0.9944258639910813,
 'eval_runtime': 85.8929,
 'eval_samples_per_second': 24.018,
 'eval_steps_per_second': 0.757,
 'epoch': 2.0}

In [53]:
model.save_pretrained("./meu_modelo_distilbert")
tokenizer.save_pretrained("./meu_modelo_distilbert")


('./meu_modelo_distilbert\\tokenizer_config.json',
 './meu_modelo_distilbert\\special_tokens_map.json',
 './meu_modelo_distilbert\\vocab.txt',
 './meu_modelo_distilbert\\added_tokens.json',
 './meu_modelo_distilbert\\tokenizer.json')

In [56]:
from transformers import pipeline

# pipeline de classificação com modelo salvo
classificador = pipeline("text-classification", model="./meu_modelo_distilbert", tokenizer="./meu_modelo_distilbert")

# Exemplo de uso:
classificador("Major signs of depression. Seriously, I hope you are seeking help.")

[{'label': 'LABEL_1', 'score': 0.9482271075248718}]