<a href="https://colab.research.google.com/github/ToniMarGar/Project-Transfer-Learning/blob/main/Mini_project_Transfer_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importación del Dataset

In [35]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd

In [36]:
df1= pd.read_csv('/content/cleaned_dataset.csv')
df2= pd.read_csv('/content/cleaned_dataset2.csv')
df_concatenado = pd.concat([df1, df2], ignore_index=True)
print(df_concatenado)

                                             clean_tweet  Ofensivo  Vulgar
0      pues yo jalo trabaje en una ong y ahora en la ...         0       0
1      senores medios de prensa  ya dejen de entrevis...         1       0
2      jajajaja bueno a enrique le gusta provocar a l...         0       0
3      lo que mas me da gusto del tuit es la cantidad...         0       0
4      hace 111 anos nacio en el df uno de los mayore...         0       0
...                                                  ...       ...     ...
10771                         callate el hocicoooooooooo         0       0
10772                        callate ya si fue una calle         0       0
10773  wey ya callate tiene horas que murio el pleito...         0       1
10774                                      ay tu callate         0       0
10775  ayyyyyy por favor ya dejen lo de frida sofia l...         0       1

[10776 rows x 3 columns]


In [37]:
df_conteo = df_concatenado.groupby('Vulgar')['clean_tweet'].count().reset_index()
df_conteo

Unnamed: 0,Vulgar,clean_tweet
0,0,8204
1,1,2571


In [38]:
df_conteo2 = df_concatenado.groupby('Ofensivo')['clean_tweet'].count().reset_index()
df_conteo2

Unnamed: 0,Ofensivo,clean_tweet
0,0,7328
1,1,3447


In [39]:
filtered_df = df_concatenado[(df_concatenado['Vulgar'] != 0) & (df_concatenado['Ofensivo'] != 0)].count().reset_index()
filtered_df

Unnamed: 0,index,0
0,clean_tweet,1432
1,Ofensivo,1432
2,Vulgar,1432


In [40]:
df_concatenado['Vulgar'] = df_concatenado['Vulgar'].astype(bool)
df_concatenado['Ofensivo'] = df_concatenado['Ofensivo'].astype(bool)


In [41]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


tweets = df_concatenado['clean_tweet'].tolist()


class TweetDataset(Dataset):
    def init(self, tweets):
        self.tweets = tweets

    def len(self):
        return len(self.tweets)

    def getitem(self, idx):
        return self.tweets[idx]

df_concatenado['Vulgar'] = df_concatenado['Vulgar'].astype('bool')
df_concatenado['Ofensivo'] = df_concatenado['Ofensivo'].astype('bool')


## Carga del Modelo

In [50]:
df_concatenado['clean_tweet'] = df_concatenado['clean_tweet'].fillna("").astype(str)


In [51]:
df_concatenado

Unnamed: 0,clean_tweet,Ofensivo,Vulgar
0,pues yo jalo trabaje en una ong y ahora en la ...,False,False
1,senores medios de prensa ya dejen de entrevis...,True,False
2,jajajaja bueno a enrique le gusta provocar a l...,False,False
3,lo que mas me da gusto del tuit es la cantidad...,False,False
4,hace 111 anos nacio en el df uno de los mayore...,False,False
...,...,...,...
10771,callate el hocicoooooooooo,False,False
10772,callate ya si fue una calle,False,False
10773,wey ya callate tiene horas que murio el pleito...,False,True
10774,ay tu callate,False,False


In [43]:
pip install datasets



In [52]:
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
from tqdm import tqdm

# ======== 1. Preparación del Dataset ========
# Dividir los datos en conjunto de entrenamiento y validación
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_concatenado['clean_tweet'],  # Textos
    df_concatenado[['Vulgar', 'Ofensivo']].values,  # Etiquetas multietiqueta como matriz numpy
    test_size=0.2,
    random_state=42
)

train_data = Dataset.from_dict({"clean_tweet": train_texts, "labels": train_labels.tolist()})
val_data = Dataset.from_dict({"clean_tweet": val_texts, "labels": val_labels.tolist()})

# ======== 2. Tokenización ========
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["clean_tweet"], padding="max_length", truncation=True, max_length=128)

# Tokenizar los datasets
train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)

# Formato PyTorch
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# ======== 3. Crear DataLoaders ========
train_loader = DataLoader(train_data, batch_size=8, shuffle=True)
val_loader = DataLoader(val_data, batch_size=8)

# ======== 4. Modelo ========
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2  # Dos etiquetas: Vulgar y Ofensivo
)

# ======== 5. Configuración del Optimizer y Scheduler ========
# Primera fase: Congelar las primeras 4 capas
optimizer = AdamW([
    {"params": model.distilbert.transformer.layer[:4].parameters(), "lr": 1e-5},
    {"params": model.distilbert.transformer.layer[4:].parameters(), "lr": 5e-5},
    {"params": model.classifier.parameters(), "lr": 1e-4}
])

#optimizer = AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ======== 6. Función de Entrenamiento ========
def train(model, train_loader, optimizer):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc="Entrenando"):
        # Mover datos al dispositivo
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].float().to(device)  # Convertir etiquetas a float para BCEWithLogitsLoss

        # Reiniciar gradientes
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Loss promedio: {avg_loss}")

for param in model.distilbert.transformer.layer[:4].parameters():
    param.requires_grad = False

# Entrenar por algunas épocas
train(model, train_loader, optimizer)

# Segunda fase: Descongelar 2 capas adicionales
for param in model.distilbert.transformer.layer[2:4].parameters():
    param.requires_grad = True

# Reducir la tasa de aprendizaje y continuar entrenando
for g in optimizer.param_groups:
    g['lr'] = g['lr'] / 2
train(model, train_loader, optimizer)

# ======== 7. Función de Evaluación ========
def evaluate(model, val_loader):
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluando"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].float().to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.sigmoid(logits) > 0.5  # Umbral para clasificar como 1 o 0

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # Métricas
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average="samples"  # Métricas multietiqueta
    )
    acc = accuracy_score(true_labels, predictions)
    print(f"Accuracy: {acc}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1: {f1}")

# ======== 8. Entrenamiento ========
epochs = 3
for epoch in range(epochs):
    print(f"\n===== Época {epoch + 1} =====")
    train(model, train_loader, optimizer)
    evaluate(model, val_loader)

# ======== 9. Guardar el modelo ========
model.save_pretrained("./fine_tuned_distilbert")
tokenizer.save_pretrained("./fine_tuned_distilbert")



Map:   0%|          | 0/8620 [00:00<?, ? examples/s]

Map:   0%|          | 0/2156 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Entrenando: 100%|██████████| 1078/1078 [01:11<00:00, 15.03it/s]


Loss promedio: 0.48917308053025954


Entrenando: 100%|██████████| 1078/1078 [01:25<00:00, 12.58it/s]


Loss promedio: 0.3379396439801231

===== Época 1 =====


Entrenando: 100%|██████████| 1078/1078 [01:25<00:00, 12.61it/s]


Loss promedio: 0.26868573041263005


Evaluando: 100%|██████████| 270/270 [00:07<00:00, 34.92it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.8000927643784786
Precision: 0.3186456400742115
Recall: 0.32258812615955473
F1: 0.316017316017316

===== Época 2 =====


Entrenando: 100%|██████████| 1078/1078 [01:25<00:00, 12.59it/s]


Loss promedio: 0.22090172795976248


Evaluando: 100%|██████████| 270/270 [00:07<00:00, 34.50it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.8010204081632653
Precision: 0.3420686456400742
Recall: 0.34763450834879406
F1: 0.33967223252937534

===== Época 3 =====


Entrenando: 100%|██████████| 1078/1078 [01:25<00:00, 12.61it/s]


Loss promedio: 0.1796563175854192


Evaluando: 100%|██████████| 270/270 [00:07<00:00, 34.83it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.812152133580705
Precision: 0.35111317254174396
Recall: 0.35366419294990725
F1: 0.34786641929499074


('./fine_tuned_distilbert/tokenizer_config.json',
 './fine_tuned_distilbert/special_tokens_map.json',
 './fine_tuned_distilbert/vocab.txt',
 './fine_tuned_distilbert/added_tokens.json')

In [56]:
csv_file_path = '/content/datos_de_prueba.csv'

with open(csv_file_path, 'r') as file:
    for i in range(5):  # Mostrar las primeras 5 líneas
        print(file.readline())

tweet,clean_tweet

@vesteve3 @manubenas @ccoo_rm @desobediencia_ @ccoo @emparempar (Buen ánimo para esta primavera que iniciamos).,buen animo para esta primavera que iniciamos

"“@kirovast: @Hugo_Moran muy fan de la ""radicalidad social""” (Frente a la devaluación democrática).",muy fan de la radicalidad social frente a la devaluacion democratica

@ALTAS_PRESIONES Nuevos puta dueños para las jodido renovables. En ese momento ya no serán un problema sino una apuesta magnífica.,nuevos puta duenos para las jodido renovables en ese momento ya no seran un problema sino una apuesta magnifica

"@jumanjisolar @solartradex @josea_dolera El diferencial de precios energéticos con Alemania o Francia sigue siendo el mismo, 8-11 €/MWh.",el diferencial de precios energeticos con alemania o francia sigue siendo el mismo 811 EURmwh



In [60]:
import pandas as pd

df = pd.read_csv(csv_file_path)
df = df.drop(columns=['tweet'])

In [62]:
df= df.fillna("").astype(str)

In [64]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader
import torch
import numpy as np


tweets = df['clean_tweet'].tolist()


class TweetDataset(Dataset):
    def init(self, tweets):
        self.tweets = tweets

    def len(self):
        return len(self.tweets)

    def getitem(self, idx):
        return self.tweets[idx]

# 1. Cargar el dataset en formato Hugging Face
new_data = Dataset.from_pandas(df)

# 2. Tokenización
tokenizer = DistilBertTokenizer.from_pretrained("./fine_tuned_distilbert")

def tokenize_function(examples):
    # Asegúrate de que "clean_tweet" sea una lista de cadenas
    return tokenizer(examples["clean_tweet"], padding="max_length", truncation=True, max_length=128)

# Usar map para aplicar la función de tokenización
new_data = new_data.map(tokenize_function, batched=True)

# Establecer el formato para PyTorch
new_data.set_format(type="torch", columns=["input_ids", "attention_mask"])

# 3. Cargar el modelo
model = DistilBertForSequenceClassification.from_pretrained("./fine_tuned_distilbert")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 4. DataLoader
new_loader = DataLoader(new_data, batch_size=8, shuffle=False)

# 5. Hacer predicciones
model.eval()
all_predictions = []
with torch.no_grad():
    for batch in new_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        probs = torch.sigmoid(outputs.logits).cpu().numpy()
        all_predictions.extend(probs)

# 6. Interpretar
columns = ["Vulgar", "Ofensivo"]
predictions_df = pd.DataFrame(all_predictions, columns=columns)
print(predictions_df)


Map:   0%|          | 0/17519 [00:00<?, ? examples/s]

         Vulgar  Ofensivo
0      0.005561  0.005645
1      0.002767  0.016918
2      0.984155  0.207152
3      0.010588  0.006382
4      0.042360  0.005977
...         ...       ...
17514  0.018871  0.003880
17515  0.006264  0.166766
17516  0.016646  0.004261
17517  0.011253  0.011615
17518  0.004412  0.007102

[17519 rows x 2 columns]


In [68]:
predictions_df.to_csv('result.csv', index=False)


  predictions_df.to_csv(new_data, 'result.csv', index=False)


ValueError: Invalid file path or buffer object type: <class 'datasets.arrow_dataset.Dataset'>

In [72]:
# Combinar las entradas y las predicciones
data = {
    "Input": df['clean_tweet'],                # Textos de entrada
    "Prediction": all_predictions     # Predicciones del modelo
}

# Crear el DataFrame
predictions_df = pd.DataFrame(data)

# Mostrar el DataFrame
print(predictions_df)

# Exportar a un archivo CSV si es necesario
predictions_df.to_csv("predictions.csv", index=False)

                                                   Input  \
0           buen animo para esta primavera que iniciamos   
1      muy fan de la radicalidad social frente a la d...   
2      nuevos puta duenos para las jodido renovables ...   
3      el diferencial de precios energeticos con alem...   
4      por favor es importante difundir este mensaje ...   
...                                                  ...   
17514  que noticia tan triste el fallecimiento de cam...   
17515  bien dichopique soy espanol y deseo que gane n...   
17516  triste noticia el fallecimiento de camilo sest...   
17517  dedican su vida a ayudar a quienes lo necesita...   
17518  en el  reivindicamos la llegada del ave y una ...   

                        Prediction  
0       [0.00556115, 0.0056453394]  
1      [0.0027674614, 0.016918037]  
2          [0.9841553, 0.20715192]  
3      [0.010587908, 0.0063819466]  
4       [0.04235956, 0.0059767454]  
...                            ...  
17514    [0.01887124