In [None]:
import pandas as pd
import nltk
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\olaya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
train_data = pd.read_json('datosParsed3.json')
#display(train_data)
# Selecciona una fila, omite la siguiente, y así sucesivamente
train_data = train_data.iloc[::2]
print("\nDataFrame con la mitad de las filas seleccionadas:")
display(train_data)


DataFrame con la mitad de las filas seleccionadas:


Unnamed: 0,id,text,label
0,aaexyuw,\n571 Main Page\n\n\nComputer Science 571\nCON...,course
2,achmly,\n\n\nECE/CS 752 Spring 1996\n\n\n\n\nECE/CS 7...,course
4,ackfxrep,\n\nEECS401 Web Page for Fall '96\n\n\nWelcome...,course
6,advmiv,\n\nCS 545 - Introduction to Robotics\n\n\n\n ...,course
8,agdvsjkw,\n\n\nCS325 Page\n\n\n\n\n\n\n\nCS325 Artifici...,course
...,...,...,...
6614,zwmfqj,\n\n\n\n\n\n Home page of Ka Yee Yeung\n\n\n\n...,student
6616,zwpln,\nKris Kocan's Home Page\n\nMy Home Page\n\nPr...,student
6618,zxgxje,\n\nHomePage of Daqing Li\n\n\n\n\nWelcome to ...,student
6620,zyrphu,\n\n\n\nHOME \n\n\n\n\n \n \n \n\n\nMarla Bak...,student


In [None]:
# Inicializar el tokenizador BERT
tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

stop_words = set(nltk.corpus.stopwords.words('english'))
train_data['text'] = train_data['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

# Extraer textos y etiquetas
train_texts = train_data['text'].tolist()
train_labels = train_data['label'].tolist()

X_train, X_test, y_train, y_test = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)

# Tokenizar y convertir a tensores
train_encodings = tokenizer(X_train, truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(X_test, truncation=True, padding=True, return_tensors='pt')

# Mostrar información sobre las codificaciones
print(train_encodings.keys())
print(train_encodings['input_ids'].shape)
print(train_encodings['attention_mask'].shape)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
torch.Size([2649, 512])
torch.Size([2649, 512])


In [None]:
# Crear un conjunto de datos personalizado
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        self.label_mapping = {'course': 0, 'department': 1, 'faculty': 2, 'other': 3, 'project': 4, 'staff': 5, 'student': 6}

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

        # Convierte la etiqueta a tipo numérico usando el mapeo
        item['labels'] = torch.tensor(self.label_mapping[self.labels[idx]])

        return item

# Crear conjuntos de datos y DataLoader
train_dataset = CustomDataset(train_encodings, y_train)
test_dataset = CustomDataset(test_encodings, y_test)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

In [None]:
# Inicializar el modelo BERT para clasificación de sentimientos
model = BertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment',
                                                      num_labels=len(train_data['label'].unique()),
                                                      ignore_mismatched_sizes=True)

# Configurar el dispositivo a GPU si está disponible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Inicializar el optimizador
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Establecer el modelo en modo de entrenamiento
model.train()

# Definir la función de pérdida (criterio)
criterion = torch.nn.CrossEntropyLoss()

# Número de épocas (ajusta según sea necesario)
num_epochs = 3

# Bucle de entrenamiento
for epoch in range(num_epochs):
    for batch in train_loader:
        # Transferir datos al dispositivo (GPU si está disponible)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Realizar la propagación hacia adelante
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Realizar la retropropagación y la actualización de parámetros
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [None]:
import pickle

# Guardar train_dataset
with open('bert3_train_dataset.pkl', 'wb') as f:
    pickle.dump(train_dataset, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
torch.save(model.state_dict(), 'bert3_model.pth')

In [None]:
# IGNORAR CELDA
# Load the model directly from the saved file
restored_model = BertForSequenceClassification()
restored_model.load_state_dict(torch.load('bert3_model.pth'))

# Move the model to the appropriate device
restored_model.to(device)

# Initialize the optimizer (make sure to use the same parameters as before)
optimizer = AdamW(restored_model.parameters(), lr=2e-5, eps=1e-8)

# Load the optimizer state dictionary
optimizer.load_state_dict(torch.load('bert3_optimizer.pth'))

TypeError: BertForSequenceClassification.__init__() missing 1 required positional argument: 'config'

In [None]:
# Cambiar el modelo a modo de evaluación
model.eval()

# Listas para almacenar las predicciones y etiquetas reales
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in train_loader:
        # Transferir datos al dispositivo (GPU si está disponible)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Realizar la propagación hacia adelante sin realizar la retropropagación
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Obtener las predicciones y las etiquetas reales
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        labels = labels.cpu().numpy()

        # Almacenar las predicciones y las etiquetas reales
        all_preds.extend(preds)
        all_labels.extend(labels)

# Calcular y mostrar el informe de clasificación
print(classification_report(all_labels, all_preds))

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


              precision    recall  f1-score   support

           0       0.96      0.93      0.94       278
           1       0.84      1.00      0.91        57
           2       0.90      0.97      0.93       354
           3       0.97      0.88      0.92      1225
           4       0.58      0.96      0.72       161
           5       0.71      0.79      0.75        43
           6       0.96      0.92      0.94       531

    accuracy                           0.91      2649
   macro avg       0.84      0.92      0.87      2649
weighted avg       0.93      0.91      0.91      2649



In [None]:
test_data = pd.read_json('dataTestParsed3.json')

# Preprocesar el conjunto de datos de prueba de la misma manera que el conjunto de entrenamiento
test_data['text'] = test_data['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

# Extraer textos y etiquetas (si están disponibles en el conjunto de prueba)
test_texts = test_data['text']

# Tokenizar y convertir a tensores
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, return_tensors='pt').to(device)

In [None]:
# Cambiar el modelo a modo de evaluación
model.eval()

# Listas para almacenar las predicciones en el conjunto de prueba sin etiquetas
test_preds_without_labels = []

with torch.no_grad():
    for i in range(len(test_encodings['input_ids'])):
        # Transferir datos al dispositivo (GPU si está disponible)
        input_ids = test_encodings['input_ids'][i].unsqueeze(0).to(device)
        attention_mask = test_encodings['attention_mask'][i].unsqueeze(0).to(device)

        # Realizar la propagación hacia adelante sin realizar la retropropagación
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Obtener las predicciones
        preds = torch.argmax(logits, dim=1).cpu().numpy()

        # Almacenar las predicciones
        test_preds_without_labels.extend(preds)

# Obtener los nombres de los archivos (id) del conjunto de prueba
test_ids = test_data['id']

In [None]:
# Mapear las predicciones a las etiquetas reales usando el diccionario inverso de la asignación de etiquetas
label_mapping = {train_dataset.label_mapping[label]: label for label in train_dataset.label_mapping}

# Crear un DataFrame con las predicciones
predictions_df = pd.DataFrame({
    'id': test_ids,
    'label': [label_mapping[pred] for pred in test_preds_without_labels]
})

# Mostrar el DataFrame con las predicciones
print(predictions_df)

nombre_archivo = 'ENXEBRE-Bert-Sentimental-mitad-datatrain-bs-2.csv'
predictions_df.to_csv(nombre_archivo, index=False)

           id    label
0     aaclkul  student
1     aagelci  project
2     aangjmn    other
3      aawnpc    other
4     abdjgiz  student
...       ...      ...
1654    zxmmn    other
1655   zxwkru    other
1656  zybimtt    other
1657  zypnixf  project
1658   zzszho  student

[1659 rows x 2 columns]
