In [None]:
!pip install torch datasets transformers

In [2]:
import pandas as pd
import numpy as np
from transformers import BertTokenizerFast
from transformers import BertForTokenClassification
import csv
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import random
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Rebuild sentences from train.csv, which is read as one token per row:
# column 1 is taken as the word and column 2 as its label, and an empty row
# (or an empty word) ends the current sentence.
preprocessed = []  # one [sentence, labels] pair per sentence, both space-joined

# Raw string so the backslash before ';' is a literal character instead of the
# invalid escape sequence "\;" (which newer Python versions warn about).
# The resulting set of characters is unchanged.
punc = r'''~!@#$%^&*()_+{}|:"<>?`-[]\;',./='''

# newline='' is what the csv module expects for files handed to csv.reader.
# NOTE(review): assumes the CSV is UTF-8 encoded -- confirm against the data source.
with open('train.csv', 'r', newline='', encoding='utf-8') as csvfile:
    csvreader = csv.reader(csvfile)
    next(csvreader)  # Skip the header row
    data = list(csvreader)

sentence_words = []
sentence_labels = []
for row in data:
    word = row[1] if row else ''
    label = row[2] if row else ''
    if word == '':
        # Sentence boundary. Consecutive boundaries would otherwise emit an
        # empty sentence, which becomes a NaN row once the CSV is re-read
        # with pandas and breaks the later `.split()` on the labels column.
        if sentence_words:
            preprocessed.append([" ".join(sentence_words), " ".join(sentence_labels)])
        sentence_words = []
        sentence_labels = []
    else:
        if len(word) > 1:
            # Iterates over the characters of the word as it was before any
            # removal; each punctuation character removes its first remaining
            # occurrence, but the word is never reduced below one character.
            for ele in word:
                if ele in punc and len(word) > 1:
                    word = word.replace(ele, "", 1)
        sentence_words.append(word)
        sentence_labels.append(label)

# Flush the last sentence when the file does not end with a boundary row.
if sentence_words:
    preprocessed.append([" ".join(sentence_words), " ".join(sentence_labels)])

In [4]:
filename = "train1.csv"

# Write the sentence-level rows collected in `preprocessed`.
# The encoding is pinned to UTF-8 because this file is later loaded with
# pd.read_csv('train1.csv') without an encoding argument (pandas decodes as
# UTF-8 by default), whereas open() would otherwise fall back to the
# platform's locale encoding.
with open(filename, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)

    # Header row, then one [text, labels] row per sentence.
    writer.writerow(["text", "labels"])
    writer.writerows(preprocessed)


In [5]:
# Rebuild sentences from validation.csv, which is read as one token per row:
# column 1 is taken as the word and column 2 as its label, and an empty row
# (or an empty word) ends the current sentence.
preprocessed = []  # one [sentence, labels] pair per sentence, both space-joined

# Raw string so the backslash before ';' is a literal character instead of the
# invalid escape sequence "\;" (which newer Python versions warn about).
# The resulting set of characters is unchanged.
punc = r'''~!@#$%^&*()_+{}|:"<>?`-[]\;',./='''

# newline='' is what the csv module expects for files handed to csv.reader.
# NOTE(review): assumes the CSV is UTF-8 encoded -- confirm against the data source.
with open('validation.csv', 'r', newline='', encoding='utf-8') as csvfile:
    csvreader = csv.reader(csvfile)
    next(csvreader)  # Skip the header row
    data = list(csvreader)

sentence_words = []
sentence_labels = []
for row in data:
    word = row[1] if row else ''
    label = row[2] if row else ''
    if word == '':
        # Sentence boundary. Consecutive boundaries would otherwise emit an
        # empty sentence, which becomes a NaN row once the CSV is re-read
        # with pandas and breaks the later `.split()` on the labels column.
        if sentence_words:
            preprocessed.append([" ".join(sentence_words), " ".join(sentence_labels)])
        sentence_words = []
        sentence_labels = []
    else:
        if len(word) > 1:
            # Iterates over the characters of the word as it was before any
            # removal; each punctuation character removes its first remaining
            # occurrence, but the word is never reduced below one character.
            for ele in word:
                if ele in punc and len(word) > 1:
                    word = word.replace(ele, "", 1)
        sentence_words.append(word)
        sentence_labels.append(label)

# Flush the last sentence when the file does not end with a boundary row.
if sentence_words:
    preprocessed.append([" ".join(sentence_words), " ".join(sentence_labels)])

In [6]:
filename = "validation1.csv"

# Write the sentence-level rows collected in `preprocessed`.
# The encoding is pinned to UTF-8 because this file is later loaded with
# pd.read_csv('validation1.csv') without an encoding argument (pandas decodes
# as UTF-8 by default), whereas open() would otherwise fall back to the
# platform's locale encoding.
with open(filename, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)

    # Header row, then one [text, labels] row per sentence.
    writer.writerow(["text", "labels"])
    writer.writerows(preprocessed)

In [7]:
# Load the sentence-level CSVs written by the preprocessing cells.
train = pd.read_csv('train1.csv')
# train = pd.read_csv('ner.csv')
validation = pd.read_csv('validation1.csv')

# Every entry of the 'labels' column is a whitespace-separated tag string;
# turn it into one list of tags per sentence.
labels = [tag_string.split() for tag_string in train['labels'].tolist()]

# Distinct tags that occur anywhere in the training labels.
unique_labels = {tag for sentence_tags in labels for tag in sentence_tags}

print(unique_labels)

{'B-ORG', 'I-LOC', 'I-MISC', 'B-LOC', 'I-PER', 'I-ORG', 'O', 'B-MISC', 'B-PER'}


In [8]:
# Ids are assigned by position in the alphabetically sorted tag list, so the
# two dictionaries are exact inverses of each other.
ids_to_labels = dict(enumerate(sorted(unique_labels)))
labels_to_ids = {tag: tag_id for tag_id, tag in ids_to_labels.items()}

In [None]:
# Checkpoint the fast tokenizer is loaded from (the same name BertModel uses
# for the model weights).
TOKENIZER_CHECKPOINT = 'dccuchile/bert-base-spanish-wwm-cased'
tokenizer = BertTokenizerFast.from_pretrained(TOKENIZER_CHECKPOINT)
# Other checkpoints that were tried:
# tokenizer = BertTokenizerFast.from_pretrained('Twitter/TwHIN-BERT-large')
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [10]:
def align_label(texts, labels, label_all_tokens=None):
    """Map word-level tags onto the tokens produced for ``texts``.

    ``texts`` is tokenized with the module-level ``tokenizer`` (padded and
    truncated to 512 tokens) and one label id is produced per token:

    * tokens without a word index (``word_ids()`` yields ``None``) get -100;
    * the first token of each word gets ``labels_to_ids[labels[word_idx]]``;
    * further tokens of the same word get that same id when
      ``label_all_tokens`` is true, otherwise -100.

    -100 is the value the training loop filters out with ``!= -100``.

    Args:
        texts: the sentence passed to the tokenizer.
        labels: sequence of tag strings, indexed by word index.
        label_all_tokens: whether continuation tokens inherit their word's
            label. When left as ``None`` the module-level ``label_all_tokens``
            flag is used if it exists, else ``False``. Previously an undefined
            flag raised a NameError that a bare ``except`` silently turned
            into -100; the outcome is the same but is now explicit.

    Returns:
        list[int]: one label id per token.
    """
    if label_all_tokens is None:
        label_all_tokens = globals().get('label_all_tokens', False)

    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)
    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx or label_all_tokens:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except (IndexError, KeyError):
                # More words than labels, or a tag missing from labels_to_ids:
                # leave the token unlabelled instead of failing.
                label_ids.append(-100)
        else:
            label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

class DataSequence(torch.utils.data.Dataset):
    """Dataset that pairs each tokenized sentence with its aligned label ids."""

    def __init__(self, df):
        """Tokenize and label every row of ``df`` eagerly.

        Reads the 'labels' column (whitespace-separated tags) and the 'text'
        column of ``df``. Each text is tokenized with the module-level
        ``tokenizer`` (padded/truncated to 512, PyTorch tensors) and its
        label ids are computed with ``align_label``.
        """
        tag_lists = [tags.split() for tags in df['labels'].values.tolist()]
        sentences = df['text'].values.tolist()

        encodings = []
        for sentence in sentences:
            encodings.append(
                tokenizer(str(sentence), padding='max_length', max_length=512,
                          truncation=True, return_tensors="pt")
            )
        self.texts = encodings

        aligned = []
        for sentence, tags in zip(sentences, tag_lists):
            aligned.append(align_label(sentence, tags))
        self.labels = aligned

    def __len__(self):
        """Number of sentences held by the dataset."""
        return len(self.labels)

    def get_batch_data(self, idx):
        """Return the stored tokenizer output for item ``idx``."""
        return self.texts[idx]

    def get_batch_labels(self, idx):
        """Return the label ids of item ``idx`` as an int64 tensor."""
        return torch.tensor(self.labels[idx], dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(tokenizer output, label tensor)`` for item ``idx``."""
        return self.get_batch_data(idx), self.get_batch_labels(idx)

In [11]:
# Aliases under which the training cell passes the two frames to train_loop.
df_train, df_val = train, validation

In [12]:
class BertModel(torch.nn.Module):
    """Thin ``nn.Module`` wrapper around ``BertForTokenClassification``.

    Args:
        pretrained_name: checkpoint passed to ``from_pretrained``. Defaults to
            the Spanish checkpoint this notebook was written for; another
            name such as 'bert-base-cased' can be supplied instead.
        num_labels: size of the classification head. When ``None`` it is
            ``len(unique_labels)``, read from the module-level label set.
    """

    def __init__(self, pretrained_name='dccuchile/bert-base-spanish-wwm-cased', num_labels=None):
        super().__init__()

        if num_labels is None:
            num_labels = len(unique_labels)

        self.bert = BertForTokenClassification.from_pretrained(pretrained_name, num_labels=num_labels)

    def forward(self, input_id, mask, label=None):
        """Run the wrapped model and return its tuple output.

        ``return_dict=False`` makes the wrapped model return a plain tuple.
        The training loop unpacks it as ``(loss, logits)`` when ``label`` is
        given; inference code passes ``label=None`` and reads the logits from
        element 0.
        """
        return self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

In [None]:
def seed_worker(worker_id):
    """Seed ``random`` and NumPy from torch's current initial seed.

    Used as the ``worker_init_fn`` of the DataLoaders in this notebook.
    ``worker_id`` is part of that callback's signature and is not used.
    """
    seed32 = torch.initial_seed() & 0xFFFFFFFF  # same value as `% 2**32`
    random.seed(seed32)
    np.random.seed(seed32)


# Generator handed to the DataLoaders; seeded with a fixed value.
g = torch.Generator()
g.manual_seed(0)

In [14]:
def _sentence_accuracy_sum(logits, labels):
    """Sum per-sentence token accuracy over a batch.

    Positions whose label is -100 are excluded. A sentence with no remaining
    position is skipped entirely (its accuracy would be the mean of an empty
    tensor, i.e. NaN, and would poison the running total).

    Returns:
        (float, int): the summed accuracy and the number of sentences scored.
    """
    keep = labels != -100
    hits = ((logits.argmax(dim=-1) == labels) & keep).sum(dim=1)
    counts = keep.sum(dim=1)
    scored = counts > 0
    per_sentence = hits[scored].float() / counts[scored].float()
    return per_sentence.sum().item(), int(scored.sum().item())


def train_loop(model, df_train, df_val):
    """Fine-tune ``model`` on ``df_train`` and report metrics on ``df_val``.

    Both frames are wrapped in ``DataSequence``. The module-level
    ``BATCH_SIZE``, ``EPOCHS``, ``LEARNING_RATE``, ``seed_worker`` and ``g``
    configure the loaders and the SGD optimizer. After every epoch one line
    is printed with the training/validation loss (batch losses weighted by
    batch size, divided by the frame length) and the mean per-sentence token
    accuracy. Nothing is returned; ``model`` is updated in place.
    """
    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = DataLoader(train_dataset, num_workers=10, worker_init_fn=seed_worker, generator=g, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=10, worker_init_fn=seed_worker, generator=g, batch_size=BATCH_SIZE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
    # optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, foreach = True, amsgrad = True)

    for epoch_num in range(EPOCHS):

        model.train()
        total_loss_train, total_acc_train, scored_train = 0.0, 0.0, 0

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            # The dataset stores each encoding with a leading dimension of 1;
            # drop it after batching.
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)
            loss.backward()
            optimizer.step()

            # The batch loss counts once per sentence so that dividing by the
            # number of sentences gives a size-weighted average.
            total_loss_train += loss.item() * logits.shape[0]
            acc_sum, n_scored = _sentence_accuracy_sum(logits.detach(), train_label)
            total_acc_train += acc_sum
            scored_train += n_scored

        model.eval()
        total_loss_val, total_acc_val, scored_val = 0.0, 0.0, 0

        # No gradients are needed for validation; skipping them avoids
        # building autograd graphs for every validation batch.
        with torch.no_grad():
            for val_data, val_label in val_dataloader:

                val_label = val_label.to(device)
                mask = val_data['attention_mask'].squeeze(1).to(device)
                input_id = val_data['input_ids'].squeeze(1).to(device)

                loss, logits = model(input_id, mask, val_label)

                total_loss_val += loss.item() * logits.shape[0]
                acc_sum, n_scored = _sentence_accuracy_sum(logits, val_label)
                total_acc_val += acc_sum
                scored_val += n_scored

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / max(scored_train, 1): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / max(scored_val, 1): .3f}')


In [None]:
# Build the token-classification wrapper; BertModel.__init__ loads the
# pretrained checkpoint and sizes the head from `unique_labels`.
model = BertModel()

In [None]:
# Earlier configuration, kept for reference (same values except EPOCHS = 5):
# LEARNING_RATE = 0.005
# EPOCHS = 5
# BATCH_SIZE = 12
# model.train()
# train_loop(model, df_train, df_val)

# Hyperparameters; train_loop reads these three names as module-level globals
# (optimizer learning rate, number of epochs, DataLoader batch size).
LEARNING_RATE = 0.005
EPOCHS = 10
BATCH_SIZE = 12
# train_loop also calls model.train() at the start of every epoch.
model.train()
train_loop(model, df_train, df_val)

In [24]:
# torch.save(model.state_dict(), "/content/drive/My Drive/spanish_model_0.89322")

In [None]:
# model = BertModel()
# model.load_state_dict(torch.load("/content/drive/My Drive/spanish_model"))
# model.to(device)
# tokenizer = BertTokenizerFast.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

In [19]:
# Rebuild sentences from test_noans.csv, which is read as one token per row:
# column 1 is taken as the word, and an empty row (or an empty word) ends the
# current sentence. No labels are read from this file.
sentence_length = []  # number of words in each sentence, parallel to `preprocessed`
preprocessed = []     # one single-element list [sentence] per sentence

# Flag read by align_word_ids: when False, only the first token of each word
# is kept for prediction.
label_all_tokens = False

# Raw string so the backslash before ';' is a literal character instead of the
# invalid escape sequence "\;" (which newer Python versions warn about).
# The resulting set of characters is unchanged.
punc = r'''~!@#$%^&*()_+{}|:"<>?`-[]\;',./='''

# newline='' is what the csv module expects for files handed to csv.reader.
# NOTE(review): assumes the CSV is UTF-8 encoded -- confirm against the data source.
with open('test_noans.csv', 'r', newline='', encoding='utf-8') as csvfile:
    csvreader = csv.reader(csvfile)
    next(csvreader)  # Skip the header row
    data = list(csvreader)

sentence_words = []
for row in data:
    word = row[1] if row else ''
    if word == '':
        # Sentence boundary: every boundary row emits a sentence, even an
        # empty one.
        preprocessed.append([" ".join(sentence_words)])
        sentence_length.append(len(sentence_words))
        sentence_words = []
    else:
        if len(word) > 1:
            # Iterates over the characters of the word as it was before any
            # removal; each punctuation character removes its first remaining
            # occurrence, but the word is never reduced below one character.
            for ele in word:
                if ele in punc and len(word) > 1:
                    word = word.replace(ele, "", 1)
        sentence_words.append(word)

# The words collected after the last boundary row form the final sentence.
preprocessed.append([" ".join(sentence_words)])
sentence_length.append(len(sentence_words))

In [20]:
def align_word_ids(texts, label_all_tokens=None):
    """Build a per-token keep/ignore mask for ``texts``.

    ``texts`` is tokenized with the module-level ``tokenizer`` (padded and
    truncated to 512 tokens). The result has one entry per token:

    * -100 for tokens without a word index (``word_ids()`` yields ``None``);
    * 1 for the first token of each word;
    * 1 for further tokens of the same word when ``label_all_tokens`` is
      true, otherwise -100.

    Args:
        texts: the sentence passed to the tokenizer.
        label_all_tokens: whether continuation tokens are kept. When left as
            ``None`` the module-level ``label_all_tokens`` flag is used if it
            exists, else ``False`` (the value the old bare ``except`` produced
            implicitly when the flag was undefined).

    Returns:
        list[int]: entries are 1 (keep) or -100 (ignore).
    """
    if label_all_tokens is None:
        label_all_tokens = globals().get('label_all_tokens', False)

    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)
    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx or label_all_tokens:
            label_ids.append(1)
        else:
            label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


def evaluate_one_text(model, sentence):
    """Predict one label id per kept token of ``sentence``.

    The sentence is tokenized with the module-level ``tokenizer`` (padded and
    truncated to 512 tokens). Predictions are returned only for the positions
    that ``align_word_ids`` does not mark with -100.

    The model is moved to the GPU when one is available and switched to
    evaluation mode, and the forward pass runs without gradient tracking.
    Previously neither was done, so inference depended on whatever mode the
    model had been left in and built autograd graphs for every sentence.

    Args:
        model: called as ``model(input_ids, attention_mask, None)``; element 0
            of its output is used as the logits.
        sentence: the text passed to the tokenizer and to ``align_word_ids``.

    Returns:
        list[int]: the argmax class index for each kept token.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    text = tokenizer(sentence, padding='max_length', max_length=512, truncation=True, return_tensors="pt")

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    # Leading dimension of 1 so the mask lines up with the batch of one.
    label_ids = torch.tensor(align_word_ids(sentence), device=device).unsqueeze(0)

    with torch.no_grad():
        logits = model(input_id, mask, None)

    logits_clean = logits[0][label_ids != -100]
    return logits_clean.argmax(dim=1).tolist()

In [None]:
# Flat list of predicted label ids for the whole test set, in sentence order.
test_ans = []

for sentence in preprocessed:
    # extend() appends in place; the previous `test_ans = test_ans + ...`
    # copied the whole list on every sentence.
    test_ans.extend(evaluate_one_text(model, sentence))

len(test_ans)

In [22]:
# Pair every prediction with its running index: one [id, label] row each.
data = [[row_id, label_id] for row_id, label_id in enumerate(test_ans)]

In [23]:
filename = "test_ans.csv"

# Write the header followed by the [id, label] rows held in `data`.
with open(filename, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows([["id", "label"], *data])