# Training

from transformers import pipeline  
sentiment_task = pipeline("sentiment-analysis", model = "cardiffnlp/twitter-roberta-base-sentiment-latest")

In [1]:
import pandas as pd
import numpy as np
from datasets import (load_metric, Dataset, DatasetDict)
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer)
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tone label -> integer class id used by the classifier.
dict_names = {'negative': 0, 'positive': 1, 'neutral': 2}


def _load_tone_split(path, label_map=dict_names):
    """Read one split from an Excel file and prepare it for training.

    Renames ``text_en_clean`` -> ``text`` and ``text_tone`` -> ``label``,
    resets the index, and replaces the string labels with the integer ids
    in ``label_map``.

    Args:
        path: location of the ``.xlsx`` file to read.
        label_map: mapping from tone string to integer class id.

    Returns:
        The prepared DataFrame with an integer ``label`` column.

    Raises:
        ValueError: if any label is missing or is not a key of ``label_map``.
    """
    split = (
        pd.read_excel(path)
        .rename(columns={'text_en_clean': 'text', 'text_tone': 'label'})
        .reset_index(drop=True)
    )
    encoded = split['label'].map(label_map)
    # Series.map yields NaN for anything outside the mapping; fail loudly here
    # instead of handing float/NaN labels to the trainer later on.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = split.loc[unmapped, 'label'].unique().tolist()
        raise ValueError(f"Unexpected tone labels in {path}: {unexpected}")
    split['label'] = encoded.astype('int64')
    return split


train = _load_tone_split('./model3_1_tone/train_en.xlsx')
test = _load_tone_split('./model3_1_tone/test_en.xlsx')

In [3]:
# from google_trans_new import google_translator  
# translator = google_translator  ()
# train['text_en_clean'] = train['text_es_clean'].apply(translator.translate)
# test['text_en_clean'] = test['text_es_clean'].apply(translator.translate)
# train.to_excel('./model3_1_tone/train_en.xlsx')
# test.to_excel('./model3_1_tone/test_en.xlsx')

In [4]:
# Keep only the model input and its target; every other column is dropped.
keep_columns = ['text', 'label']
train = train[keep_columns]
test = test[keep_columns]

In [5]:
# Convert the pandas splits into Hugging Face datasets.
# `from_pandas` is the constructor intended for DataFrames (`from_dict` expects a
# plain dict of columns); preserve_index=False keeps the pandas index from being
# stored as an extra column.
train_dataset = Dataset.from_pandas(train, preserve_index=False)
test_dataset = Dataset.from_pandas(test, preserve_index=False)
my_dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

In [6]:
# Hub id of the base checkpoint; the same name is reused below to load the model.
model_name = "cardiffnlp/twitter-roberta-base-2021-124m"
# Tokenizer matching the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with max-length padding and truncation."""
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Apply the tokenizer to every split, one batch of rows at a time.
tokenized_datasets = my_dataset_dict.map(tokenize_function, batched=True)

100%|██████████| 2/2 [00:00<00:00,  6.13ba/s]
100%|██████████| 1/1 [00:00<00:00, 10.08ba/s]


In [8]:
# Shuffle both tokenized splits with a fixed seed so the order is reproducible.
# NOTE(review): despite the `small_` prefix no subset is selected here — the
# full train and test splits are used.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42)
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42)

In [9]:
# Load the base checkpoint with a 3-class sequence-classification head
# (one class per entry of the tone label mapping).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
# NOTE(review): `training_args` is assigned again in a later cell with
# evaluation_strategy="epoch"; this first assignment is superseded by it.
training_args = TrainingArguments(output_dir="test_trainer")
# Accuracy metric object used by `compute_metrics`.
metric = load_metric("accuracy")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-2021-124m were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-2021-124m and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classi

In [10]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: a ``(logits, labels)`` pair (anything that unpacks into two
            items). ``logits`` has one score per class on its last axis; if it
            is a tuple, its first element is taken as the logits.

    Returns:
        ``{"accuracy": float}`` — the fraction of rows whose arg-max class
        equals the reference label.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    # Computed directly with numpy so this function does not depend on a
    # module-level metric object created by the deprecated `load_metric`.
    accuracy = np.mean(np.asarray(predictions) == np.asarray(labels))
    return {"accuracy": float(accuracy)}

In [11]:
# Final training configuration: checkpoints go to ./test_trainer and evaluation
# runs at the end of every epoch. Replaces the earlier `training_args`.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [12]:
# Wire the model, configuration, shuffled splits and accuracy callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [13]:
# Smoke test: grab only the first training batch (the loop exits immediately).
for batch in trainer.get_train_dataloader():
    break

# Move the model to CPU and run a single forward pass on that batch.
outputs = trainer.model.cpu()(**batch)

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.


In [14]:
# Use the GPU when one is available, otherwise stay on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move every tensor of the sample batch onto that device.
batch = {name: tensor.to(device) for name, tensor in batch.items()}

# Repeat the forward pass with model and batch on the same device.
outputs = trainer.model.to(device)(**batch)

In [15]:
# Smoke test: back-propagate the loss of the single sample batch.
loss = outputs.loss
loss.backward()

In [16]:
# Build the Trainer's optimizer and apply one step.
# NOTE(review): together with the preceding `loss.backward()` cell this updates
# the model's weights before `trainer.train()` is called — looks like a leftover
# smoke test; confirm it is intended.
trainer.create_optimizer()
trainer.optimizer.step()



In [17]:
# Run the fine-tuning loop (evaluation happens at the end of each epoch).
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1344
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 504
 33%|███▎      | 168/504 [1:02:56<2:05:41, 22.44s/it]The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 576
  Batch size = 8


RuntimeError: Numpy is not available

In [1]:
# Load a seq2seq checkpoint used below to translate the `text_es_clean` column
# (presumably Spanish -> English, going by the checkpoint name — confirm).
# NOTE(review): `tokenizer` and `model` reuse the names of the classifier objects
# from the training cells; the execution count suggests a separate kernel session.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-es-en")

model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-es-en")

  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|██████████| 807k/807k [00:00<00:00, 1.59MB/s]
Downloading: 100%|██████████| 783k/783k [00:00<00:00, 1.19MB/s] 
Downloading: 100%|██████████| 1.52M/1.52M [00:00<00:00, 2.27MB/s]
Downloading: 100%|██████████| 298M/298M [01:41<00:00, 3.06MB/s] 
  obj = cast(Storage, torch._UntypedStorage(nbytes))


In [20]:
import pandas as pd
# Untranslated training split; its `text_es_clean` column is translated below.
prueba = pd.read_excel('./model3_1_tone/train.xlsx')

In [22]:
# text_src = ["pecuecae_loco Hasta que Uribe deje de ser congresista y uds lo siguen eligiendo  Eso podría responder unos venecos y yo no respondería",
#  "switch_oficial _robertman veneca mamasita asi me gustan grandes para que me den duro"]
# Translate the column in small batches. Sending the whole column through a
# single tokenizer/generate call (the previous approach) builds one huge padded
# batch, and that run had to be interrupted.
TRANSLATION_BATCH_SIZE = 16  # rows per generate() call; tune to available memory

text_src = list(prueba['text_es_clean'])
tgt_text = []
for start in range(0, len(text_src), TRANSLATION_BATCH_SIZE):
    batch_texts = text_src[start:start + TRANSLATION_BATCH_SIZE]
    # truncation=True caps over-long inputs at the model's maximum length
    # instead of letting one long text fail the whole batch.
    encoded = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**encoded)
    tgt_text.extend(tokenizer.decode(t, skip_special_tokens=True) for t in translated)

KeyboardInterrupt: 

In [None]:
# Show how many source texts there are, followed by the translated texts.
print(len(text_src))
print(tgt_text)

["Until Uribe stops being a congressman and you keep choosing him that could answer some venecos and I wouldn't answer.",
 '_robertman veneca mamasita so I like big so I can get hard']

# Predictions

In [1]:
from transformers import pipeline
from transformers import (AutoModelForSequenceClassification, AutoTokenizer)
from alive_progress import alive_bar
import pandas as pd
from time import sleep

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Base checkpoint id; only the tokenizer is loaded from it here.
model_name = "cardiffnlp/twitter-roberta-base-2021-124m"
# Fine-tuned 3-class weights read from a local checkpoint directory.
# NOTE(review): the path is hard-coded; presumably written by the training run
# (output_dir="test_trainer") — confirm that checkpoint-500 exists.
model_tunned = AutoModelForSequenceClassification.from_pretrained("test_trainer/checkpoint-500/", num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
# Wrap the fine-tuned model and tokenizer in a text-classification pipeline.
pipe = pipeline("text-classification", model=model_tunned, tokenizer=tokenizer)

# Quick sanity check on a single word; the cell output shows the predicted label and score.
pipe("veneco")

[{'label': 'LABEL_2', 'score': 0.5761494636535645}]

In [4]:
# Tweets to classify; the prediction loop reads the Id, text, date and referred_to columns.
y_in = pd.read_csv('./../data/colombian_valid_tweets_predictions.csv')
print(len(y_in))

1485921


In [5]:
# Accumulator of prediction records (one dict per tweet).
predictions = []

### Checking previous predictions ⚠

In [7]:
# Predictions saved by an earlier run (the same file the prediction loop writes its checkpoints to).
y_prev_predicted = pd.read_csv('./../data/colombian_valid_tweets_tone_predictions.csv')
print(len(y_prev_predicted))

1485007


In [None]:
# Check that the saved predictions line up, row by row, with the input tweets.
#
# The saved file is expected to be a prefix of `y_in` (same Ids, same order).
# At the first disagreement the rows *before* it are still trustworthy and are
# kept; the mismatching row and everything after it are discarded so they get
# predicted again. (Dropping `range(index)` instead would throw away the verified
# rows and keep the bad ones.)
#
# The comparison is vectorized and positional, so no progress bar is needed even
# for millions of rows. It assumes `y_in` is still in its freshly loaded order.
print("Checking tweets already predicted")
y_prev_predicted = y_prev_predicted.reset_index(drop=True)

# Only the overlapping rows can be compared; surplus saved rows cannot be valid.
n_comparable = min(len(y_prev_predicted), len(y_in))
prev_ids = y_prev_predicted['Id'].iloc[:n_comparable].reset_index(drop=True)
input_ids = y_in['Id'].iloc[:n_comparable].reset_index(drop=True)
mismatch_positions = prev_ids.ne(input_ids).to_numpy().nonzero()[0]

# End of the verified prefix: the first mismatch, or the end of the overlap.
first_bad = int(mismatch_positions[0]) if len(mismatch_positions) else n_comparable
if first_bad < len(y_prev_predicted):
    y_prev_predicted = y_prev_predicted.iloc[:first_bad].reset_index(drop=True)
    print("Error found on:", first_bad, "- later rows discarded, they will be predicted again")


In [9]:
# Remove the rows that already have a saved prediction (index labels 0..n-1)
# and seed the accumulator with those saved records.
n_already_predicted = len(y_prev_predicted)
y_in.drop(index=range(n_already_predicted), inplace=True)
predictions = y_prev_predicted.to_dict(orient='records')

### Predict ✔

In [10]:
# Classify every remaining tweet, checkpointing the accumulated records to CSV.
with alive_bar(len(y_in), force_tty=True) as bar:
  print("Starting to predict")

  for _, tweet in y_in.iterrows():
    # A failing tweet is recorded with the sentinel label "ERROR" instead of
    # stopping the whole run.
    try:
      tone_scale = pipe(tweet['text'])[0]['label']
    except Exception:
      tone_scale = "ERROR"

    record = {field: tweet[field] for field in ('Id', 'text', 'date', 'referred_to')}
    record['tone_str'] = tone_scale
    predictions.append(record)

    # Checkpoint every 20000 processed tweets, then pause for a minute.
    processed = bar.current()
    if processed != 0 and processed % 20000 == 0:
      pd.DataFrame.from_dict(predictions).to_csv('./../data/colombian_valid_tweets_tone_predictions.csv', index=False)
      print("Checkpoint saved, sleeping for 1 minute")
      sleep(60)  # cool-down between checkpoints

    bar()

# Final export once every tweet has been processed.
print("Finished the predictions")
pd.DataFrame.from_dict(predictions).to_csv('./../data/colombian_valid_tweets_tone_predictions.csv', index=False)

on 0: Starting to predict                                                                                               
|████████████████████████████████████████| 914/914 [100%] in 53.4s (17.12/s)                                            
Finished the predictions on: None
