# Задание 2

**Для выполнения данного задания необходимо построить более сложную модель для решения своей задачи.** На сегодняшний день, вне зависимости от конкретной постановки задачи в NLP, вероятнее всего, лучшее качество будет демонстрировать модель с трансформерной архитектурой, поэтому вам необходимо:

- <font color='green'>(status)</font> выбрать в HuggingFace Hub модель, подходящую для вашей задачи
- <font color='green'>(status)</font> дообучить модель на своих данных
- <font color='green'>(status)</font> замерить качество работы модели до и после обучения с помощью выбранной метрики 

# Main

In [41]:
import time

import datasets
import evaluate
import numpy as np
import pandas as pd
import torch
from IPython.display import clear_output
from scipy.special import softmax
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

## Data preparation

In [42]:
def df_preparation(csv_file_name):
    """Load a sentiment CSV and return a clean ``text`` / ``label`` DataFrame.

    The four columns of the file are renamed to ``id``, ``entity``,
    ``sentiment`` and ``text``. Rows whose sentiment is ``'Irrelevant'`` are
    discarded, the remaining sentiments are encoded as integers
    (Negative=0, Neutral=1, Positive=2), and rows with any missing value are
    dropped.

    Args:
        csv_file_name: Path to a CSV file with exactly four columns.

    Returns:
        A DataFrame with a string ``text`` column and an int64 ``label`` column.

    NOTE(review): ``pd.read_csv`` is called with its default header handling,
    so the first row of the file is consumed as a header. If the CSV files
    have no header row, one record is lost per file -- confirm.
    """
    label_by_sentiment = {'Negative': 0, 'Neutral': 1, 'Positive': 2}

    df = pd.read_csv(csv_file_name)
    df = df.set_axis(['id', 'entity', 'sentiment', 'text'], axis=1)

    # Work on an explicit copy so the column assignments below do not target
    # a filtered view of the original frame.
    df = df[df['sentiment'] != 'Irrelevant'].copy()
    df['label'] = df['sentiment'].map(label_by_sentiment)

    # Drop missing values BEFORE casting text to str: casting first would turn
    # NaN into the literal string 'nan', which dropna() can no longer detect.
    df = df.dropna()
    df['text'] = df['text'].astype(str)
    # Unmapped sentiments produce NaN labels (float dtype) before the drop;
    # restore a proper integer dtype for the classifier.
    df['label'] = df['label'].astype('int64')

    df = df.drop(columns=['id', 'entity', 'sentiment'])

    return df

In [43]:
# Run both CSV splits through the same cleaning routine.
df_valid = df_preparation('twitter_validation.csv')
df_train = df_preparation('twitter_training.csv')

# Use the pandas-specific constructor rather than from_dict (which expects a
# plain dict). preserve_index=False keeps the non-contiguous DataFrame index,
# left over from row filtering, out of the dataset columns.
dataset = datasets.DatasetDict({
    "train": datasets.Dataset.from_pandas(df_train, preserve_index=False),
    "test": datasets.Dataset.from_pandas(df_valid, preserve_index=False),
})

In [44]:
# Hub checkpoint used in this notebook. Alternative checkpoints kept for reference:
#   cardiffnlp/twitter-roberta-base-sentiment-latest
#   cardiffnlp/twitter-roberta-base-sentiment
model_name = "elozano/tweet_sentiment_eval"

# Load tokenizer, config and classification model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with max-length padding and truncation."""
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Tokenize every split of the DatasetDict in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/61691 [00:00<?, ? examples/s]

Map:   0%|          | 0/828 [00:00<?, ? examples/s]

# Metrics before training

In [45]:
# Score the current model on the validation split, one example at a time,
# and print a per-class classification report.
dataset_running = dataset['test']
reference = []
prediction = []
count_full = dataset_running.shape[0]
# Report progress roughly every 1% of the data; clamp to 1 so that datasets
# with fewer than 50 rows do not cause a modulo-by-zero.
progress_step = max(1, round(count_full * 0.01))
start_time = time.time()

model.eval()
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    for count, ds in enumerate(dataset_running):
        # truncation=True guards against texts longer than the tokenizer's maximum length.
        encoded_input = tokenizer(ds['text'], return_tensors='pt', truncation=True)
        output = model(**encoded_input)
        # The arg-max of the raw logits equals the arg-max of their softmax.
        logits = output[0][0].numpy()

        prediction.append(int(np.argmax(logits)))
        reference.append(ds['label'])
        if count % progress_step == 0:
            clear_output(wait=True)
            print(f"{np.round(count / count_full * 100, 1)}% - {np.round(time.time() - start_time)} sec")

# Passing labels explicitly keeps target_names aligned even if a class never
# occurs in the references or predictions.
print(classification_report(reference, prediction, labels=[0, 1, 2],
                            target_names=['Negative', 'Neutral', 'Positive']))

del reference, prediction

99.5% - 75.0 sec
              precision    recall  f1-score   support

    Negative       0.62      0.71      0.66       266
     Neutral       0.53      0.29      0.37       285
    Positive       0.61      0.82      0.70       277

    accuracy                           0.60       828
   macro avg       0.59      0.60      0.58       828
weighted avg       0.59      0.60      0.57       828



# Training

In [49]:
# Fractions of each tokenized split used for this run.
train_fraction = 0.02
eval_fraction = 1.0

train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]

# Shuffle with a fixed seed, then keep the leading fraction of each split.
small_train_dataset = train_split.shuffle(seed=42).select(
    range(int(train_split.shape[0] * train_fraction))
)
small_eval_dataset = eval_split.shuffle(seed=42).select(
    range(int(eval_split.shape[0] * eval_fraction))
)

# Directory where the model and tokenizer are written after training.
save_directory = './pt_save_pretrained'

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair."""
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted_classes, references=labels)


# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir='test_trainer',
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)



In [50]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  0%|          | 0/465 [00:00<?, ?it/s]

  0%|          | 0/104 [00:00<?, ?it/s]

{'eval_loss': 0.7827932834625244, 'eval_accuracy': 0.6859903381642513, 'eval_runtime': 1666.101, 'eval_samples_per_second': 0.497, 'eval_steps_per_second': 0.062, 'epoch': 1.0}


  0%|          | 0/104 [00:00<?, ?it/s]

{'eval_loss': 0.8389763832092285, 'eval_accuracy': 0.6956521739130435, 'eval_runtime': 1666.8506, 'eval_samples_per_second': 0.497, 'eval_steps_per_second': 0.062, 'epoch': 2.0}


  0%|          | 0/104 [00:00<?, ?it/s]

{'eval_loss': 0.919695258140564, 'eval_accuracy': 0.7125603864734299, 'eval_runtime': 1684.217, 'eval_samples_per_second': 0.492, 'eval_steps_per_second': 0.062, 'epoch': 3.0}
{'train_runtime': 20135.1046, 'train_samples_per_second': 0.184, 'train_steps_per_second': 0.023, 'train_loss': 0.6590914818548387, 'epoch': 3.0}


TrainOutput(global_step=465, training_loss=0.6590914818548387, metrics={'train_runtime': 20135.1046, 'train_samples_per_second': 0.184, 'train_steps_per_second': 0.023, 'total_flos': 973256532175872.0, 'train_loss': 0.6590914818548387, 'epoch': 3.0})

In [51]:
# Persist the tokenizer and the fine-tuned model to save_directory.
# model.save_pretrained() already writes the model's own config.json, so the
# separate `config` object loaded before training is no longer saved on top of
# it (that would overwrite the fine-tuned model's config with the stale one).
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

# Metrics after training

In [52]:
# Reload the tokenizer and classification model from the local save directory
# so the evaluation uses the persisted artifacts.
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForSequenceClassification.from_pretrained(save_directory)

In [53]:
# Score the reloaded model on the validation split, one example at a time,
# and print a per-class classification report.
dataset_running = dataset['test']
reference = []
prediction = []
count_full = dataset_running.shape[0]
# Report progress roughly every 1% of the data; clamp to 1 so that datasets
# with fewer than 50 rows do not cause a modulo-by-zero.
progress_step = max(1, round(count_full * 0.01))
start_time = time.time()

model.eval()
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    for count, ds in enumerate(dataset_running):
        # truncation=True guards against texts longer than the tokenizer's maximum length.
        encoded_input = tokenizer(ds['text'], return_tensors='pt', truncation=True)
        output = model(**encoded_input)
        # The arg-max of the raw logits equals the arg-max of their softmax.
        logits = output[0][0].numpy()

        prediction.append(int(np.argmax(logits)))
        reference.append(ds['label'])
        if count % progress_step == 0:
            clear_output(wait=True)
            print(f"{np.round(count / count_full * 100, 1)}% - {np.round(time.time() - start_time)} sec")

# Passing labels explicitly keeps target_names aligned even if a class never
# occurs in the references or predictions.
print(classification_report(reference, prediction, labels=[0, 1, 2],
                            target_names=['Negative', 'Neutral', 'Positive']))

del reference, prediction

99.5% - 75.0 sec
              precision    recall  f1-score   support

    Negative       0.70      0.79      0.74       266
     Neutral       0.71      0.57      0.63       285
    Positive       0.72      0.78      0.75       277

    accuracy                           0.71       828
   macro avg       0.71      0.72      0.71       828
weighted avg       0.71      0.71      0.71       828

