In [None]:
import io
import pandas as pd
from google.colab import files
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!pip install pyarrow==14.0.2
!pip install datasets==2.11.0
!pip install transformers[torch]
!pip install accelerate -U




In [None]:
import torch
import transformers
import datasets
import accelerate

# Report the installed version of each core library, one per line, so the
# environment used for this run is recorded alongside the results.
print(
    f"Torch version: {torch.__version__}",
    f"Transformers version: {transformers.__version__}",
    f"Datasets version: {datasets.__version__}",
    f"Accelerate version: {accelerate.__version__}",
    sep="\n",
)

Torch version: 2.3.0+cu121
Transformers version: 4.41.2
Datasets version: 2.11.0
Accelerate version: 0.31.0


In [None]:
# Mount Google Drive at /content/drive; the review CSVs loaded below are read
# from a folder under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Cargar las reseñas de los medios especializados

In [None]:
# Base folder on Drive that holds the review CSV files; reused when loading
# the user reviews later in the notebook.
ruta = "/content/drive/MyDrive/Colab Notebooks/TFG TLOU Reviews/"

# Reviews published by specialised outlets (critics).
critic_reviews = pd.read_csv(f"{ruta}critic_reviews_g2.csv")

In [None]:
# Preview the first rows of the critic reviews exactly as loaded from the CSV.
critic_reviews.head()

Unnamed: 0,id,review,date,score
0,GameMAG,The Last of Us Part II is the high point of th...,2020-09-13,90.0
1,PLAY! Zine,Not everyone is going to enjoy the motives of ...,2020-09-05,93.0
2,,If you loved the original for its great charac...,2020-09-03,75.0
3,,"Prepare for anger, fear, pain and disappointme...",2020-08-10,100.0
4,,"Brave, brutal, dark. The Last of Us 2 is not f...",2020-07-22,80.0


In [None]:
# Keep only what the model needs: the outlet name ('id') and the publication
# date are discarded.
# NOTE: not idempotent - re-running this cell raises KeyError because the
# columns are already gone.
critic_reviews = critic_reviews.drop(['id', 'date'], axis='columns')

In [None]:
# Confirm that 'id' and 'date' are gone after the drop.
critic_reviews.head()

Unnamed: 0,review,score
0,The Last of Us Part II is the high point of th...,90.0
1,Not everyone is going to enjoy the motives of ...,93.0
2,If you loved the original for its great charac...,75.0
3,"Prepare for anger, fear, pain and disappointme...",100.0
4,"Brave, brutal, dark. The Last of Us 2 is not f...",80.0


## Escalar los valores de las puntuaciones de los críticos

In [None]:
# Discard every review that has no score: a row without a target value cannot
# be used for training or evaluation.
critic_reviews = critic_reviews.dropna(axis='index', how='any', subset=['score'])

In [None]:
# Preview the critic reviews after removing rows without a score.
critic_reviews.head()

Unnamed: 0,review,score
0,The Last of Us Part II is the high point of th...,90.0
1,Not everyone is going to enjoy the motives of ...,93.0
2,If you loved the original for its great charac...,75.0
3,"Prepare for anger, fear, pain and disappointme...",100.0
4,"Brave, brutal, dark. The Last of Us 2 is not f...",80.0


In [None]:
# Divide the critic scores by 10 (e.g. 90.0 -> 9.0, 93.0 -> 9.3).
# NOTE(review): presumably this aligns them with the scale of the user scores
# - confirm against the user data.
# NOTE: not idempotent - running this cell twice divides by 10 twice.
critic_reviews['score'] = critic_reviews['score'].div(10)
critic_reviews.head()

Unnamed: 0,review,score
0,The Last of Us Part II is the high point of th...,9.0
1,Not everyone is going to enjoy the motives of ...,9.3
2,If you loved the original for its great charac...,7.5
3,"Prepare for anger, fear, pain and disappointme...",10.0
4,"Brave, brutal, dark. The Last of Us 2 is not f...",8.0


In [None]:
# Round each rescaled score to the nearest whole number and store it as an integer.
# NOTE(review): pandas rounding follows NumPy, which is understood to send exact
# .5 ties to the nearest even integer (8.5 -> 8) - confirm this is acceptable.
critic_reviews['score'] = critic_reviews['score'].round(0).astype(int)

In [None]:
# Check that the scores are now whole numbers.
critic_reviews.head()

Unnamed: 0,review,score
0,The Last of Us Part II is the high point of th...,9
1,Not everyone is going to enjoy the motives of ...,9
2,If you loved the original for its great charac...,8
3,"Prepare for anger, fear, pain and disappointme...",10
4,"Brave, brutal, dark. The Last of Us 2 is not f...",8


In [None]:
# (rows, columns) of the cleaned critic reviews.
critic_reviews.shape

(121, 2)

# Cargar todas las reseñas de los usuarios traducidas

In [None]:
# User reviews, read from the same Drive folder as the critic reviews; per the
# file name these are the translated ones.
user_reviews = pd.read_csv(f"{ruta}full_translated_g2_reviews.csv")

In [None]:
# Preview the first rows of the user reviews exactly as loaded from the CSV.
user_reviews.head()

Unnamed: 0,id,review,type_review,date,views,votes,score
0,Araset,"Well, it's most definetly sheeet and peess and...",expanded,2021-03-16,2,2,1
1,realistyalanci,"Pathetic.,Disappointment. Inconsistency.,Cring...",expanded,2021-03-16,3,3,0
2,wingZero21,I really enjoyed the first game. It was a 10/1...,normal,2021-03-14,2,2,6
3,echo360calix,This game is a very sad destruction of an amaz...,normal,2021-03-14,2,2,0
4,sa674,"It’s a different game from part one, not bad o...",expanded,2021-03-13,2,2,2


In [None]:
# Strip the metadata columns so the frame has the same shape as critic_reviews
# (review text + score).
# NOTE: not idempotent - re-running this cell raises KeyError because the
# columns are already gone.
user_reviews = user_reviews.drop(
    ['id', 'type_review', 'date', 'views', 'votes'],
    axis='columns',
)

In [None]:
# Confirm that the metadata columns are gone after the drop.
user_reviews.head()

Unnamed: 0,review,score
0,"Well, it's most definetly sheeet and peess and...",1
1,"Pathetic.,Disappointment. Inconsistency.,Cring...",0
2,I really enjoyed the first game. It was a 10/1...,6
3,This game is a very sad destruction of an amaz...,0
4,"It’s a different game from part one, not bad o...",2


In [None]:
# (rows, columns) of the cleaned user reviews.
user_reviews.shape

(30605, 2)

# Unir user_reviews con critic_reviews

In [None]:
# Stack user and critic reviews into a single frame (user rows first).
# ignore_index=True renumbers the rows 0..N-1; without it both inputs keep their
# own labels, so labels 0..120 would appear twice and label-based lookups such
# as df.loc[0] would silently return two rows.
df = pd.concat([user_reviews, critic_reviews], axis=0, ignore_index=True)

In [None]:
# Preview the combined frame; the first rows come from user_reviews because it
# was passed first to concat.
df.head()

Unnamed: 0,review,score
0,"Well, it's most definetly sheeet and peess and...",1
1,"Pathetic.,Disappointment. Inconsistency.,Cring...",0
2,I really enjoyed the first game. It was a 10/1...,6
3,This game is a very sad destruction of an amaz...,0
4,"It’s a different game from part one, not bad o...",2


In [None]:
# (rows, columns) of the combined frame; the row count should equal the sum of
# the user and critic row counts.
df.shape

(30726, 2)

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import RobertaTokenizer

# Hold out 20% of the reviews for evaluation; the fixed random_state keeps the
# split identical between runs.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Wrap each pandas split in a Hugging Face Dataset and show their summaries.
train_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, test_df)
)
print(train_dataset, test_dataset, sep="\n")

Dataset({
    features: ['review', 'score', '__index_level_0__'],
    num_rows: 24580
})
Dataset({
    features: ['review', 'score', '__index_level_0__'],
    num_rows: 6146
})


In [None]:
from transformers import RobertaTokenizer
# Tokenizer matching the 'roberta-large' checkpoint that is fine-tuned below.
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')


def tokenize_function(examples):
    """Tokenize the 'review' texts of a batch.

    Args:
        examples: batch handed over by ``Dataset.map(..., batched=True)``;
            only its 'review' entry is read.

    Returns:
        The tokenizer output for the batch, requested with
        ``padding="max_length"`` and ``truncation=True`` so that all examples
        come back with the same fixed length.
    """
    encoded = tokenizer(
        examples['review'],
        truncation=True,
        padding="max_length",
    )
    return encoded


# Tokenize both splits batch-wise; the tokenizer outputs are added as new columns.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/24580 [00:00<?, ? examples/s]

Map:   0%|          | 0/6146 [00:00<?, ? examples/s]

In [None]:
# List the columns after tokenization to see what the tokenizer added.
print(train_dataset.column_names)

['review', 'score', '__index_level_0__', 'input_ids', 'attention_mask']


In [None]:
from datasets import Value

# Remove the leftover pandas index column created by Dataset.from_pandas;
# it carries no information for the model.
train_dataset = train_dataset.remove_columns(['__index_level_0__'])
test_dataset = test_dataset.remove_columns(['__index_level_0__'])

# Rename 'score' to 'labels', the column name the Trainer passes to the model
# as the target.
train_dataset = train_dataset.rename_column('score', 'labels')
test_dataset = test_dataset.rename_column('score', 'labels')

# The model is loaded with num_labels=1, i.e. as a regression head trained with
# an MSE loss. That loss needs floating-point targets; the scores are integers,
# which typically makes training fail with a Long-vs-Float dtype error, so cast
# them to float32 here.
train_dataset = train_dataset.cast_column('labels', Value('float32'))
test_dataset = test_dataset.cast_column('labels', Value('float32'))

# Return PyTorch tensors when the datasets are indexed.
train_dataset.set_format('torch')
test_dataset.set_format('torch')

In [None]:
# Verify the final set of columns after the clean-up and rename.
print(train_dataset.column_names)

['review', 'labels', 'input_ids', 'attention_mask']


# Configurar RoBERTa

In [None]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

# Load the pretrained 'roberta-large' encoder with a freshly initialised
# classification head that has a single output unit (num_labels=1); Transformers
# treats a one-label head as a regression model.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-large',
    num_labels=1,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tuning configuration.
training_args = TrainingArguments(
    output_dir='./results',           # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # 'evaluation_strategy' is deprecated since transformers 4.41 in favour of
    # 'eval_strategy'.
    eval_strategy='epoch',
    save_strategy='epoch',            # must match the evaluation schedule for load_best_model_at_end
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    # No compute_metrics function is given to the Trainer, so the only metric
    # available at evaluation time is 'eval_loss'. Asking for 'mse' would abort
    # the run when the best checkpoint is selected. With a one-label regression
    # head the evaluation loss is the mean squared error.
    metric_for_best_model='eval_loss',
    # The metric is an error: lower is better.
    greater_is_better=False,
    fp16=True                         # mixed-precision training (needs a CUDA GPU)
)



In [None]:
# Wire the model, the training configuration and both data splits together.
# The held-out split doubles as the evaluation set that runs every epoch.
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Launch fine-tuning; the value returned by train() is shown as the cell output.
trainer.train()