In [1]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch import nn
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm
import transformers
from transformers import *
import numpy as np
# import evaluate
# from tqdm import tqdm

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Device: cuda


In [3]:
from google.colab import drive
drive.mount('/content/drive')
trainPath = '/content/drive/MyDrive/train_data.csv'

Mounted at /content/drive


In [4]:
df = pd.read_csv(trainPath)

In [5]:
df.head()

Unnamed: 0,review,rating
0,location not palace excellent hotel booke dthe...,4
1,respite definitely not place stay looking ultr...,3
2,stunning truly memorable spot right beach nusa...,4
3,solid business hotel near embassy stayed hotel...,3
4,nice place make sure lock money warning money ...,3


In [6]:
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

In [7]:
unique_ratings = df['rating'].unique()
num_classes = len(unique_ratings)

num_classes

5

In [8]:
rating_counts = Counter(df['rating'])

classes = []
for rating, count in rating_counts.items():
    classes.append(count)
classes = torch.tensor(classes)
print(classes)
class_weights = 1.0 / classes
class_weights = class_weights / class_weights.sum()
print(class_weights)

tensor([7243, 4831, 1747, 1434, 1137])
tensor([0.0554, 0.0830, 0.2295, 0.2796, 0.3526])


In [10]:
# train_reviews = torch.tensor(train_dataset['review'].values)
# train_ratings = torch.tensor(train_dataset['rating'].values)
# val_reviews = torch.tensor(eval_dataset['review'].values)
# val_ratings = torch.tensor(eval_dataset['rating'].values)

In [31]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("MutazYoune/Absa_AspectSentiment_hotels")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForSequenceClassification.from_pretrained("MutazYoune/Absa_AspectSentiment_hotels")

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--MutazYoune--Absa_AspectSentiment_hotels/snapshots/be75f8d59f178f496fde1f16e95e70444d246e41/config.json
Model config BertConfig {
  "_name_or_path": "MutazYoune/Absa_AspectSentiment_hotels",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "

In [32]:
num_ftrs = model.classifier.in_features

In [33]:
model.classifier = nn.Linear(num_ftrs, num_classes)

In [34]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [35]:
import torch
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.texts = df['review'].tolist()
        self.labels = df['rating'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(text,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_length,
                                  return_tensors='pt')

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [36]:
train_dataset = MyDataset(df=train_df, tokenizer=tokenizer)
eval_dataset = MyDataset(df=eval_df, tokenizer=tokenizer)

In [37]:
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=32)

In [18]:
# tokenized = tokenizer(train_dataset['review'].tolist(), padding=True, truncation=True, return_tensors="pt")

In [38]:
optimizer = AdamW(model.parameters(), lr=2e-5)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [39]:
num_epochs = 5


optimizer = AdamW(model.parameters(), lr=2e-5)
model.to(device)
loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(device))

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fct(outputs.logits,labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Avg Train Loss: {avg_train_loss}")

    # Validation
    model.eval()
    eval_loss = 0
    num_eval_steps = 0
    correct_predictions = 0

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fct(outputs.logits,labels)
            eval_loss += loss.item()
            num_eval_steps += 1

            predictions = torch.argmax(outputs.logits, dim=-1)
            correct_predictions += torch.sum(predictions == labels).item()

    avg_eval_loss = eval_loss / num_eval_steps
    accuracy = correct_predictions / len(eval_dataset)
    print(f"Avg Eval Loss: {avg_eval_loss}")
    print(f"Accuracy: {accuracy}")

Epoch 0: 100%|██████████| 820/820 [02:14<00:00,  6.09it/s]


Avg Train Loss: 0.9026425372173147


Evaluating: 100%|██████████| 103/103 [00:12<00:00,  8.45it/s]


Avg Eval Loss: 0.8564262748921959
Accuracy: 0.5397987191216834


Epoch 1: 100%|██████████| 820/820 [02:14<00:00,  6.11it/s]


Avg Train Loss: 0.7787219546917008


Evaluating: 100%|██████████| 103/103 [00:12<00:00,  8.45it/s]


Avg Eval Loss: 0.7884163451426237
Accuracy: 0.5980481854223849


Epoch 2: 100%|██████████| 820/820 [02:14<00:00,  6.10it/s]


Avg Train Loss: 0.715353227388568


Evaluating: 100%|██████████| 103/103 [00:12<00:00,  8.50it/s]


Avg Eval Loss: 0.7814889040965478
Accuracy: 0.6029277218664227


Epoch 3: 100%|██████████| 820/820 [02:14<00:00,  6.11it/s]


Avg Train Loss: 0.6590407595038414


Evaluating: 100%|██████████| 103/103 [00:12<00:00,  8.48it/s]


Avg Eval Loss: 0.8303347878085757
Accuracy: 0.6087221713937176


Epoch 4: 100%|██████████| 820/820 [02:14<00:00,  6.10it/s]


Avg Train Loss: 0.5882120820988969


Evaluating: 100%|██████████| 103/103 [00:12<00:00,  8.43it/s]

Avg Eval Loss: 0.853489642004365
Accuracy: 0.6004879536444038





##GPT-2

In [58]:
from transformers import GPT2Tokenizer
from transformers import GPT2ForSequenceClassification


tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id


model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=5)
model.to(device)


loading file vocab.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/tokenizer_config.json
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/tokenizer.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=5, bias=False)
)

In [59]:
train_dataset = MyDataset(df=train_df, tokenizer=tokenizer)
eval_dataset = MyDataset(df=eval_df, tokenizer=tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=1)

In [60]:
num_epochs = 3


optimizer = AdamW(model.parameters(), lr=2e-5)

loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(device))

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fct(outputs.logits,labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Avg Train Loss: {avg_train_loss}")

    # Validation
    model.eval()
    eval_loss = 0
    num_eval_steps = 0
    correct_predictions = 0

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fct(outputs.logits,labels)
            eval_loss += loss.item()
            num_eval_steps += 1

            predictions = torch.argmax(outputs.logits, dim=-1)
            correct_predictions += torch.sum(predictions == labels).item()

    avg_eval_loss = eval_loss / num_eval_steps
    accuracy = correct_predictions / len(eval_dataset)
    print(f"Avg Eval Loss: {avg_eval_loss}")
    print(f"Accuracy: {accuracy}")

Epoch 0: 100%|██████████| 13113/13113 [11:44<00:00, 18.61it/s]


Avg Train Loss: 0.9319411011099955


Evaluating: 100%|██████████| 3279/3279 [00:42<00:00, 76.54it/s]


Avg Eval Loss: 0.7893561969598407
Accuracy: 0.6532479414455626


Epoch 1: 100%|██████████| 13113/13113 [11:52<00:00, 18.40it/s]


Avg Train Loss: 0.7430655059431411


Evaluating: 100%|██████████| 3279/3279 [00:42<00:00, 77.30it/s]


Avg Eval Loss: 0.7926195949496666
Accuracy: 0.6495882891125343


Epoch 2: 100%|██████████| 13113/13113 [12:01<00:00, 18.18it/s]


Avg Train Loss: 0.6272046854735378


Evaluating: 100%|██████████| 3279/3279 [00:43<00:00, 75.38it/s]

Avg Eval Loss: 0.8225592181020672
Accuracy: 0.6349496797804208





In [56]:
import csv
def save_predictions_to_csv(model, dataloader, filename, device):
    model.eval()
    predictions = []
    labels = []

    # Iterate over batches in the dataloader
    for batch in tqdm(dataloader, desc="Generating Predictions"):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Predictions
            batch_predictions = torch.argmax(logits, dim=-1)
            predictions.extend(batch_predictions.tolist())

    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for prediction in predictions:
            writer.writerow([prediction])


    print(f"Predictions saved to {filename}")

In [41]:
testPath = '/content/drive/MyDrive/test_data.csv'

In [61]:
df = pd.read_csv(testPath)
df.rename(columns={df.columns[0]: 'review'}, inplace=True)
df['rating'] = 0
print(df['review'])
test_dataset = MyDataset(df=df, tokenizer=tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=1)

0       n't return overall disappointed hotel, no hot ...
1       great value location desired problem hotel loc...
2       kind helpfull people people kind helpful.we no...
3       absolutely fabulous melia comfortable star hot...
4       trip hell thoughts gotten, trip airport van no...
                              ...                        
4093    cockroaches dirty carpeting not consider 10 de...
4094    ca n't wait return, husband stayed el san juan...
4095    coming home stay wind chimes inn like coming h...
4096    good hotel great location stayed apsis splendi...
4097    major ripoff, ripoff, place dump capital d. ar...
Name: review, Length: 4098, dtype: object


In [63]:
save_predictions_to_csv(model, test_dataloader, 'gpt.csv', device)

Generating Predictions: 100%|██████████| 4098/4098 [00:54<00:00, 75.86it/s]

Predictions saved to gpt.csv



