In [1]:
import torch
from transformers import AlbertTokenizer, AlbertForSequenceClassification #BertTokenizer, BertForSequenceClassification,
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import pandas as pd

In [2]:
# Read the three pre-split review CSVs (train / validation / test) into DataFrames.
df_train, df_validation, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/game-reviews/train.csv',
        '/kaggle/input/game-reviews/validation.csv',
        '/kaggle/input/game-reviews/test.csv',
    )
)

In [3]:
# Preview the first five training rows to sanity-check the column layout.
df_train.head(n=5)

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,460,Black Squad,2018.0,"Early Access ReviewVery great shooter, that ha...",1
1,2166,Tree of Savior (English Ver.),2016.0,I love love love playing this game!Super 100%!...,1
2,17242,Eternal Card Game,2016.0,Early Access ReviewAs a fan of MTG and Hearths...,1
3,6959,Tactical Monsters Rumble Arena,2018.0,Turn based strategy game similiar to FF Tactic...,1
4,8807,Yu-Gi-Oh! Duel Links,2017.0,This game has an insanely huge download for be...,0


In [None]:
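# # Disabled alternative: load the pre-trained BERT model and tokenizer instead of ALBERT
# # (the Bert* imports are commented out on the import line; restore them before enabling this cell)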
# model_name = 'bert-base-uncased'
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForSequenceClassification.from_pretrained(model_name)

In [4]:
# Checkpoint identifier passed to both from_pretrained calls below, so the tokenizer
# and the sequence-classification model are loaded from the same checkpoint.
# NOTE(review): the name suggests ALBERT-base v2 already fine-tuned on IMDB — confirm on the model card.
model_name = 'textattack/albert-base-v2-imdb'
tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

In [5]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Texts indexed by position. May be a pandas Series (read through
            ``.iloc`` so a non-default index is harmless) or any plain
            sequence such as a list.
        labels: Integer class labels aligned position-by-position with ``texts``.
        tokenizer: Callable tokenizer that accepts the keyword arguments used in
            ``__getitem__`` and returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every encoding is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail early: a mismatch would otherwise only surface mid-epoch as an
        # IndexError (or be silently ignored when labels is the longer one).
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _item_at(values, idx):
        """Return the element at position ``idx``, using ``.iloc`` when ``values`` provides it."""
        positional = getattr(values, 'iloc', None)
        return values[idx] if positional is None else positional[idx]

    def __getitem__(self, idx):
        """Return the raw text, its fixed-length encoding and its label for position ``idx``."""
        text = str(self._item_at(self.texts, idx))
        label = self._item_at(self.labels, idx)

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which takes the same arguments for a single text.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # flatten() drops the leading batch dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [6]:
# Maximum sequence length in tokens. It is passed to CustomDataset as `max_len`,
# which pads or truncates every tokenized review to exactly this length.
MAX_LEN = 128

In [7]:
def _reviews_dataset(frame):
    """Wrap one split's `user_review` texts and `user_suggestion` labels in a CustomDataset."""
    return CustomDataset(frame['user_review'], frame['user_suggestion'], tokenizer, MAX_LEN)


# Build one tokenizing dataset per split (train / validation / test).
train_dataset, val_dataset, test_dataset = map(
    _reviews_dataset, (df_train, df_validation, df_test)
)

In [9]:
# Define training parameters
batch_size = 32  # number of samples per DataLoader batch
epochs = 30  # number of passes over train_loader in the training loop
lr = 2e-5  # learning rate given to Adam
# The optimizer is handed every model parameter.
# NOTE(review): a later cell sets requires_grad=False on everything except
# model.classifier, so presumably only the classifier head is actually updated — confirm.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [10]:
# Batch each split. Only the training split is shuffled; validation and test
# batches are served in dataset order.
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=reshuffle)
    for split, reshuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

## **Accuracy of Bi-GRU**
* Training Accuracy: 86.12183252223528%
* Validation Accuracy: 82.51109370921431%
* Test Accuracy: 83.24197337509788%

In [11]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU,
# then move the model onto it.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

In [12]:
device

device(type='cuda')

In [13]:
# Train only the classification head: turn gradients off for every parameter
# of the network, then back on for the parameters of `model.classifier`.
model.requires_grad_(False)
model.classifier.requires_grad_(True)

In [14]:
def calculate_accuracy(model, loader, device):
    """Return the classification accuracy of ``model`` over ``loader`` as a percentage.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` and returning
            an object with a ``logits`` attribute; the predicted class is the
            argmax over dimension 1 of those logits.
        loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        ``100 * correct / total`` over all samples seen, or ``0.0`` when the
        loader yields no samples (instead of raising ``ZeroDivisionError``).

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():  # inference only: no autograd bookkeeping needed
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Nothing was evaluated; report 0% rather than dividing by zero.
        return 0.0
    return 100 * correct / total


In [16]:
from torch.nn.utils import clip_grad_norm_

# Fine-tune the model, validating after every epoch and checkpointing the weights
# whenever the mean validation loss reaches a new minimum. The best checkpoint is
# reloaded afterwards and scored on the training and validation splits.
BEST_MODEL_PATH = 'best_model.pth'  # file the best epoch's state_dict is written to
MAX_GRAD_NORM = 1.0  # upper bound on the total gradient norm before each optimizer step

best_val_loss = float('inf')  # lowest mean validation loss seen so far
best_epoch = -1  # zero-based epoch of that loss; -1 means no checkpoint was saved in this run

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    total_val_loss = 0

    # Training
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        # Gradient clipping
        clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
        optimizer.step()

        total_train_loss += loss.item()

    # Mean of the per-batch losses (not weighted by batch size).
    avg_train_loss = total_train_loss / len(train_loader)

    # Validation
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)

    # Check if the current validation loss is the lowest; if so, save the model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_epoch = epoch
        torch.save(model.state_dict(), BEST_MODEL_PATH)  # Save the best model

    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

# Print the best epoch and its validation loss
print(f"The lowest validation loss was {best_val_loss:.4f} at epoch {best_epoch + 1}")

# If no epoch ever improved on +inf (epochs == 0, or every validation loss was NaN),
# nothing was written in this run; loading would silently pick up a stale file from
# an earlier run, so stop with an explicit error instead.
if best_epoch < 0:
    raise RuntimeError(
        f"No checkpoint was saved during this run (epochs={epochs}); "
        f"refusing to load a possibly stale '{BEST_MODEL_PATH}'."
    )

# Load the best model and calculate accuracy.
# weights_only=True restricts unpickling to tensors and plain containers (and avoids
# the torch.load FutureWarning); map_location places the tensors on the active device.
model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device, weights_only=True))
train_accuracy = calculate_accuracy(model, train_loader, device)
val_accuracy = calculate_accuracy(model, val_loader, device)

print(f'Best Model Training Accuracy: {train_accuracy:.2f}%')
print(f'Best Model Validation Accuracy: {val_accuracy:.2f}%')


Epoch 1/30, Training Loss: 0.4039, Validation Loss: 0.3866
Epoch 2/30, Training Loss: 0.3940, Validation Loss: 0.3803
Epoch 3/30, Training Loss: 0.3874, Validation Loss: 0.3760
Epoch 4/30, Training Loss: 0.3834, Validation Loss: 0.3736
Epoch 5/30, Training Loss: 0.3802, Validation Loss: 0.3709
Epoch 6/30, Training Loss: 0.3777, Validation Loss: 0.3684
Epoch 7/30, Training Loss: 0.3754, Validation Loss: 0.3670
Epoch 8/30, Training Loss: 0.3740, Validation Loss: 0.3650
Epoch 9/30, Training Loss: 0.3713, Validation Loss: 0.3631
Epoch 10/30, Training Loss: 0.3697, Validation Loss: 0.3618
Epoch 11/30, Training Loss: 0.3692, Validation Loss: 0.3608
Epoch 12/30, Training Loss: 0.3674, Validation Loss: 0.3591
Epoch 13/30, Training Loss: 0.3654, Validation Loss: 0.3583
Epoch 14/30, Training Loss: 0.3653, Validation Loss: 0.3589
Epoch 15/30, Training Loss: 0.3645, Validation Loss: 0.3564
Epoch 16/30, Training Loss: 0.3631, Validation Loss: 0.3564
Epoch 17/30, Training Loss: 0.3617, Validation Lo

  model.load_state_dict(torch.load('best_model.pth'))


Best Model Training Accuracy: 84.81%
Best Model Validation Accuracy: 84.03%


In [17]:
# Score the best checkpoint on the test split.
# weights_only=True restricts unpickling to tensors and plain containers (and avoids
# the torch.load FutureWarning); map_location places the tensors on the active device.
model.load_state_dict(torch.load('best_model.pth', map_location=device, weights_only=True))
test_accuracy = calculate_accuracy(model,test_loader, device)
print(f'Test Accuracy: {test_accuracy}%')

  model.load_state_dict(torch.load('best_model.pth'))


Test Accuracy: 85.25189245627773%
