In [8]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertModel, AdamW, get_linear_schedule_with_warmup, BertTokenizerFast


from torch.cuda.amp import GradScaler, autocast

In [9]:

class MultiLabelDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

class MultiLabelClassifier(nn.Module):
    def __init__(self):
        super(MultiLabelClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.classifier = nn.ModuleList([nn.Linear(768, 9) for _ in range(6)])

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = [classifier(outputs.last_hidden_state[:, 0, :]) for classifier in self.classifier]
        return logits


In [10]:
# Load data
df = pd.read_csv('train.csv')

# Define grades
grades = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

In [8]:
df.shape

(3911, 8)

In [11]:
# Preprocess data
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
encodings = tokenizer(df['full_text'].tolist(), truncation=True, padding=True)

# Convert labels to tensor and subtract 1 because the model predicts from 0 to 8 (9 classes)
labels = torch.tensor(df[grades].values) - 1

In [12]:
# Convert data into PyTorch dataset
dataset = MultiLabelDataset(encodings, labels)

# Split data into training and testing sets
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

# Define model and move it to GPU
model = MultiLabelClassifier()
model.to('cuda')

# Define training parameters
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=len(train_dataset))

# Define loss function (CrossEntropyLoss expects the class indices)
loss_fn = nn.CrossEntropyLoss()


# Initialize gradient scaler for mixed precision training
scaler = GradScaler()



In [13]:
# Train model with early stopping after 5 epochs without improvement
best_loss = float('inf')
epochs_without_improvement = 0
n_epochs_stop = 5

In [None]:
# Initialize a new model with the same architecture
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6).to('cuda')
model = MultiLabelClassifier()


# Load the saved weights of the best trained model
model.load_state_dict(torch.load('best_model_v3.pt'))
model.to('cuda')

In [14]:


for epoch in range(100):  # maximum of 100 epochs
    model.train()
    n = 0
    for batch in DataLoader(train_dataset, batch_size=16, shuffle=True):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        labels = batch['labels'].to('cuda')
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Compute loss and backpropagate
        loss = sum(loss_fn(output.float(), label.long()) for output, label in zip(outputs, labels.t()))
        loss_train = loss
        
        loss.backward()
        print(loss_train, end="\r")
        print(n, end="\r")
        
        optimizer.step()
        scheduler.step()
        n +=1
    # Evaluate model on validation set
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in DataLoader(test_dataset, batch_size=128):
            input_ids = batch['input_ids'].to('cuda')
            attention_mask = batch['attention_mask'].to('cuda')
            labels = batch['labels'].to('cuda')

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            val_loss += sum(loss_fn(output.float(), label.long()) for output, label in zip(outputs, labels.t()))
    print(f'Epoch {epoch}, Train Loss: {loss_train}, Val Loss: {val_loss}')
    # Early stopping
    if val_loss < best_loss:
        best_loss = val_loss
        epochs_without_improvement = 0
        
        # Save best model
        torch.save(model.state_dict(), 'best_model_v3.pt')
    else:
        epochs_without_improvement += 1
        
    if epochs_without_improvement >= n_epochs_stop:
        break

print("Training completed.")


# Освобождаем видеопамять после выполнения операций
torch.cuda.empty_cache()

  item['labels'] = torch.tensor(self.labels[idx])


_96sor(6.9120, device='cuda:0', grad_fn=<AddBackward0>))
Epoch 0, Train Loss: 6.912009239196777, Val Loss: 47.34738540649414
_92sor(4.7416, device='cuda:0', grad_fn=<AddBackward0>)
Epoch 1, Train Loss: 4.741606712341309, Val Loss: 39.83797073364258
_88sor(5.8756, device='cuda:0', grad_fn=<AddBackward0>)
Epoch 2, Train Loss: 5.875617980957031, Val Loss: 36.51759338378906
_84sor(5.0260, device='cuda:0', grad_fn=<AddBackward0>)
Epoch 3, Train Loss: 5.026007652282715, Val Loss: 35.900390625
_80sor(3.6304, device='cuda:0', grad_fn=<AddBackward0>)
Epoch 4, Train Loss: 3.6303796768188477, Val Loss: 36.120601654052734
1118or(4.4483, device='cuda:0', grad_fn=<AddBackward0>)

KeyboardInterrupt: 

In [15]:
# Освобождаем видеопамять после выполнения операций
torch.cuda.empty_cache()

In [16]:
# Load the test data
test_df = pd.read_csv('test.csv')
test_essays = test_df['full_text'].values

In [17]:
# Initialize a new model with the same architecture
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6).to('cuda')
model = MultiLabelClassifier()


# Load the saved weights of the best trained model
model.load_state_dict(torch.load('best_model_v3.pt'))
model.to('cuda')
model.eval()  # Set the model to evaluation mode

MultiLabelClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [18]:
class EssayDatasetTest(Dataset):
    def __init__(self, essays, targets, tokenizer, max_len):
        self.essays = essays
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.essays)

    def __getitem__(self, item):
        essay = str(self.essays[item])
        # target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            essay,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
            )
          # For test data, you can return None or a placeholder for targets
        return {
            'essay_text': essay,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            
        }

In [19]:
# Create a test dataset
test_dataset = EssayDatasetTest(test_essays, None, tokenizer, max_len=512)  # 'None' for targets

# Create a test data loader
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
predictions = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Assuming you have a multi-label classification task
        # You might need to adjust this part based on your specific task
        batch_predictions = torch.sigmoid(logits).cpu().numpy()
        predictions.extend(batch_predictions)

# 'predictions' now contains the model's predictions for the test data

In [None]:
print(predictions[2])