In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition training set. Later cells read the 'prompt', 'A'-'E'
# and 'answer' columns from this frame.
df = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/train.csv')
# Show the first rows as the cell output.
df.head()

In [None]:
!pip install transformers[torch]

In [None]:
from transformers import AutoTokenizer, Trainer, TrainingArguments, BertForMultipleChoice, BertConfig

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
from transformers import DataCollatorWithPadding
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AdamW, get_linear_schedule_with_warmup
from typing import Optional, Union
import torch.nn as nn

In [None]:
#from sklearn.preprocessing import label_binarize
from sklearn.metrics import accuracy_score

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from dataclasses import dataclass

@dataclass
class DataCollatorForMultipleChoice:
    """Collate multiple-choice features into padded per-choice tensors.

    Each feature is a mapping whose values (other than the label) are
    sequences with one entry per answer choice. The choices of all features
    are flattened, padded together via ``tokenizer.pad`` and then reshaped to
    ``(batch_size, num_choices, -1)``.
    """

    # Tokenizer whose ``pad`` method is used to pad the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Build one batch from a list of feature dicts.

        Args:
            features: non-empty list of dicts holding per-choice sequences
                plus a ``'label'`` or ``'labels'`` entry.

        Returns:
            dict of tensors shaped ``(batch_size, num_choices, -1)`` plus a
            ``'labels'`` int64 tensor of shape ``(batch_size,)``.
        """
        label_name = "label" if "label" in features[0] else "labels"
        # Read the labels without pop(): the caller's feature dicts must not be
        # mutated, otherwise collating the same features twice would fail.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat entry per (example, choice) pair, in example-major order.
        # A single comprehension replaces the quadratic sum(list_of_lists, []).
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Restore the (example, choice) structure that was flattened above.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
# Directory holding the pretrained model files (a Kaggle input dataset path).
model_dir = '/kaggle/input/huggingface-bert/bert-large-uncased'
# Tokenizer loaded from the same directory; the preprocessing functions below
# use this module-level object.
tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [None]:
# Preprocessing the data
def preprocess_data(df):
    """Tokenize every (prompt, option) pair of a labelled question frame.

    Args:
        df: DataFrame with the columns 'prompt', 'A', 'B', 'C', 'D', 'E'
            and 'answer' (the letter of the correct option).

    Returns:
        DataFrame with one row per question and the columns 'input_ids',
        'attention_mask' (each a list with one tokenizer output per option,
        in A-E order) and 'label'.
    """
    option_columns = ('A', 'B', 'C', 'D', 'E')
    records = []

    for _, row in df.iterrows():
        # Encode the prompt paired with each answer option, padded/truncated
        # to a fixed length of 128 tokens.
        encodings = [
            tokenizer(
                row['prompt'],
                row[column],
                truncation=True,
                padding='max_length',
                max_length=128,
                return_tensors='pt',
            )
            for column in option_columns
        ]

        records.append({
            'input_ids': [encoding['input_ids'] for encoding in encodings],
            'attention_mask': [encoding['attention_mask'] for encoding in encodings],
            # Convert the answer letter into a numeric label (0-4).
            'label': ord(row['answer']) - ord('A'),
        })

    return pd.DataFrame(records)

In [None]:
# Tokenize the whole training frame once up front; the result has the columns
# 'input_ids', 'attention_mask' and 'label'.
tokenized_df = preprocess_data(df)

In [None]:
class MultipleChoiceDataset(Dataset):
    """Dataset view over a tokenized frame with a 'label' column.

    Items are dicts with the keys 'input_ids', 'attention_mask' and 'labels'
    (note the plural: the frame's 'label' column is exposed as 'labels').
    """

    # Output key -> source column in the wrapped frame.
    _FIELDS = {
        'input_ids': 'input_ids',
        'attention_mask': 'attention_mask',
        'labels': 'label',
    }

    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Fetch the row once and copy the needed columns out of it.
        row = self.data.iloc[idx]
        return {key: row[column] for key, column in self._FIELDS.items()}

In [None]:
# Wrap the tokenized frame so it can be split and batched in the next cell.
dataset = MultipleChoiceDataset(tokenized_df)

In [None]:
# Batch sizes for the training and validation loaders.
batch_size_train = 5
batch_size_val = 5

# Fraction of the data held out for validation.
validation_ratio = 0.1
# Fixed seed so the train/validation split is identical on every run;
# without it the validation accuracy is not comparable between runs.
split_seed = 42

num_validation = int(validation_ratio * len(dataset))
num_train = len(dataset) - num_validation

train_dataset, val_dataset = random_split(
    dataset,
    [num_train, num_validation],
    generator=torch.Generator().manual_seed(split_seed),
)

# Only the training data is shuffled; validation order stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size_train, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size_val, shuffle=False)

In [None]:
# Loading the model: BERT with a multiple-choice head, initialised from the
# pretrained weights in model_dir.
# NOTE(review): num_labels is set to the number of distinct answer letters in
# the training data; whether the multiple-choice head actually uses this value
# should be confirmed against the transformers documentation.
model = BertForMultipleChoice.from_pretrained(model_dir, num_labels=df['answer'].nunique())

In [None]:
# Define a list to store the even layers
#even_layers = []

# Specify the even layers you want to keep (e.g., every second layer)
#for i, layer in enumerate(original_bert_model.bert.encoder.layer):
#    if i % 2 == 0:
#        even_layers.append(layer)

# Create a new BERT model with even layers
#config = BertConfig.from_pretrained(model_dir)
#config.num_hidden_layers = len(even_layers)  # Update the number of layers
#model = BertForMultipleChoice(config=config)
#model.bert.encoder.layer = nn.ModuleList(even_layers)  # Set the even layers

In [None]:
# Define a list to store the odd layers
#odd_layers = []

# Specify the odd layers you want to keep (e.g., every second layer starting from the first layer)
#for i, layer in enumerate(original_bert_model.bert.encoder.layer):
#    if i % 2 == 1:
#        odd_layers.append(layer)

# Create a new BERT model with odd layers
#config = BertConfig.from_pretrained(model_dir)
#config.num_hidden_layers = len(odd_layers)  # Update the number of layers
#model = BertForMultipleChoice(config=config)
#model.bert.encoder.layer = nn.ModuleList(odd_layers)  # Set the odd layers

In [None]:
# Freeze the first encoder layers so that only the remaining layers (and the
# parts of the model outside this slice) receive gradient updates.
num_layers_to_freeze = 6
frozen_layers = model.bert.encoder.layer[:num_layers_to_freeze]
for param in frozen_layers.parameters():
    param.requires_grad = False

In [None]:
# Move the model to the selected device (GPU when available, otherwise CPU).
model.to(device)

In [None]:
# Training configuration. The manual training loop further down reads
# num_train_epochs from this object; the same object is also handed to the
# Trainer constructed in a later cell.
train_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=15,
    per_device_train_batch_size=batch_size_train,
    evaluation_strategy="epoch",
    save_total_limit=2,
    save_steps=10,
    logging_steps=10,
    #learning_rate=2e-5,
)

In [None]:
# Collator instance that pads with the module-level tokenizer; it is passed to
# the Trainer constructed in a later cell.
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)

In [None]:
# Loss used by the manual training loop.
loss_fn = torch.nn.CrossEntropyLoss()

# Hand only the still-trainable parameters to the optimizer; parameters whose
# requires_grad flag was switched off are left out.
trainable_parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_parameters, lr=4e-5, weight_decay=0.01)

# Scheduling the learning rate: linear schedule without warmup, spanning one
# scheduler step per training batch over all epochs.
num_warmup_steps = 0
num_training_steps = len(train_dataloader) * train_args.num_train_epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
# Trainer expects dataset objects and builds its own DataLoaders from them
# (using data_collator), so pass the split datasets rather than the
# already-batched DataLoaders.
# NOTE(review): no visible cell calls trainer.train(); training is done by the
# manual loop below. Confirm whether this object is still needed.
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:

# Manual fine-tuning loop: one training pass and one validation pass per
# epoch, keeping a checkpoint whenever the validation accuracy improves.
best_val_accuracy = 0.0
num_epochs = int(train_args.num_train_epochs)

for epoch in range(num_epochs):
    model.train()

    train_predictions = []
    train_labels = []

    for batch in train_dataloader:
        # Each collated field is a list with one tensor per answer choice.
        # Stack along dim=1 so the choice axis comes second, then flatten the
        # trailing axes to get (batch, num_choices, seq_len). Stacking along
        # dim=0 would put the choices first, the model would then take the
        # batch axis for the choice axis, and the loss/argmax below would be
        # computed across examples instead of across choices.
        input_ids = torch.stack(batch['input_ids'], dim=1).flatten(start_dim=2).to(device)
        attention_mask = torch.stack(batch['attention_mask'], dim=1).flatten(start_dim=2).to(device)
        # The collated labels are already a tensor; no extra copy is needed.
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # one score per choice for each example

        loss = loss_fn(logits, labels)  # CrossEntropyLoss over the choices
        loss.backward()

        optimizer.step()
        scheduler.step()

        train_predictions.extend(logits.detach().argmax(dim=1).cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    # Validating the model
    model.eval()
    val_predictions = []
    val_labels = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = torch.stack(batch['input_ids'], dim=1).flatten(start_dim=2).to(device)
            attention_mask = torch.stack(batch['attention_mask'], dim=1).flatten(start_dim=2).to(device)
            labels = batch['labels'].to(device)

            # Only the logits are needed here, so no labels are passed in.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = logits.argmax(dim=1).cpu().numpy()

            val_predictions.extend(predictions)
            val_labels.extend(labels.cpu().numpy())

    # Counting accuracy (though it is not the exact metric that is used in the competition)
    train_accuracy = accuracy_score(train_labels, train_predictions)
    val_accuracy = accuracy_score(val_labels, val_predictions)
    print(f"Epoch {epoch+1}/{train_args.num_train_epochs}: Train Accuracy = {train_accuracy:.4f}, Validation Accuracy = {val_accuracy:.4f}")

    # Saving checkpoints: one file per epoch that improves on the best
    # validation accuracy seen so far.
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        checkpoint_path = f"./best_checkpoint_epoch_{epoch+1}.pt"
        torch.save(model.state_dict(), checkpoint_path)
    model.train()

In [None]:
# Saving the tuned model. This writes the weights as they are after the last
# epoch of the loop above, which is not necessarily the epoch with the best
# validation accuracy (those are stored separately by torch.save in the loop).
model.save_pretrained("./fine_tuned_model")

In [None]:
# Load the competition test set; later cells read its 'id', 'prompt' and
# 'A'-'E' columns.
test = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/test.csv')
# Show the first rows as the cell output.
test.head()

In [None]:
# Load the fine-tuned model back from the directory written by
# save_pretrained in the earlier cell. This rebinds the module-level `model`.

model = BertForMultipleChoice.from_pretrained("./fine_tuned_model")


#tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [None]:
def preprocess_test_data(df):
    """Tokenize every (prompt, option) pair of an unlabelled question frame.

    Args:
        df: DataFrame with the columns 'prompt', 'A', 'B', 'C', 'D' and 'E'.

    Returns:
        DataFrame with one row per question and the columns 'input_ids' and
        'attention_mask', each a list with one tokenizer output per option
        in A-E order.
    """
    rows_out = []

    for _, question_row in df.iterrows():
        prompt_text = question_row['prompt']
        ids_per_option = []
        masks_per_option = []

        for letter in ('A', 'B', 'C', 'D', 'E'):
            # Prompt and option are encoded as a pair, padded/truncated to a
            # fixed length of 128 tokens.
            encoded = tokenizer(
                prompt_text,
                question_row[letter],
                truncation=True,
                padding='max_length',
                max_length=128,
                return_tensors='pt',
            )
            ids_per_option.append(encoded['input_ids'])
            masks_per_option.append(encoded['attention_mask'])

        rows_out.append({
            'input_ids': ids_per_option,
            'attention_mask': masks_per_option,
        })

    return pd.DataFrame(rows_out)

# Tokenize the whole test frame; the result has the columns 'input_ids' and
# 'attention_mask'.
tokenized_test = preprocess_test_data(test)

In [None]:
class TestDataset(Dataset):
    """Dataset view over a tokenized frame without labels.

    Items are dicts with the keys 'input_ids' and 'attention_mask'.
    """

    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Look the row up once and expose the two tokenized columns.
        record = self.data.iloc[idx]
        return {column: record[column] for column in ('input_ids', 'attention_mask')}

# Wrap the tokenized test frame and batch it without shuffling, so batches
# come out in the same order as the rows of `test`.
test_dataset = TestDataset(tokenized_test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size_val, shuffle=False)

In [None]:
from collections import defaultdict

In [None]:
# Create a dictionary to store the top-3 predictions for each id
test_predictions = defaultdict(list)

# Map choice indices back to answer letters (A, B, C, D, E)
answer_choices = ["A", "B", "C", "D", "E"]

# The reloaded model has to live on the same device as the inputs, and must
# be in evaluation mode for inference.
model.to(device)
model.eval()

# Number of test rows handled so far; the loader is not shuffled, so this is
# also the position in `test` of the first row of the current batch.
rows_done = 0

with torch.no_grad():
    for batch in test_dataloader:
        # Each collated field is a list with one tensor per answer choice.
        # Stack along dim=1 and flatten the trailing axes to obtain
        # (batch, num_choices, seq_len); stacking along dim=0 would swap the
        # batch and choice axes and rank examples instead of choices.
        input_ids = torch.stack(batch['input_ids'], dim=1).flatten(start_dim=2).to(device)
        attention_mask = torch.stack(batch['attention_mask'], dim=1).flatten(start_dim=2).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # one score per choice for each example

        # Get the top-3 choices for each example in the batch
        _, top3_indices = logits.topk(3, dim=1)
        top3_predictions = top3_indices.cpu().numpy()

        # Iterate over the examples of the batch (not over the choices) and
        # store the top-3 letters under the matching test id.
        examples_in_batch = logits.size(0)
        for i in range(examples_in_batch):
            row_id = test['id'].iloc[rows_done + i]
            batch_predictions = [answer_choices[pred] for pred in top3_predictions[i]]
            test_predictions[row_id].append(" ".join(batch_predictions))
        rows_done += examples_in_batch

In [None]:
# Build one record per test id; the stored prediction strings are joined with
# spaces to form the 'prediction' field of the submission.
submission_data = [
    {'id': row_id, 'prediction': " ".join(ranked_answers)}
    for row_id, ranked_answers in test_predictions.items()
]

In [None]:
# Create a DataFrame from the list of dictionaries (columns 'id' and
# 'prediction').
submission_df = pd.DataFrame(submission_data)

In [None]:
# Display the submission table as the cell output for a visual check.
submission_df

In [None]:
# Save the DataFrame to a CSV file in the working directory; the row index is
# left out so the file only contains the 'id' and 'prediction' columns.
submission_df.to_csv('submission.csv', index=False)