In [3]:
!pip install optuna
!pip install datasets



In [4]:
import pandas as pd
from transformers import AutoModelForQuestionAnswering, AutoTokenizer,pipeline,AdamW , get_scheduler
import torch
from accelerate import Accelerator
import tqdm
from datasets import load_dataset
import re
from torch.utils.data import DataLoader

In [5]:
# Load the "kazqad" configuration of the issai/kazqad dataset (train/validation/test splits).
ds = load_dataset(path="issai/kazqad", name="kazqad")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.88k [00:00<?, ?B/s]

(…)ing-comprehension-v1.0-kk-train.jsonl.gz:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

(…)omprehension-v1.0-kk-validation.jsonl.gz:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)ding-comprehension-v1.0-kk-test.jsonl.gz:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3163 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/764 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2713 [00:00<?, ? examples/s]

In [6]:
# Free-text input columns that handle_dataframe lower-cases and strips of markup.
text_data = ["context", "question"]

In [7]:
# Convert each split to a pandas DataFrame (same order as the names on the left).
df_train, df_test, df_val = (
    ds[split_name].to_pandas() for split_name in ("train", "test", "validation")
)

In [8]:
def handle_dataframe(df, text_columns=None):
    """Flatten the ``answers`` column and normalise the text columns of a QA frame.

    Each cell of ``df['answers']`` is expected to be a dict whose ``text`` and
    ``answer_start`` entries are sequences (list or numpy array). Only the first
    answer of every row is kept.

    Args:
        df: DataFrame with an ``answers`` column plus the columns to clean.
        text_columns: Names of the input text columns to clean. Defaults to the
            module-level ``text_data`` list. The extracted ``text`` (answer)
            column is always cleaned as well.

    Returns:
        A new DataFrame with ``answers`` replaced by its flattened keys, where
        ``text`` is the cleaned first answer string and ``answer_start`` is the
        first start offset. Rows without any answer get ``""`` and ``-1``.
        The index of ``df`` is preserved.

    Note:
        ``answer_start`` still refers to the *uncleaned* context; because cleaning
        removes characters, it is not a valid offset into the cleaned context.
    """
    # The original class `[-.,;()-/""*#;]` contained the accidental range `)-/`,
    # which also matches `*`, `+`, `,`, `-` and `.`. This explicit class matches
    # exactly the same characters without relying on that range.
    symbol_pattern = re.compile(r'[-.,;()*+/"#]')
    # `;` has already been stripped by the time this runs, so entities such as
    # `&lt;b&gt;` appear as `&ltb&gt`.
    html_pattern = re.compile(r"&lt.*?&gt")
    # Removed in this order, longest heading marker first.
    markup_tokens = ("====", "===", "==", "&amp")

    def first_item(value, default):
        """Return the first element of a sequence-like cell, or the cell itself if scalar."""
        if isinstance(value, (str, bytes)):
            return value
        try:
            size = len(value)
        except TypeError:
            # Scalars (ints, floats, NaN) have no length; keep them unchanged.
            return value
        return value[0] if size else default

    def clean_text(text):
        """Lower-case ``text`` and strip punctuation, wiki heading markers and HTML entities."""
        text = symbol_pattern.sub("", text.lower())
        for token in markup_tokens:
            text = text.replace(token, "").strip()
        return html_pattern.sub("", text)

    if text_columns is None:
        text_columns = text_data

    # Build the flattened answers from a plain list and re-attach the caller's index,
    # so the column-wise concat lines rows up even when `df` has a non-default index.
    answers_flat = pd.json_normalize(df['answers'].tolist())
    answers_flat.index = df.index
    df_expanded = pd.concat([df.drop(columns=['answers']), answers_flat], axis=1)

    # Keep only the first answer. This works for lists and numpy arrays alike.
    df_expanded['text'] = df_expanded['text'].apply(lambda cell: first_item(cell, ""))
    df_expanded['answer_start'] = df_expanded['answer_start'].apply(lambda cell: first_item(cell, -1))

    for column in list(text_columns) + ['text']:
        df_expanded[column] = df_expanded[column].map(clean_text)
    return df_expanded

In [9]:
# Apply the same flattening/cleaning to every split, in train, test, validation order.
df_train_expanded, df_test_expanded, df_val_expanded = map(
    handle_dataframe, (df_train, df_test, df_val)
)

In [10]:
from torch.utils.data import Dataset
class QADataset(Dataset):
    """Torch dataset that encodes (question, context) pairs for extractive QA.

    Each item is tokenized as ``tokenizer(question, context)`` and labelled with
    the token indices of the first occurrence of the answer string inside the
    context.

    Args:
        contexts: Sequence of context strings.
        questions: Sequence of question strings, aligned with ``contexts``.
        answers: Sequence of answer strings, aligned with ``contexts``.
        tokenizer: Callable tokenizer whose result supports ``char_to_token``
            (in Hugging Face this requires a "fast" tokenizer).
        max_len: Length every encoded pair is padded/truncated to.
    """

    def __init__(self, contexts, questions, answers, tokenizer, max_len):
        self.contexts = contexts
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.contexts)

    @staticmethod
    def _locate_answer(encoding, context, answer):
        """Return ``(start_token, end_token)`` of ``answer`` in the encoded context.

        Falls back to ``(0, 0)`` when the answer is empty, does not occur in the
        context, or is cut off (fully or partly) by truncation, so the two labels
        are always consistent with each other.
        """
        if not answer:
            return 0, 0
        char_start = context.find(answer)
        if char_start == -1:
            return 0, 0
        char_end = char_start + len(answer) - 1  # inclusive last character

        # The context is the SECOND sequence of the pair, so the character offsets
        # must be resolved with sequence_index=1. The default (0) would map them
        # onto tokens of the question.
        start_token = encoding.char_to_token(0, char_start, sequence_index=1)
        end_token = encoding.char_to_token(0, char_end, sequence_index=1)
        if start_token is None or end_token is None:
            return 0, 0
        return start_token, end_token

    def __getitem__(self, idx):
        """Return input ids, attention mask and answer span labels for item ``idx``."""
        context = self.contexts[idx]
        question = self.questions[idx]
        answer = self.answers[idx]

        # Tokenize the question-context pair
        encoding = self.tokenizer(
            question,
            context,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        start_positions, end_positions = self._locate_answer(encoding, context, answer)

        return {
            # return_tensors='pt' yields a leading batch dimension of 1; drop it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'start_positions': torch.tensor(start_positions, dtype=torch.long),
            'end_positions': torch.tensor(end_positions, dtype=torch.long),
        }

In [11]:
def compute_exact_match(prediction, ground_truth):
    """Return 1 if both strings are equal once surrounding whitespace is trimmed, else 0."""
    trimmed_prediction = prediction.strip()
    trimmed_truth = ground_truth.strip()
    return 1 if trimmed_prediction == trimmed_truth else 0

In [None]:
import optuna
from transformers import AdamW, get_scheduler

def objective(trial):
    """Optuna objective: fine-tune the QA model and score it on the validation split.

    Samples learning rate, batch size, maximum sequence length and scheduler type
    from ``trial``, fine-tunes ``nur-dev/roberta-kaz-large`` for three epochs on
    ``df_train_expanded`` and evaluates on ``df_val_expanded``. The test split is
    deliberately not touched here, so it stays unseen for the final evaluation.

    Args:
        trial: The ``optuna`` trial used to sample hyper-parameters.

    Returns:
        Mean exact-match score (between 0 and 1) over the validation examples,
        or 0.0 if no example was evaluated.
    """
    # suggest_float(..., log=True) is the supported replacement for the deprecated
    # suggest_loguniform and samples from the same log-uniform range.
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 5e-5, log=True)
    batch_size = trial.suggest_categorical('batch_size', [8, 16])
    max_len = trial.suggest_int('max_len', 128, 256, step=64)
    scheduler_type = trial.suggest_categorical('scheduler_type', ['linear', 'cosine'])

    num_epochs = 3
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    repo_id = 'nur-dev/roberta-kaz-large'
    model = AutoModelForQuestionAnswering.from_pretrained(repo_id)
    model = model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    # torch's AdamW replaces the deprecated transformers.AdamW. eps and weight_decay
    # are set explicitly to the values the transformers implementation used by default.
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

    def build_dataset(frame):
        """Wrap one cleaned split in a QADataset using this trial's tokenizer settings."""
        return QADataset(
            contexts=frame['context'].tolist(),
            questions=frame['question'].tolist(),
            answers=frame['text'].tolist(),
            tokenizer=tokenizer,
            max_len=max_len,
        )

    train_loader = DataLoader(build_dataset(df_train_expanded), batch_size=batch_size, shuffle=True)
    # Hyper-parameters are selected on the validation split. The original code
    # evaluated on the test split here, which leaks it into model selection.
    val_loader = DataLoader(build_dataset(df_val_expanded), batch_size=batch_size)

    num_training_steps = num_epochs * len(train_loader)
    # scheduler_type is either 'linear' or 'cosine', both valid get_scheduler names.
    scheduler = get_scheduler(
        scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    model.train()
    for epoch in range(num_epochs):
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    model.eval()
    total_em = 0
    total_samples = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)

            # Plain Python ints avoid per-sample tensor indexing in the loop below.
            predicted_start = torch.argmax(outputs.start_logits, dim=1).tolist()
            predicted_end = torch.argmax(outputs.end_logits, dim=1).tolist()
            gold_start = batch['start_positions'].tolist()
            gold_end = batch['end_positions'].tolist()

            for token_ids, g_start, g_end, p_start, p_end in zip(
                input_ids, gold_start, gold_end, predicted_start, predicted_end
            ):
                # The reference answer is decoded from the labelled token span so that
                # both strings go through the same tokenizer round-trip.
                true_answer = tokenizer.decode(token_ids[g_start:g_end + 1], skip_special_tokens=True)
                predicted_answer = tokenizer.decode(token_ids[p_start:p_end + 1], skip_special_tokens=True)

                total_em += compute_exact_match(predicted_answer, true_answer)
                total_samples += 1

    if total_samples == 0:
        # An empty validation split would otherwise raise ZeroDivisionError.
        return 0.0
    return total_em / total_samples

# Search for the configuration with the highest objective value over 15 trials.
study = optuna.create_study(direction='maximize')
study.optimize(func=objective, n_trials=15)

[I 2024-11-03 10:06:06,107] A new study created in memory with name: no-name-c84c03cf-8804-4cda-b4f0-a8f62097d46e
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 5e-5)


config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at nur-dev/roberta-kaz-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.85M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]



In [None]:
# Report the hyper-parameters of the best trial found by the study.
best_params = study.best_params
summary_line = f"Best hyperparameters: {best_params}"
print(summary_line)