In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [3]:
import pandas as pd
from transformers import AutoModelForQuestionAnswering, AutoTokenizer,pipeline,AdamW , get_scheduler
from transformers import Trainer, TrainingArguments
import torch
from accelerate import Accelerator
import tqdm
from datasets import load_dataset
import re

In [4]:
# Load the "kazqad" configuration of the issai/kazqad dataset; the result is
# indexed by split name ('train', 'validation', 'test') further down.
ds = load_dataset("issai/kazqad", "kazqad")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.88k [00:00<?, ?B/s]

(…)ing-comprehension-v1.0-kk-train.jsonl.gz:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

(…)omprehension-v1.0-kk-validation.jsonl.gz:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)ding-comprehension-v1.0-kk-test.jsonl.gz:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3163 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/764 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2713 [00:00<?, ? examples/s]

In [5]:
# Materialise every split as a pandas DataFrame (same order as the targets).
df_train, df_test, df_val = (
    ds[split_name].to_pandas() for split_name in ('train', 'test', 'validation')
)

In [8]:
# Free-text input columns that handle_dataframe() lower-cases and cleans
# (it cleans the extracted answer column 'text' in addition to these).
text_data = ['context', 'question']

In [9]:
def handle_dataframe(df, text_columns=None):
    """Flatten the ``answers`` column and normalise the text columns of a split.

    The ``answers`` dicts are expanded into their own columns, the first entry
    of ``text`` and ``answer_start`` is kept for every row, and each text
    column is lower-cased and stripped of punctuation / wiki-markup residue.

    Args:
        df: DataFrame with an ``answers`` column of dicts carrying ``text`` and
            ``answer_start`` sequences, plus the columns in ``text_columns``.
        text_columns: Names of the free-text columns to clean. Defaults to the
            notebook-level ``text_data`` list when it exists, otherwise to
            ``('context', 'question')``.

    Returns:
        A new DataFrame without ``answers`` and with scalar ``text`` and
        ``answer_start`` columns. Rows whose answer sequence is empty get
        ``''`` and ``-1`` respectively.

    Note:
        ``answer_start`` is carried over unchanged, so after characters have
        been removed from ``context`` it no longer indexes the cleaned text.
    """
    if text_columns is None:
        try:
            text_columns = text_data  # notebook-level default
        except NameError:
            text_columns = ('context', 'question')

    # Same characters the original class `[-.,;()-/""*#;]` matched: its `)-/`
    # range silently covered `)*+,-./`, so `+` is kept here on purpose.
    symbol_pattern = r'[-.,;()*+/"#]'
    # HTML-escaped tags; the trailing `;` is already gone by the time this runs.
    html_pattern = r"&lt.*?&gt"
    # Removed one after another, longest heading marker first.
    markup_literals = ("====", "===", "==", "&amp")

    def first_or_default(value, default):
        """Return the first element of a list/tuple/array, else the value itself."""
        if isinstance(value, str) or not hasattr(value, "__len__"):
            return value
        return value[0] if len(value) else default

    # Re-use the source index so the column-wise concat cannot misalign rows
    # when `df` does not carry a default RangeIndex.
    answers_flat = pd.json_normalize(df['answers'].tolist())
    answers_flat.index = df.index
    df_expanded = pd.concat([df.drop(columns=['answers']), answers_flat], axis=1)

    # Index exactly once: doing it twice turned a list-valued answer into the
    # first *character* of the text and raised TypeError on the offset.
    df_expanded['text'] = df_expanded['text'].apply(lambda v: first_or_default(v, ""))
    df_expanded['answer_start'] = df_expanded['answer_start'].apply(
        lambda v: first_or_default(v, -1)
    )

    for column in list(text_columns) + ['text']:
        cleaned = df_expanded[column].str.lower().str.replace(symbol_pattern, "", regex=True)
        for literal in markup_literals:
            cleaned = cleaned.str.replace(literal, "", regex=False).str.strip()
        df_expanded[column] = cleaned.str.replace(html_pattern, "", regex=True)
    return df_expanded

In [10]:
# Clean every split with the same routine; results keep the train/test/val order.
df_train_expanded, df_test_expanded, df_val_expanded = map(
    handle_dataframe, (df_train, df_test, df_val)
)

In [11]:
from torch.utils.data import Dataset
class QADataset(Dataset):
    """Torch dataset yielding tokenised (question, context) pairs with answer labels.

    Each item holds ``input_ids`` and ``attention_mask`` padded/truncated to
    ``max_len``, plus ``start_positions`` / ``end_positions``: the token indices
    of the first occurrence of the answer string inside the context. Both
    labels are 0 (the first token of the encoded pair) when the answer is
    empty, is not found in the context, or does not map to tokens of the
    encoded context (``char_to_token`` returned ``None``).

    Args:
        contexts: Sequence of context strings.
        questions: Sequence of question strings, aligned with ``contexts``.
        answers: Sequence of answer strings, aligned with ``contexts``.
        tokenizer: Callable tokenizer whose output supports ``char_to_token``
            (assumed to be a Hugging Face *fast* tokenizer).
        max_len: Fixed sequence length of every encoded example.
    """

    def __init__(self, contexts, questions, answers, tokenizer, max_len):
        self.contexts = contexts
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples (one per context)."""
        return len(self.contexts)

    def _answer_token_span(self, encoding, context, answer):
        """Return ``(start_token, end_token)`` of ``answer`` in ``context``, or ``(0, 0)``."""
        if not answer:
            return 0, 0

        char_start = context.find(answer)
        if char_start == -1:
            return 0, 0
        char_end = char_start + len(answer) - 1  # inclusive last character

        # The context is the SECOND sequence of the pair, so its character
        # offsets must be resolved with sequence_index=1; the default (0)
        # would look the offsets up inside the question instead.
        start_token = encoding.char_to_token(0, char_start, sequence_index=1)
        end_token = encoding.char_to_token(0, char_end, sequence_index=1)

        # Either end missing means the span is unusable; never emit a half
        # label such as (start, 0).
        if start_token is None or end_token is None:
            return 0, 0
        return start_token, end_token

    def __getitem__(self, idx):
        """Encode example ``idx`` and return its tensors and span labels."""
        context = self.contexts[idx]
        question = self.questions[idx]
        answer = self.answers[idx]

        # Tokenize the question-context pair
        encoding = self.tokenizer(
            question,
            context,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        start_positions, end_positions = self._answer_token_span(encoding, context, answer)

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'start_positions': torch.tensor(start_positions, dtype=torch.long),
            'end_positions': torch.tensor(end_positions, dtype=torch.long),
        }


In [12]:
# Load the checkpoint with a question-answering head, plus its tokenizer.
# The checkpoint ships no QA-head weights (see the "newly initialized" warning
# in the output), so the head must be trained before predictions are meaningful.
repo_id = 'nur-dev/roberta-kaz-large'
model = AutoModelForQuestionAnswering.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at nur-dev/roberta-kaz-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.85M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

In [13]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader


# Build one QADataset per cleaned split, all sharing the tokenizer and a
# 128-token sequence length.
def _as_qa_dataset(frame):
    """Wrap the context/question/text columns of a cleaned split in a QADataset."""
    return QADataset(
        contexts=frame['context'].tolist(),
        questions=frame['question'].tolist(),
        answers=frame['text'].tolist(),
        tokenizer=tokenizer,
        max_len=128
    )


# NOTE: the names are crossed relative to the splits they hold.
# `test_dataset` wraps the *validation* split and `val_dataset` wraps the
# *test* split; the names are kept because later cells refer to them.
train_dataset = _as_qa_dataset(df_train_expanded)
test_dataset = _as_qa_dataset(df_val_expanded)
val_dataset = _as_qa_dataset(df_test_expanded)

# Batch the datasets; only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)

In [14]:
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import torch

# Set the device (GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Set up the optimizer and scheduler.
# Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated in favour
# of it. `eps` and `weight_decay` are spelled out to mirror the defaults of the
# class being replaced (torch's own defaults are 1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# The literal 3 is the epoch count; it has to match `num_epochs` in the
# training cell or the linear decay will end too early / too late.
total_steps = len(train_loader) * 3  # Number of training epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)




In [15]:
from sklearn.metrics import f1_score
from transformers import BertTokenizerFast
import torch
import numpy as np
from tqdm import tqdm

def compute_exact_match(prediction, ground_truth):
    """Return 1 if both strings are equal once surrounding whitespace is trimmed, else 0."""
    trimmed_prediction = prediction.strip()
    trimmed_truth = ground_truth.strip()
    return 1 if trimmed_prediction == trimmed_truth else 0

def compute_f1(prediction, ground_truth):
    """Token-level F1 between a predicted and a reference answer string.

    Both strings are split on whitespace and compared as multisets, so a token
    that occurs several times counts several times (a plain ``set``
    intersection scores ``"a a"`` against ``"a a"`` as 0.5 instead of 1.0).

    Args:
        prediction: Predicted answer text.
        ground_truth: Reference answer text.

    Returns:
        F1 in ``[0, 1]``. When either string has no tokens the score is 1.0 if
        both are empty and 0.0 otherwise, which keeps it consistent with
        ``compute_exact_match`` on empty answers.
    """
    from collections import Counter  # local import: keeps the cell self-contained

    pred_tokens = prediction.split()
    gt_tokens = ground_truth.split()

    # Precision/recall are undefined without tokens; score agreement instead.
    if not pred_tokens or not gt_tokens:
        return float(pred_tokens == gt_tokens)

    # Multiset intersection: min(count in prediction, count in reference) per token.
    overlap = sum((Counter(pred_tokens) & Counter(gt_tokens)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(gt_tokens)
    return 2 * (precision * recall) / (precision + recall)

# Training loop: fine-tune `model` for `num_epochs` passes over `train_loader`
# and, after every epoch, score it on `test_loader` (which is built from the
# validation split). Reported per epoch: mean training loss, start/end position
# accuracy, exact match and token-level F1.
# NOTE: the scheduler's total step count is computed with a literal 3 in the
# optimizer cell; keep `num_epochs` in sync with it.
num_epochs = 3  # Number of epochs
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    model.train()
    total_loss = 0

    # Training phase
    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Passing the label positions makes the model return the span loss.
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()  # stepped once per batch, matching total_steps

    avg_train_loss = total_loss / len(train_loader)
    print(f"  Average Training Loss: {avg_train_loss:.4f}")

    # Evaluation phase (metrics calculation per epoch)
    model.eval()
    total_f1 = 0
    total_em = 0
    accurate_predictions = 0
    total_samples = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            # Get model predictions
            outputs = model(input_ids, attention_mask=attention_mask)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

            # Get the highest-scoring start and end tokens
            predicted_start = torch.argmax(start_logits, dim=1)
            predicted_end = torch.argmax(end_logits, dim=1)

            # Calculate accuracy for start and end positions
            start_accuracy = (predicted_start == start_positions).sum().item()
            end_accuracy = (predicted_end == end_positions).sum().item()
            accurate_predictions += start_accuracy + end_accuracy

            # Convert token positions back to text for EM and F1.
            # The reference answer is rebuilt by decoding the labelled token
            # span, not taken from the original answer string.
            for i in range(input_ids.size(0)):
                input_id = input_ids[i]
                true_answer = tokenizer.decode(input_id[start_positions[i]:end_positions[i]+1], skip_special_tokens=True)
                predicted_answer = tokenizer.decode(input_id[predicted_start[i]:predicted_end[i]+1], skip_special_tokens=True)

                # Compute EM and F1 for each answer
                em = compute_exact_match(predicted_answer, true_answer)
                f1 = compute_f1(predicted_answer, true_answer)
                total_em += em
                total_f1 += f1

                total_samples += 1

    # Calculate average metrics for the epoch
    accuracy = accurate_predictions / (2 * total_samples)  # Start and end positions both count towards accuracy
    exact_match = total_em / total_samples
    f1_score_avg = total_f1 / total_samples

    # Print epoch results
    print(f"  Epoch {epoch + 1} Metrics:")
    print(f"    Accuracy: {accuracy:.4f}")
    print(f"    Exact Match (EM): {exact_match:.4f}")
    print(f"    F1 Score: {f1_score_avg:.4f}")


Epoch 1/3


  0%|          | 0/198 [00:08<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Final evaluation on `val_loader`, which wraps the *test* split (the
# dataset/loader names are crossed where they are created). Reports start/end
# position accuracy, exact match and token-level F1.
model.eval()
total_f1 = 0
total_em = 0
accurate_predictions = 0
total_samples = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Get model predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        # Get the highest-scoring start and end tokens
        predicted_start = torch.argmax(start_logits, dim=1)
        predicted_end = torch.argmax(end_logits, dim=1)

        # Calculate accuracy for start and end positions
        start_accuracy = (predicted_start == start_positions).sum().item()
        end_accuracy = (predicted_end == end_positions).sum().item()
        accurate_predictions += start_accuracy + end_accuracy

        # Convert token positions back to text for EM and F1.
        # The reference answer is rebuilt by decoding the labelled token span.
        for i in range(input_ids.size(0)):
            input_id = input_ids[i]
            true_answer = tokenizer.decode(input_id[start_positions[i]:end_positions[i]+1], skip_special_tokens=True)
            predicted_answer = tokenizer.decode(input_id[predicted_start[i]:predicted_end[i]+1], skip_special_tokens=True)

            # Compute EM and F1 for each answer
            em = compute_exact_match(predicted_answer, true_answer)
            f1 = compute_f1(predicted_answer, true_answer)
            total_em += em
            total_f1 += f1

            total_samples += 1

# Average the metrics over every evaluated sample
accuracy = accurate_predictions / (2 * total_samples)  # Start and end positions both count towards accuracy
exact_match = total_em / total_samples
f1_score_avg = total_f1 / total_samples

# Print the final results
print(f"    Accuracy: {accuracy:.4f}")
print(f"    Exact Match (EM): {exact_match:.4f}")
print(f"    F1 Score: {f1_score_avg:.4f}")