In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Step 1: Install dependencies
!pip install transformers datasets evaluate huggingface_hub tqdm torch

  pid, fd = os.forkpty()


Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [3]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, Trainer, TrainingArguments, pipeline
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader, RandomSampler
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, AdamW
from datasets import load_dataset
from tqdm import tqdm
import torch
from transformers import BertTokenizer, BertForQuestionAnswering, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers.data.processors.squad import SquadProcessor
from transformers import squad_convert_examples_to_features
from transformers import AutoTokenizer
from tqdm.auto import tqdm
import evaluate
import collections
import numpy as np

In [4]:
from datasets import load_from_disk

# Load DatasetDict from Google Drive
dataset_dict_path = '/kaggle/input/news-qa-data/news-qa-data'
datasets = load_from_disk(dataset_dict_path)

# Verify the content
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answers', 'id'],
        num_rows: 57224
    })
    validation: Dataset({
        features: ['context', 'question', 'answers', 'id'],
        num_rows: 14335
    })
})


In [5]:
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [6]:
max_length = 512
stride = 128

def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]

        # Periksa jika jawaban ada
        if len(answer["answer_start"]) == 0 or len(answer["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [7]:
# Mapping dataset tanpa caching
train_dataset = datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=datasets["train"].column_names,
    keep_in_memory=True
)

len(datasets["train"]), len(train_dataset)

Map:   0%|          | 0/57224 [00:00<?, ? examples/s]

(57224, 99693)

In [8]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [9]:
validation_dataset = datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=datasets["validation"].column_names,
    keep_in_memory=True 
)
len(datasets["validation"]), len(validation_dataset)

Map:   0%|          | 0/14335 [00:00<?, ? examples/s]

(14335, 24856)

In [17]:
def compute_metrics(start_logits, end_logits, features, examples):
    metric = evaluate.load("squad")
    n_best = 20  
    max_answer_length = 200  
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": str(example_id), "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": str(example_id), "prediction_text": ""})

    # Format references to match SQuAD evaluation
    theoretical_answers = [{"id": str(ex["id"]), "answers": ex["answers"]} for ex in examples]

    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [11]:
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import TrainingArguments

args = TrainingArguments(
    "distilbert-finetuned-newsqa-squad",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=False,
)



In [13]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
500,3.1523
1000,2.3773
1500,1.9477
2000,1.8343
2500,1.7501
3000,1.6611
3500,1.6507
4000,1.6442
4500,1.5899
5000,1.5589


TrainOutput(global_step=6231, training_loss=1.8400946151691768, metrics={'train_runtime': 3039.1506, 'train_samples_per_second': 32.803, 'train_steps_per_second': 2.05, 'total_flos': 1.3025199501490176e+16, 'train_loss': 1.8400946151691768, 'epoch': 1.0})

In [18]:
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, datasets["validation"])

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

  0%|          | 0/14335 [00:00<?, ?it/s]

{'exact_match': 41.48587373561214, 'f1': 55.77324510831456}

In [19]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer, device='cuda')
context = """
AutoML: Automated Machine Learning tools like Auto-sklearn and H2O.ai simplify the process of building machine learning models.
"""
question = "What are some tools for automated machine learning?"
question_answerer(question=question, context=context)

{'score': 0.006397548597306013,
 'start': 47,
 'end': 68,
 'answer': 'Auto-sklearn and H2O.'}

In [20]:
from huggingface_hub import HfApi, HfFolder

api = HfApi()
token = "_______________"
HfFolder.save_token(token)

In [21]:
model.push_to_hub("distilbert-uncased-newsqa-squad")
tokenizer.push_to_hub("distilbert-uncased-newsqa-squad")

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Prasetyow12/distilbert-uncased-newsqa-squad/commit/51c31c6ad6c8cf01da3e1ece547f84b61fa7a233', commit_message='Upload tokenizer', commit_description='', oid='51c31c6ad6c8cf01da3e1ece547f84b61fa7a233', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
# Load model dan tokenizer dari Hugging Face Hub
model_distilbert_qa = AutoModelForQuestionAnswering.from_pretrained("Prasetyow12/distilbert-uncased-newsqa-squad")
tokenizer_distilbert_qa = AutoTokenizer.from_pretrained("Prasetyow12/distilbert-uncased-newsqa-squad")

config.json:   0%|          | 0.00/561 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [23]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import re

# Load model and tokenizer from Hugging Face
#model = AutoModelForQuestionAnswering.from_pretrained("Prasetyow12/distilbert-uncased-newsqa-finetuned-squad")
#tokenizer = AutoTokenizer.from_pretrained("Prasetyow12/distilbert-uncased-newsqa-finetuned-squad")

# Buat pipeline untuk inference
question_answerer = pipeline("question-answering", model=model_distilbert_qa, tokenizer=tokenizer_distilbert_qa)

# Contoh konteks
context = """
(CNN) -- Police are investigating whether or what family issues might have prompted a California man to shoot six of his family members -- killing five -- before committing suicide. His wife was critically wounded. Authorities on Tuesday said Devan Kalathat, 42, shot his family Sunday night at his Santa Clara townhouse, killing two adults and three children. Kalathat killed his 11-year-old son, Akhil Dev; his 4-year-old daughter, Negha Dev; his 35-year-old brother-in-law Ashok Appu Poothemkandi; Poothemkandi's 25-year-old wife, Suchitra Sivaraman; and the couple's 11-month-old daughter, Ahana. Kalathat's 34-year-old wife, who was not identified, sustained multiple gunshot wounds and remains in critical condition, said Lt. Phil Cooke. "Family dynamics and personal relationships may have played a factor," Cooke told reporters Tuesday. He said Kalathat was employed as an engineer and nothing indicated he was facing "layoff or financial crisis." Investigators believe Kalathat used two .45-caliber semi-automatic pistols, both of which he owned. Cooke said Kalathat bought one of the pistols in February and the other nearly two weeks ago -- roughly the same time his wife's brother, Poothemkandi, arrived in California from India with Suchitra Sivaraman and Ahana. Cooke noted that Poothemkandi was an "educated professional" with plans to stay in the Bay Area to work on a project for a high-tech firm. Police were called after a neighbor noticed Kalathat's wounded wife outside the home around 8:30 p.m. (11:30 p.m. ET), Cooke said. When police arrived, other victims were found around the kitchen and dining room in what Cooke described as "a very gruesome scene." The family shooting comes just two months after a Los Angeles father who, after he and his wife were fired from their jobs, killed her and their five young children before turning the gun on himself.
"""
question = "What did police say was a factor?"

# Fungsi untuk cleaning teks
def clean_text(text):
    # Definisikan pola regex untuk berbagai cleaning
    url_pattern = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
    hashtag_pattern = re.compile(r'#\w+', re.IGNORECASE)
    double_space_pattern = re.compile(r'\s\s+')
    header_pattern = re.compile(r'^.*?--\s?', re.IGNORECASE)
    video_pattern = re.compile(r'VIDEO:.*?(?:\.\s|$)', re.IGNORECASE)

    # Hapus URL
    text = url_pattern.sub('', text)

    # Hapus hashtag
    text = hashtag_pattern.sub('', text)

    # Cek jika ada '--' dalam 40 karakter pertama
    if '--' in text[:40]:
        # Hapus header sebelum '--'
        text = header_pattern.sub('', text).strip()

    # Hapus frasa "VIDEO:" hingga titik
    text = video_pattern.sub('', text)

    # Hapus double space
    text = double_space_pattern.sub(' ', text)

    # Trim leading and trailing spaces
    text = text.strip()

    return text

# Gunakan clean_text untuk cleaning konteks
cleaned_context = clean_text(context)

# Lakukan inference menggunakan konteks yang sudah dicleaning
result = question_answerer(question=question, context=cleaned_context)

# Output hasil inference
print(result)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


{'score': 0.3340967893600464, 'start': 744, 'end': 787, 'answer': '"Family dynamics and personal relationships'}
