In [None]:
pip install torch



In [None]:
import os
import requests
import json
import torch

In [None]:
# Create the working directory for the downloaded data and the saved model.
# exist_ok=True makes the cell safe to re-run and avoids a check-then-create race.
os.makedirs('arsitektur-qas-pariwisata-bali', exist_ok=True)

In [None]:
# Base URL of the SQuAD dataset files; the download cell appends the file name.
# (The request that used to live here fetched the whole training file and then
# discarded it, because the download loop performs its own request per file.)
url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'

In [None]:
# Download the SQuAD v2.0 train and dev splits into the working directory.
for file in ['train-v2.0.json', 'dev-v2.0.json']:
    # stream=True avoids holding the whole body in memory; the timeout keeps a
    # stalled connection from hanging the notebook forever.
    with requests.get(f'{url}{file}', stream=True, timeout=60) as res:
        # fail loudly instead of saving an HTML error page as a .json file
        res.raise_for_status()
        # write to file
        with open(f'arsitektur-qas-pariwisata-bali/{file}', 'wb') as f:
            # 1 MiB chunks: the previous 4-byte chunks meant millions of tiny writes
            for chunk in res.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

In [None]:
def read_squad(path, limit=None):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    Args:
        path: Path of the JSON file to read.
        limit: Optional fraction; when given, only the first
            ``int(n * limit)`` records of each list are kept.

    Returns:
        A ``(contexts, questions, answers)`` tuple with one entry per answer.
        The context and question are repeated for every answer of a question.
        When a question carries a ``'plausible_answers'`` key, those entries
        are used instead of ``'answers'``.
    """
    with open(path, 'rb') as handle:
        raw = json.load(handle)

    contexts, questions, answers = [], [], []

    for article in raw['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa in paragraph['qas']:
                question_text = qa['question']
                # prefer the plausible answers whenever the key is present
                answer_key = 'plausible_answers' if 'plausible_answers' in qa else 'answers'
                for candidate in qa[answer_key]:
                    contexts.append(passage_text)
                    questions.append(question_text)
                    answers.append(candidate)

    if limit is not None:
        # the three lists always have equal length, so one cut-off serves all
        keep = int(len(contexts) * limit)
        contexts = contexts[:keep]
        questions = questions[:keep]
        answers = answers[:keep]

    return contexts, questions, answers

In [None]:
# Fraction of records to keep from each split (0.5 = the first 50%); the same
# value is passed to read_squad for both the training and the validation file.
limit = 0.5
train_contexts, train_questions, train_answers = read_squad('arsitektur-qas-pariwisata-bali/train-v2.0.json', limit=limit)
val_contexts, val_questions, val_answers = read_squad('arsitektur-qas-pariwisata-bali/dev-v2.0.json', limit=limit)

In [None]:
# Report how many (context, question, answer) records each split holds after
# the limit was applied; the context lists are used as the record count.
print(f"Number of training records: {len(train_contexts)}")
print(f"Number of validation records: {len(val_contexts)}")

Number of training records: 65159
Number of validation records: 13116


In [None]:
def add_end_idx(answers, contexts):
    """Add an exclusive character end index to every answer, in place.

    Each answer dict must provide ``'text'`` and ``'answer_start'``. The
    function writes ``'answer_end'`` (``answer_start + len(text)``) and, when
    the annotated start is off by one or two characters, also corrects
    ``'answer_start'``.

    Args:
        answers: List of answer dicts; mutated in place.
        contexts: List of context strings, parallel to ``answers``.

    Returns:
        None.
    """
    # loop through each answer-context pair
    for answer, context in zip(answers, contexts):
        # gold_text refers to the answer we are expecting to find in context
        gold_text = answer['text']
        # we already know the start index
        start_idx = answer['answer_start']
        # and ideally this would be the end index...
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] == gold_text:
            # the annotation is exact
            answer['answer_end'] = end_idx
            continue

        for n in (1, 2):
            shifted_start = start_idx - n
            if shifted_start < 0:
                # a negative slice start would wrap around to the end of the string
                continue
            if context[shifted_start:end_idx - n] == gold_text:
                # the annotation is off by 'n' characters
                answer['answer_start'] = shifted_start
                answer['answer_end'] = end_idx - n
                # stop at the smallest shift that matches
                break
        else:
            # No alignment found: still record an end index (clamped to the
            # context length) so later code that reads 'answer_end' does not
            # fail with a KeyError.
            answer['answer_end'] = min(end_idx, len(context))

In [None]:
# Add 'answer_end' to the answer dicts of both splits; add_end_idx mutates the
# dicts in place, so nothing is reassigned here.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [None]:
from transformers import DistilBertTokenizerFast
# Load the tokenizer that belongs to the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode (context, question) pairs, context first. Truncation and padding are
# both enabled, so every split is encoded as one padded batch.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
def add_token_positions(encodings, answers, max_length=None):
    """Attach token-level answer labels to ``encodings``, in place.

    For every answer the character span ``[answer_start, answer_end)`` is
    mapped to the index of its first and last token, and the two lists are
    stored on ``encodings`` under ``'start_positions'`` and
    ``'end_positions'``.

    Args:
        encodings: Tokenizer output providing ``char_to_token(batch_index,
            char_index)`` and ``update(dict)``.
        answers: List of answer dicts with ``'answer_start'`` and
            ``'answer_end'`` (exclusive) character indices, parallel to the
            rows of ``encodings``.
        max_length: Label assigned to both positions when the answer start has
            no token (it was truncated away). Defaults to
            ``tokenizer.model_max_length``.

    Returns:
        None.
    """
    if max_length is None:
        max_length = tokenizer.model_max_length

    # token indices of answer start/end, one entry per answer
    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        char_start = answer['answer_start']
        char_end = answer['answer_end']

        start_token = encodings.char_to_token(i, char_start)
        if start_token is None:
            # The answer start was truncated away: mark the whole span as
            # missing instead of pairing a sentinel start with a real end.
            start_positions.append(max_length)
            end_positions.append(max_length)
            continue

        # answer_end is exclusive, so the last answer character is at
        # char_end - 1. Looking up char_end itself would select the token that
        # FOLLOWS the answer (e.g. a trailing full stop). Walk backwards over
        # characters without a token (whitespace, truncated text), but never
        # past the answer start, so the search always terminates.
        end_token = None
        for char_idx in range(char_end - 1, char_start - 1, -1):
            end_token = encodings.char_to_token(i, char_idx)
            if end_token is not None:
                break
        if end_token is None:
            # only reachable for an empty answer span
            end_token = start_token

        start_positions.append(start_token)
        end_positions.append(end_token)

    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Add 'start_positions' / 'end_positions' to the encodings of both splits;
# add_token_positions updates the encodings objects in place.
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [None]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Dataset that serves tokenizer encodings as one dict of tensors per example."""

    def __init__(self, encodings):
        # The encodings are stored as given; tensors are built on item access.
        self.encodings = encodings

    def __len__(self):
        # One example per row of input_ids.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Take entry `idx` of every field and wrap it in a tensor.
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        return example

# Wrap the encodings of each split (including the start/end position labels
# added earlier) so a DataLoader can batch them.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [None]:
from transformers import DistilBertForQuestionAnswering
# Load the 'distilbert-base-uncased' checkpoint with a question-answering head.
# The load message reports the qa_outputs weights as newly initialized, i.e.
# the head has to be trained before predictions are meaningful.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm

# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# move model over to detected device
model.to(device)
# Adam with decoupled weight decay. torch.optim.AdamW replaces the deprecated
# transformers.AdamW; weight_decay is spelled out because the two
# implementations use different defaults.
optim = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

# initialize data loader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(3):
    # set model to train mode
    model.train()
    # progress bar for this epoch; the label is fixed, so it is set once here
    loop = tqdm(train_loader, leave=True, desc=f'Epoch {epoch}')
    for batch in loop:
        # clear gradients left over from the previous step
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # passing the label positions makes the model return a loss
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # extract loss
        loss = outputs.loss
        # back-propagate to get the gradient of the loss for every parameter
        loss.backward()
        # update parameters
        optim.step()
        # show the current batch loss on the progress bar
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 4073/4073 [52:19<00:00,  1.30it/s, loss=0.668]
Epoch 1: 100%|██████████| 4073/4073 [52:20<00:00,  1.30it/s, loss=0.64]
Epoch 2: 100%|██████████| 4073/4073 [52:14<00:00,  1.30it/s, loss=0.357]


In [None]:
# switch model out of training mode
model.eval()

val_loader = DataLoader(val_dataset, batch_size=16)

# Running totals over the whole validation set. Counting hits and predictions
# (instead of averaging per-batch accuracies) keeps a short final batch from
# being over-weighted.
correct_positions = 0
total_positions = 0

# initialize loop for progress bar
loop = tqdm(val_loader)
# loop through batches
for batch in loop:
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        # make predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        # highest-scoring start and end token per example
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        # start and end hits are counted separately, as two predictions per example
        correct_positions += (start_pred == start_true).sum().item()
        correct_positions += (end_pred == end_true).sum().item()
        total_positions += len(start_pred) + len(end_pred)

# Fraction of start/end positions predicted exactly; 0.0 if there was no data.
# After the loop start_true/end_true/start_pred/end_pred still hold the LAST
# batch only.
acc = correct_positions / total_positions if total_positions else 0.0

100%|██████████| 820/820 [03:42<00:00,  3.69it/s]


In [None]:
# Show true vs. predicted start/end token positions side by side.
# NOTE: start_true/end_true/start_pred/end_pred are the values last assigned
# inside the evaluation loop, so this prints only its final batch.
print("T/F\tstart\tend\n")
for i in range(len(start_true)):
    print(f"true\t{start_true[i]}\t{end_true[i]}\n"
          f"pred\t{start_pred[i]}\t{end_pred[i]}\n")

T/F	start	end

true	106	106
pred	114	108

true	106	106
pred	114	108

true	53	54
pred	59	65

true	50	54
pred	59	65

true	53	54
pred	59	65

true	37	40
pred	36	40

true	38	40
pred	36	40

true	38	40
pred	36	40

true	7	9
pred	7	9

true	59	59
pred	59	59

true	87	94
pred	59	60

true	110	116
pred	110	90



In [None]:
def calculate_exact_match(start_true, end_true, start_pred, end_pred):
    """Return the percentage of samples whose start AND end both match.

    Args:
        start_true: Indexable sequence of true start positions.
        end_true: Indexable sequence of true end positions.
        start_pred: Indexable sequence of predicted start positions.
        end_pred: Indexable sequence of predicted end positions.

    Returns:
        Exact-match score in percent (0-100). Empty input yields ``0.0``
        instead of a division by zero.
    """
    total_samples = len(start_true)
    if total_samples == 0:
        return 0.0

    # a sample counts only when both its start and its end are predicted exactly
    exact_match_count = sum(
        1
        for i in range(total_samples)
        if start_true[i] == start_pred[i] and end_true[i] == end_pred[i]
    )

    # convert the hit count to a percentage
    return (exact_match_count / total_samples) * 100

# Compute the Exact Match score.
# NOTE(review): start_true/end_true/start_pred/end_pred are whatever the
# evaluation loop assigned last, i.e. only its final batch, so this score does
# not cover the whole validation set.
em_score = calculate_exact_match(start_true, end_true, start_pred, end_pred)
print(f'EM Score: {em_score:.2f}%')


EM Score: 16.67%


In [None]:
import pandas as pd

# Read the CSV with the test questions, contexts and reference answers
# (replace 'tradisi_test.csv' with your own file name).
df = pd.read_csv('tradisi_test.csv', encoding='latin1')

# Column names of the three text fields.
question, context, answer = 'question', 'context', 'answer'

# Lower-case every text column.
for column in (question, context, answer):
    df[column] = df[column].str.lower()

# Show the DataFrame after the change.
print(df)


                                             question  \
0                               perang pandan adalah?   
1            tradisi perang pandan berasal dari mana?   
2     perang pandan disebut juga dengan istilah apa ?   
3                                mekare-kare adalah ?   
4   tradisi perang pandan menjadi daya tarik bagi ...   
5          tujuan dari tradisi perang pandan adalah ?   
6    perang pandan merupakan bagian dari ritual apa ?   
7              apa ritual terbesar di desa tengenan ?   
8       masyarakat desa tengenan menganut agama apa ?   
9   apa yang masyarakat tengenan percayai tentang ...   
10            siapa yang memimpin desa tengenan dulu?   
11       maya denawa menganggap dirinya sebagai apa ?   
12  siapa yang melarang masyarakat tengenan melaku...   
13                 apa yang membuat para dewa murka ?   
14            siapa yang diutus melawan maya denawa ?   
15  siapa yang memenangkan peperangan peperangan  ...   
16                             

In [None]:
# Run inference for every row of the dataframe.
# Make sure dropout is disabled even if this cell is run right after training.
model.eval()
predicted_answers = []

for index, row in df.iterrows():
    context = row['context']    # column holding the context passage
    question = row['question']  # column holding the question

    # Tokenize context and question (context first, as during training).
    # truncation=True keeps inputs longer than the model limit from crashing
    # the forward pass.
    tokenized_input = tokenizer(context, question, truncation=True, return_tensors="pt")

    # Run the model without tracking gradients
    with torch.no_grad():
        input_ids = tokenized_input['input_ids'].to(device)
        attention_mask = tokenized_input['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

        # Highest-scoring start and end token, as plain ints so they can be
        # used as slice bounds on the CPU copy of the input ids
        start_index = torch.argmax(start_scores).item()
        end_index = torch.argmax(end_scores).item()

        # Decode the predicted span (end inclusive). If the predicted end lies
        # before the start, the slice is empty and so is the answer.
        decoded_answer = tokenizer.decode(tokenized_input['input_ids'][0, start_index:end_index+1])

    predicted_answers.append(decoded_answer)

# Add the predictions as a new dataframe column
df['Predicted_Answers'] = predicted_answers

# Display the result
df

Unnamed: 0,question,context,answer,Predicted_Answers
0,perang pandan adalah?,perang pandan adalah tradisi untuk memperingat...,tradisi untuk memperingati kemenangan dewa ind...,tradisi untuk memperingati kemenangan dewa ind...
1,tradisi perang pandan berasal dari mana?,tradisi perang pandan berasal dari desa tengan...,"desa tenganan, kecamatan karangasem, bali","desa tenganan, kecamatan karangasem,"
2,perang pandan disebut juga dengan istilah apa ?,perang pandan disebut juga dengan istilah make...,makere - kere,makere - kere
3,mekare-kare adalah ?,mekare-kare adalah istilah dari perang pandan,istilah dari perang pandan,istilah dari perang pandan
4,tradisi perang pandan menjadi daya tarik bagi ...,tradisi perang pandan menjadi daya tarik bagi ...,wisatawan dalam negeri dan wisatawan asing,dalam negeri dan wisatawan
5,tujuan dari tradisi perang pandan adalah ?,tradisi perang pandan bertujuan untuk menghorm...,menghormati dewa indra atau dewa perang,bertujuan untuk menghormati dewa indra atau de...
6,perang pandan merupakan bagian dari ritual apa ?,perang pandan merupakan bagian dari ritual sas...,sasih sembah,sasih sembah
7,apa ritual terbesar di desa tengenan ?,ritual terbesar yang ada di desa tenganan adal...,sasih sembah,sasih sembah
8,masyarakat desa tengenan menganut agama apa ?,masyarakat di desa tenganan menganut agama hin...,hindu indra,hindu indra
9,apa yang masyarakat tengenan percayai tentang ...,masyarakat tenganan mempercayai bahwa desa yan...,desa yang mereka tempati merupakan hadian dari...,tempati merupakan hadian dari dewa indra


In [None]:
# Example context and question for inference
context = "masyarakat di desa tenganan menganut agama hindu indra. "
question = "masyarakat desa tengenan menganut agama apa  ?"

# Make sure dropout is disabled even if this cell is run right after training.
model.eval()

# Tokenize the context and question (context first, as during training);
# truncation=True guards against inputs longer than the model limit.
tokenized_input = tokenizer(context, question, truncation=True, return_tensors="pt")

# Perform inference
with torch.no_grad():
    # Move inputs to device
    input_ids = tokenized_input['input_ids'].to(device)
    attention_mask = tokenized_input['attention_mask'].to(device)

    # Get model predictions
    outputs = model(input_ids, attention_mask=attention_mask)
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    # Highest-scoring start and end token, as plain ints so they can be used
    # as slice bounds on the CPU copy of the input ids
    start_index = torch.argmax(start_scores).item()
    end_index = torch.argmax(end_scores).item()

    # Decode the predicted span (end inclusive). If the predicted end lies
    # before the start, the slice is empty and so is the answer.
    decoded_answer = tokenizer.decode(tokenized_input['input_ids'][0, start_index:end_index+1])

print(f"Predicted Answer: {decoded_answer}")

Predicted Answer: hindu indra.


In [None]:
# Save the dataframe, including the predictions, to a new CSV file
df.to_csv('predicted_answers.csv', index=False)  # replace 'predicted_answers.csv' with the file name you want

In [None]:
from sklearn.metrics import accuracy_score

# Exact Match (EM): share of rows whose predicted answer string is identical
# to the reference answer. accuracy_score returns a fraction between 0 and 1.
exact_match = accuracy_score(df['answer'], df['Predicted_Answers'])

# Scale to a percentage before printing with a '%' sign; printing the raw
# fraction reported e.g. "0.32%" for a score of 32%.
print(f'EM Score: {exact_match * 100:.2f}%')

# TODO: compute the F1 score.
# It needs preprocessing to obtain precision and recall,
# and can be implemented with a library such as sklearn.

EM Score: 0.32%


In [None]:
# Save the fine-tuned model and its tokenizer side by side in one directory
# inside the working folder.
model_path = 'arsitektur-qas-pariwisata-bali/distilbert-pariwisata-bali-model'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('arsitektur-qas-pariwisata-bali/distilbert-pariwisata-bali-model/tokenizer_config.json',
 'arsitektur-qas-pariwisata-bali/distilbert-pariwisata-bali-model/special_tokens_map.json',
 'arsitektur-qas-pariwisata-bali/distilbert-pariwisata-bali-model/vocab.txt',
 'arsitektur-qas-pariwisata-bali/distilbert-pariwisata-bali-model/added_tokens.json',
 'arsitektur-qas-pariwisata-bali/distilbert-pariwisata-bali-model/tokenizer.json')