In [89]:
from transformers import BertTokenizerFast, BertForQuestionAnswering, pipeline, \
                         DataCollatorWithPadding, TrainingArguments, Trainer, \
                         AutoModelForQuestionAnswering, AutoTokenizer

from datasets import Dataset
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
import requests

In [90]:
bert_tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased', return_token_type_ids = True)
qa_bert = BertForQuestionAnswering.from_pretrained('bert-large-uncased')

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [103]:
df = pd.read_parquet('data/train-00000-of-00001 (1).parquet')
df.to_csv('qa.csv')

In [104]:
df = pd.read_csv('data/qa.csv')

In [105]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,context,question,answers,metadata
0,0,7ba1e8f4261d3170fcf42e84a81dd749116fae95,Brain,Another approach to brain function is to exami...,What sare the benifts of the blood brain barrir?,{'text': array(['isolated from the bloodstream...,"{'split': 'train', 'model_in_the_loop': 'Combi..."
1,1,5ec5ef305a259311596e85d811ade30bd68b079d,Brain,Another approach to brain function is to exami...,What is surrounded by cerebrospinal fluid?,"{'text': array(['brain'], dtype=object), 'answ...","{'split': 'train', 'model_in_the_loop': 'Combi..."
2,2,7cb230edfb15ad1fda8d157af1f2b574cbb02b4c,Brain,Another approach to brain function is to exami...,What does the skull protect?,"{'text': array(['brain'], dtype=object), 'answ...","{'split': 'train', 'model_in_the_loop': 'Combi..."
3,3,e1850f2a48b8f7c2231cec41ed63c1b638a8e2c7,Brain,Another approach to brain function is to exami...,What has been injected into rats to produce pr...,"{'text': array(['chemicals'], dtype=object), '...","{'split': 'train', 'model_in_the_loop': 'Combi..."
4,4,7bc0ae1a8a24ea4f3398b5236ab9569bbc3e820b,Brain,Another approach to brain function is to exami...,What can cause issues with how the brain works?,"{'text': array(['brain damage'], dtype=object)...","{'split': 'train', 'model_in_the_loop': 'Combi..."


In [106]:
df.shape

(30000, 7)

In [107]:
df.iloc[0]

Unnamed: 0                                                    0
id                     7ba1e8f4261d3170fcf42e84a81dd749116fae95
title                                                     Brain
context       Another approach to brain function is to exami...
question       What sare the benifts of the blood brain barrir?
answers       {'text': array(['isolated from the bloodstream...
metadata      {'split': 'train', 'model_in_the_loop': 'Combi...
Name: 0, dtype: object

In [59]:
df.iloc[0].context.find("isolated from the bloodstream")

195

In [60]:
df.iloc[0].answers[17:]

"isolated from the bloodstream'], dtype=object), 'answer_start': array([195])}"

In [61]:
df.iloc[0].answers.find(']')

47

In [62]:
df.iloc[0].answers[17:df.iloc[0].answers.find("]")-1]

'isolated from the bloodstream'

In [96]:
qa_df = df[['question','context']]

In [97]:
qa_df

Unnamed: 0,question,context
0,What sare the benifts of the blood brain barrir?,Another approach to brain function is to exami...
1,What is surrounded by cerebrospinal fluid?,Another approach to brain function is to exami...
2,What does the skull protect?,Another approach to brain function is to exami...
3,What has been injected into rats to produce pr...,Another approach to brain function is to exami...
4,What can cause issues with how the brain works?,Another approach to brain function is to exami...
...,...,...
29995,What is the lowest ISO mentioned?,"Some high-speed black-and-white films, such as..."
29996,What is the highest ISO mentioned?,"Some high-speed black-and-white films, such as..."
29997,What is Kodak'sbrand name of E6 film?,"Some high-speed black-and-white films, such as..."
29998,How do these films differ?,"Some high-speed black-and-white films, such as..."


In [108]:
answers = []
for index, row in df.iterrows():
    answer = row['answers'][17:row['answers'].find("]")-1]
    answers.append(answer)
qa_df['answers'] = answers

In [110]:
qa_df.head()

Unnamed: 0,question,context,answers
0,What sare the benifts of the blood brain barrir?,Another approach to brain function is to exami...,isolated from the bloodstream
1,What is surrounded by cerebrospinal fluid?,Another approach to brain function is to exami...,brain
2,What does the skull protect?,Another approach to brain function is to exami...,brain
3,What has been injected into rats to produce pr...,Another approach to brain function is to exami...,chemicals
4,What can cause issues with how the brain works?,Another approach to brain function is to exami...,brain damage


In [111]:
qa_dataset = Dataset.from_pandas(qa_df.sample(1000, random_state = 42))
qa_dataset = qa_dataset.train_test_split(test_size = 0.2)

In [112]:
def preprocess(data):
    return bert_tokenizer(data['question'], data['context'], truncation = True)

In [113]:
qa_dataset = qa_dataset.map(preprocess, batched = True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [114]:
for name, param in qa_bert.bert.named_parameters():
    if 'encoder.layer.22' in name:
        break
    param.requires_grad = False

In [115]:
data_collator = DataCollatorWithPadding(tokenizer = bert_tokenizer)

In [117]:
batch_size = 32
epochs = 2
training_args = TrainingArguments(
    output_dir = './qa/results',
    num_train_epochs = epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_dir='./qa/logs',
    save_strategy='epoch',
    logging_steps=10,
    evaluation_strategy='epoch',
    load_best_model_at_end=True
)

trainer = Trainer(
    model = qa_bert,
    args = training_args,
    train_dataset = qa_dataset['train'],
    eval_dataset = qa_dataset['test'],
    data_collator = data_collator
)

In [None]:
trainer.evaluate()

In [102]:
answers

[' bloodstre',
 '',
 '',
 '',
 '',
 '',
 '',
 ' dama',
 '',
 ' use electrodes or locally injected chemicals to produce precise patterns of dama',
 '',
 'ral types of dama',
 '',
 '',
 '',
 '',
 ' in the bo',
 '',
 '',
 'control the e',
 'tions from the bra',
 '',
 's such as walking or swimmi',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'balance, taste, and sme',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'ower bra',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 ' cort',
 ' to the subcortical motor areas, but also sends a massive projection directly to the spinal co',
 '',
 '',
 '',
 'basal ganglia, and cerebell',
 '',
 'ex, basal ganglia, and cerebell',
 ' cortex sends projections to the subcortical motor areas, but also sends a massive projection directly to the spinal co',
 'g, or swallowi',
 ' to the subcortical motor areas, but also sends a massive projection directly to the spinal co',
 't',
 '',
 'nificance of the issue and the degree of cohesion within the majority p

In [65]:
df.iloc[0]['answers']

"{'text': array(['isolated from the bloodstream'], dtype=object), 'answer_start': array([195])}"

In [87]:
import pandas as pd

def extract_positions(file_path):
    # Load dataset
    df = pd.read_csv(file_path)
    
    # Ensure required columns exist
    required_columns = {'question', 'context', 'answers'}
    if not required_columns.issubset(df.columns):
        raise ValueError(f"Dataset must contain columns: {required_columns}")
    
    start_positions = []
    end_positions = []
    
    for index, row in df.iterrows():
        question = row['question'].split()
        context = row['context'].split()
        answer = row['answers'][17:row['answers'].find("]")-1]
        answer = answer.split()
#         start_pos = context.find(answer[0])
        try:
            start_pos = next(i for i in range(len(context)) if context[i:i+len(answer)] == answer) 
            end_pos = start_pos + len(answer) - 1
        except StopIteration:
            start_pos = -1e5
            end_pos = -1e5
        start_positions.append(start_pos)
        end_positions.append(end_pos)
    
    # Add new columns
    df['start_position'] = start_positions
    df['end_position'] = end_positions
    
    # Save to a new CSV file
    output_path = file_path.replace('.csv', '_processed.csv')
    df.to_csv(output_path, index=False)
    print(f"Processed file saved as: {output_path}")
    
    return df

# Run function on uploaded file
df_processed = extract_positions('data/qa.csv')
df_processed.head()


Processed file saved as: data/qa_processed.csv


Unnamed: 0.1,Unnamed: 0,id,title,context,question,answers,metadata,start_position,end_position
0,0,7ba1e8f4261d3170fcf42e84a81dd749116fae95,Brain,Another approach to brain function is to exami...,What sare the benifts of the blood brain barrir?,{'text': array(['isolated from the bloodstream...,"{'split': 'train', 'model_in_the_loop': 'Combi...",31.0,34.0
1,1,5ec5ef305a259311596e85d811ade30bd68b079d,Brain,Another approach to brain function is to exami...,What is surrounded by cerebrospinal fluid?,"{'text': array(['brain'], dtype=object), 'answ...","{'split': 'train', 'model_in_the_loop': 'Combi...",3.0,3.0
2,2,7cb230edfb15ad1fda8d157af1f2b574cbb02b4c,Brain,Another approach to brain function is to exami...,What does the skull protect?,"{'text': array(['brain'], dtype=object), 'answ...","{'split': 'train', 'model_in_the_loop': 'Combi...",3.0,3.0
3,3,e1850f2a48b8f7c2231cec41ed63c1b638a8e2c7,Brain,Another approach to brain function is to exami...,What has been injected into rats to produce pr...,"{'text': array(['chemicals'], dtype=object), '...","{'split': 'train', 'model_in_the_loop': 'Combi...",115.0,115.0
4,4,7bc0ae1a8a24ea4f3398b5236ab9569bbc3e820b,Brain,Another approach to brain function is to exami...,What can cause issues with how the brain works?,"{'text': array(['brain damage'], dtype=object)...","{'split': 'train', 'model_in_the_loop': 'Combi...",66.0,67.0


In [69]:
qa_df = df_processed[['question', 'context', 'start_position', 'end_position']]

In [70]:
qa_df.head()

Unnamed: 0,question,context,start_position,end_position
0,What sare the benifts of the blood brain barrir?,Another approach to brain function is to exami...,31.0,34.0
1,What is surrounded by cerebrospinal fluid?,Another approach to brain function is to exami...,3.0,3.0
2,What does the skull protect?,Another approach to brain function is to exami...,3.0,3.0
3,What has been injected into rats to produce pr...,Another approach to brain function is to exami...,115.0,115.0
4,What can cause issues with how the brain works?,Another approach to brain function is to exami...,66.0,67.0


In [88]:
bert_tokenizer.decode(bert_tokenizer.encode(qa_df.iloc[0].question, qa_df.iloc[0].context)[56:61])

'isolated from the bloodstream'

In [75]:
qa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   question        30000 non-null  object 
 1   context         30000 non-null  object 
 2   start_position  17194 non-null  float64
 3   end_position    17194 non-null  float64
dtypes: float64(2), object(2)
memory usage: 937.6+ KB


In [76]:
qa_df.isnull

<bound method DataFrame.isnull of                                                 question  \
0       What sare the benifts of the blood brain barrir?   
1             What is surrounded by cerebrospinal fluid?   
2                           What does the skull protect?   
3      What has been injected into rats to produce pr...   
4        What can cause issues with how the brain works?   
...                                                  ...   
29995                  What is the lowest ISO mentioned?   
29996                 What is the highest ISO mentioned?   
29997              What is Kodak'sbrand name of E6 film?   
29998                         How do these films differ?   
29999  What letter designates what Ektachrome is desi...   

                                                 context  start_position  \
0      Another approach to brain function is to exami...            31.0   
1      Another approach to brain function is to exami...             3.0   
2      Another ap