In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import glob

# glob.glob() returns its matches in arbitrary, filesystem-dependent order.
# Sorting the paths makes the order in which publications are loaded (and
# therefore the row order of everything derived from them) reproducible.
train_files = sorted(glob.glob("/kaggle/input/coleridgeinitiative-show-us-the-data/train/*.json"))
test_files = sorted(glob.glob("/kaggle/input/coleridgeinitiative-show-us-the-data/test/*.json"))

In [None]:
from tqdm import tqdm

# Generate the training publications dataframe: the rows of every training
# JSON file, each tagged with a 'pub_id' column derived from the file name.
train_frames = []

for train_file in tqdm(train_files):
    file_data = pd.read_json(train_file)
    # The id is the file name without directory or extension; the trailing
    # replace() strips a 'train\' prefix left over from backslash-separated paths.
    file_data.insert(0,'pub_id', train_file.split('/')[-1].split('.')[0].replace('train\\', ''))
    train_frames.append(file_data)

# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# previously accumulated rows on every iteration (quadratic in the file count).
# pd.concat raises on an empty list, so fall back to an empty frame.
df_train_publications = pd.concat(train_frames) if train_frames else pd.DataFrame()

df_train_publications

In [None]:
# Generate the testing publications dataframe: the rows of every test JSON
# file, each tagged with a 'pub_id' column derived from the file name.
test_frames = []

for test_file in tqdm(test_files):
    file_data = pd.read_json(test_file)
    # The id is the file name without directory or extension; the trailing
    # replace() strips a 'test\' prefix left over from backslash-separated paths.
    file_data.insert(0,'pub_id', test_file.split('/')[-1].split('.')[0].replace('test\\', ''))
    test_frames.append(file_data)

# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# previously accumulated rows on every iteration (quadratic in the file count).
# pd.concat raises on an empty list, so fall back to an empty frame.
df_test_publications = pd.concat(test_frames) if test_frames else pd.DataFrame()

df_test_publications

In [None]:
# Load the competition's train.csv; later cells read its 'Id' and
# 'cleaned_label' columns. The frame is shown as the cell output.
train_csv = pd.read_csv(
    filepath_or_buffer='/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv',
)
train_csv

In [None]:
def concat(column):
    """Join every string found in ``column`` into one string.

    The result always starts with a single space, followed by the string
    elements in iteration order with no separator between them. Elements
    whose type is not exactly ``str`` (for example NaN or None) are skipped.
    """
    pieces = [piece for piece in column if type(piece) == str]
    return ' ' + ''.join(pieces)

In [None]:
# Collapse the rows of each publication into a single text: one row per pub_id.
train = df_train_publications.groupby('pub_id')['text'].apply(concat).reset_index()

# Attach the label by publication id. Assigning one Series to another through
# .loc aligns on the row index, not on the id columns, so a direct
# `train.loc[...] = train_csv.loc[...]` pairs each publication with whichever
# train_csv row happens to share its index number. Mapping pub_id through an
# Id-indexed Series performs the lookup by id instead.
# If an Id occurs more than once in train_csv, its first label is kept so that
# `train` stays at one row per publication; ids absent from train_csv get NaN.
label_by_id = train_csv.drop_duplicates(subset='Id').set_index('Id')['cleaned_label']
train['cleaned_label'] = train['pub_id'].map(label_by_id)

In [None]:
# How many publications carry each distinct label.
train['cleaned_label'].value_counts()

In [None]:
# Keep only the publications whose text literally contains their label, so the
# label's character span can be located in the text by the later cells.
train = train.reset_index(drop=True)
label_in_text = [
    isinstance(label, str) and label in text
    for text, label in zip(train['text'], train['cleaned_label'])
]
# A single boolean selection replaces the per-row pd.concat (quadratic) and the
# `.loc[idx:idx+1].drop(index=idx+1)` idiom, which raised KeyError whenever the
# last row matched because index label idx+1 does not exist there. Rows whose
# label is not a string (e.g. NaN) are skipped rather than making str.find
# raise TypeError. The original index labels of the kept rows are preserved.
new_train = train.loc[label_in_text].copy()

In [None]:
# Number of rows that survived the "label occurs in text" filter.
len(new_train)

In [None]:
# Display the filtered frame as the cell output.
new_train

In [None]:
#test = df_test_publications.groupby('pub_id')['text'].apply(concat).reset_index()

In [None]:
# Number of rows in the unfiltered `train` frame, for comparison with len(new_train).
len(train)

In [None]:
# Number of leading rows of new_train that are left out of the training split.
# NOTE(review): despite its name this is an offset, not the size of the
# training set -- confirm that skipping the first rows is intended.
training_size = 200

# Publication texts and their labels, aligned row by row.
sentences = new_train['text']
labels = new_train['cleaned_label']

# Everything from position `training_size` onwards is used for training.
train_contexts = sentences.iloc[training_size:]
train_answers = labels.iloc[training_size:]

In [None]:
from transformers import DistilBertForQuestionAnswering
# Instantiate DistilBertForQuestionAnswering from the "distilbert-base-uncased" checkpoint.
# NOTE(review): a later cell repeats this exact import and from_pretrained call and
# rebinds `model`, so this instance is replaced before the training loop uses it.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

In [None]:
# Turn the two pandas Series into plain Python lists. Each answer becomes a
# dict holding the label under "text", ready to receive span fields.
# Note: this rebinds both names, so the cell cannot be run twice in a row.
train_contexts = train_contexts.to_list()
train_answers = [{"text": label} for label in train_answers.to_list()]


def add_end_idx(answers, contexts):
    """Annotate each answer dict in place with the character span of its text.

    For every ``(answer, context)`` pair, ``answer['text']`` is located in
    ``context`` with ``str.find`` (first occurrence) and the answer gains:

    * ``answer_start`` -- index of the first character of the match
    * ``answer_end``   -- index one past the last character of the match

    Because the start index comes from ``str.find``, a found match is always
    exact, so no "off by one or two characters" correction is needed.

    Args:
        answers: list of dicts, each holding the answer string under ``'text'``.
        contexts: list of context strings, parallel to ``answers``.

    Returns:
        None. The dicts in ``answers`` are modified in place.

    Raises:
        ValueError: if an answer's text does not occur in its context. Failing
            here avoids leaving an answer without ``answer_end``, which would
            only surface later as a KeyError.
    """
    for position, (answer, context) in enumerate(zip(answers, contexts)):
        gold_text = answer['text']
        start_idx = context.find(gold_text)
        if start_idx < 0:
            raise ValueError(
                f"answer {position}: text {gold_text!r} not found in its context"
            )
        answer['answer_start'] = start_idx
        answer['answer_end'] = start_idx + len(gold_text)

# Annotate the training answers in place with their character positions in the contexts.
add_end_idx(train_answers, train_contexts)


In [None]:
# Display the answer dicts after add_end_idx has annotated them.
train_answers

In [None]:
from transformers import DistilBertTokenizerFast

# Load the fast DistilBERT tokenizer from the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# The same fixed question is paired with every context.
train_questions = ['What is the dataset?' for _ in train_contexts]

# Contexts are passed as the first sequence and questions as the second, with
# truncation and padding enabled.
train_encodings = tokenizer(
    train_contexts,
    train_questions,
    truncation=True,
    padding=True,
)

In [None]:
def add_token_positions(encodings, answers):
    """Add token-level answer positions to ``encodings`` in place.

    For sample ``i`` the characters ``answers[i]['answer_start']`` and
    ``answers[i]['answer_end'] - 1`` (the last character of the answer) are
    converted to token indices with ``encodings.char_to_token``. When that
    lookup yields ``None`` the module-level ``tokenizer.model_max_length`` is
    stored instead; this is taken to mean the answer was truncated away.

    The two resulting lists are written into ``encodings`` under the keys
    ``'start_positions'`` and ``'end_positions'``. Nothing is returned.
    """
    def _resolve(sample_idx, char_idx):
        # Map one character offset to a token index, with the fallback above.
        token_idx = encodings.char_to_token(sample_idx, char_idx)
        if token_idx is not None:
            return token_idx
        return tokenizer.model_max_length

    start_positions, end_positions = [], []
    for sample_idx, answer in enumerate(answers):
        start_positions.append(_resolve(sample_idx, answer['answer_start']))
        end_positions.append(_resolve(sample_idx, answer['answer_end'] - 1))

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Adds 'start_positions' and 'end_positions' to train_encodings in place.
add_token_positions(train_encodings, train_answers)


In [None]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Torch dataset view over a mapping of encoded fields.

    ``encodings`` must offer ``items()`` yielding ``(field, values)`` pairs
    where ``values`` is indexable per sample, plus an ``input_ids`` attribute
    whose length is used as the number of samples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The sample count is the length of the input_ids field.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build one sample: each field's idx-th entry converted to a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

# Wrap the encodings (including the start/end positions) so a DataLoader can index them.
train_dataset = SquadDataset(train_encodings)

In [None]:
from transformers import DistilBertForQuestionAnswering
# Rebind `model` to a new DistilBertForQuestionAnswering loaded from the
# "distilbert-base-uncased" checkpoint; this is the instance the training loop uses.
# NOTE(review): the same import and load already appear in an earlier cell.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

In [None]:
from torch.utils.data import DataLoader
# transformers.AdamW is deprecated upstream in favour of torch.optim.AdamW, so
# the optimizer is taken from torch (already a dependency of this notebook).
from torch.optim import AdamW

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# eps and weight_decay are pinned to the defaults of the deprecated
# transformers.AdamW (1e-6 and 0.0) so the optimizer settings stay the same;
# torch's own defaults are 1e-8 and 0.01.
optim = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

for epoch in range(3):
    for batch in tqdm(train_loader):
        optim.zero_grad()
        # Move the tensors the model needs onto the training device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        # With start/end positions supplied, the first output element is the loss.
        loss = outputs[0]
        loss.backward()
        optim.step()

# Leave training mode once all epochs are done.
model.eval()

In [None]:
# Put the model in evaluation mode. The training cell already ends with the
# same call, so this is redundant unless this cell is run on its own.
model.eval()