In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[

In [None]:
import nltk
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK data packages this notebook relies on. Each call downloads
# into the local nltk_data directory and returns True on success.
# 'punkt': tokenizer data (presumably what word_tokenize needs -- confirm for
# the installed NLTK version).
nltk.download('punkt')
# 'stopwords': corpus read later through stopwords.words('english').
nltk.download('stopwords')
# 'wordnet': corpus presumably backing WordNetLemmatizer -- confirm.
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Pull the Quora question/answer corpus from the Hugging Face Hub, then print
# the first training record as a quick sanity check of its fields.
dataset = load_dataset(path='toughdata/quora-question-answer-dataset')
print(dataset['train'][0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/485 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/56402 [00:00<?, ? examples/s]

{'question': 'Why whenever I get in the shower my girlfriend want to join?', 'answer': 'Isn’t it awful? You would swear that there wasn’t enough hot water to go around!\n'}


In [None]:
# Shared text-normalisation helpers, built once and reused by preprocess().
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
stop_words = {*stopwords.words('english')}  # set => O(1) membership checks

def preprocess(text):
    """Normalise a piece of text into a space-joined string of tokens.

    Steps, applied in order:
      1. split ``text`` into tokens with ``word_tokenize``;
      2. drop every token whose lowercase form is in ``stop_words``;
      3. stem each remaining token with ``ps``;
      4. lemmatize the stemmed token with ``lemmatizer``.

    Args:
        text: The raw string to normalise.

    Returns:
        The processed tokens joined by single spaces.
    """
    kept = (tok for tok in word_tokenize(text) if tok.lower() not in stop_words)
    # Stemming and lemmatization are both per-token, so they can be fused.
    normalised = (lemmatizer.lemmatize(ps.stem(tok)) for tok in kept)
    return ' '.join(normalised)

# Run preprocess() over both text columns of every record; the returned dict
# overwrites the 'question' and 'answer' values with their normalised forms.
dataset = dataset.map(
    lambda record: {
        column: preprocess(record[column]) for column in ('question', 'answer')
    }
)


Map:   0%|          | 0/56402 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset

# Load the tokenizer.
# `model_name` is the checkpoint identifier; it is reused further down when the
# question-answering model itself is loaded.
model_name = "bert-base-uncased"
# `tokenizer` is read as a module-level name by tokenize_function().
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch of question/answer pairs and label the answer span.

    Each question is encoded together with its answer as a sentence pair,
    padded to the model's maximum length and truncated if necessary. The
    answer span is the whole answer, so the labels are the index of the first
    and of the last answer token that is actually present in the encoded
    sequence.

    The previous implementation searched for the token containing the last
    character of the answer. When the answer was truncated that token did not
    exist, so ``end_positions`` fell back to the CLS index while
    ``start_positions`` pointed at the first answer token (an inverted span).
    An empty answer produced the opposite mismatch. Both ends are now derived
    from the surviving answer tokens, and CLS is used for both only when no
    answer token exists at all.

    Args:
        examples: A batch mapping with ``'question'`` and ``'answer'`` keys,
            each holding a list of strings of equal length.

    Returns:
        The tokenizer output (without ``offset_mapping``) extended with
        ``start_positions`` and ``end_positions``, one integer per example.
    """
    inputs = tokenizer(
        examples['question'],
        examples['answer'],
        padding='max_length',
        truncation=True,
        return_offsets_mapping=True
    )
    # Offsets are only needed to build the labels; they are not a model input.
    offset_mapping = inputs.pop('offset_mapping')
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = inputs['input_ids'][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        # The first [SEP] closes the question; the answer starts right after it.
        first_answer_index = input_ids.index(tokenizer.sep_token_id) + 1

        # Keep only tokens that cover at least one character. Special and
        # padding tokens are expected to carry zero-width (0, 0) offsets, so
        # this skips the trailing [SEP] and any padding.
        answer_token_indices = [
            first_answer_index + idx
            for idx, (start, end) in enumerate(offsets[first_answer_index:])
            if end > start
        ]

        if answer_token_indices:
            start_token = answer_token_indices[0]
            # Last surviving answer token, which is also correct when the
            # answer was cut short by truncation.
            end_token = answer_token_indices[-1]
        else:
            # No answer token in the sequence: point both ends at CLS so the
            # span stays consistent (start <= end).
            start_token = end_token = cls_index

        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs


# Tokenize the whole dataset. With batched=True each call to
# tokenize_function receives a batch (column name -> list of values)
# rather than a single record.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/56402 [00:00<?, ? examples/s]

In [None]:
# Question-answering head on top of the pretrained encoder named by model_name.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=0.4,  # fraction of one pass over the training data
    weight_decay=0.01,
)

# Only a 'train' split is used in this notebook, so hold out 10% of it for
# evaluation. Evaluating on the very examples the model is trained on (as this
# cell did before) reports a validation loss that says nothing about
# generalisation. The fixed seed keeps the split reproducible across runs.
split_datasets = tokenized_datasets['train'].train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets['train'],
    eval_dataset=split_datasets['test'],
)

trainer.train()


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
