In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, pipeline
from datasets import Dataset

In [None]:
csv_path = "SQuAD.csv"
df = pd.read_csv(csv_path)

In [None]:
df["answers"] = df.apply(lambda x: {"text": [x["text"]], "answer_start": [x["answer_start"]]}, axis=1)
df = df.drop(columns=["#"], errors="ignore")

In [None]:
dataset = Dataset.from_pandas(df)

In [None]:
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
val_dataset = dataset["test"]

In [None]:
model_name = "distilbert-base-uncased"  # Small & efficient BERT variant
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        # Check if there are no answers or if the answer text is invalid.
        # An answer is considered invalid if its 'answer_start' list is empty,
        # or if the first element of 'text' is None, or if it's an empty string.
        if len(answers["answer_start"]) == 0 or answers["text"][0] is None or (isinstance(answers["text"][0], str) and len(answers["text"][0]) == 0):
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_char = answers["answer_start"][0]
            answer_text = answers["text"][0]

            # Ensure answer_text is a string before calling len()
            if not isinstance(answer_text, str):
                answer_text = str(answer_text)

            end_char = start_char + len(answer_text)
            sequence_ids = inputs.sequence_ids(i)

            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            token_end_index = len(inputs["input_ids"][i]) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                start_positions.append(0)
                end_positions.append(0)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# 5️⃣ Tokenize
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_val = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)

Map:   0%|          | 0/69456 [00:00<?, ? examples/s]

Map:   0%|          | 0/17365 [00:00<?, ? examples/s]

In [None]:
# 6️⃣ Training Configuration
!pip install --upgrade transformers
training_args = TrainingArguments(
    output_dir="./faq_csv_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
)



In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

trainer.train()

trainer.save_model("./faq_csv_model")

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msteephanrajsanjay[0m ([33msteephanrajsanjay-coimbatore-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,1.236,1.107349
2,0.9412,1.089826


In [None]:
qa_pipeline = pipeline("question-answering", model="./faq_csv_model", tokenizer=tokenizer)

Device set to use cuda:0


In [None]:
context = """

Evaluation metrics are quantitative measures used to assess the performance and effectiveness of statistical or machine learning models, providing insights into how well a model is performing and aiding in the comparison of different models or algorithms.
 These metrics are crucial for evaluating predictive ability, generalization capability, and overall quality, ensuring models are reliable and capable of making accurate predictions on new, unseen data.
 They are essential for guiding model improvements and determining whether a model is ready for deployment

For classification tasks, common evaluation metrics include accuracy, precision, recall (also known as sensitivity), F1 score, specificity, and the area under the ROC curve (AUC-ROC).
 Accuracy measures the percentage of correctly predicted labels out of total predictions, but it can be misleading in imbalanced datasets.
 Precision indicates the proportion of correctly predicted positive results among all predicted positives, which is vital when minimizing false positives is critical, such as in spam detection or medical diagnostics.
 Recall measures the model's ability to identify all actual positive instances, making it essential in scenarios where missing true positives has severe consequences, like disease detection.
 The F1 score, the harmonic mean of precision and recall, provides a balanced measure, particularly useful in imbalanced datasets


 """

question = "definition of the Classification Metrics?"

result = qa_pipeline(question=question, context=context)
print(result)

{'score': 0.013782382942736149, 'start': 277, 'end': 366, 'answer': 'crucial for evaluating predictive ability, generalization capability, and overall quality'}


In [None]:
from google.colab import files
!zip -r faq_csv_model.zip faq_csv_model
files.download("faq_csv_model.zip")

  adding: faq_csv_model/ (stored 0%)
  adding: faq_csv_model/checkpoint-6000/ (stored 0%)
  adding: faq_csv_model/checkpoint-6000/rng_state.pth (deflated 26%)
  adding: faq_csv_model/checkpoint-6000/trainer_state.json (deflated 68%)
  adding: faq_csv_model/checkpoint-6000/training_args.bin (deflated 53%)
  adding: faq_csv_model/checkpoint-6000/config.json (deflated 43%)
  adding: faq_csv_model/checkpoint-6000/special_tokens_map.json (deflated 42%)
  adding: faq_csv_model/checkpoint-6000/model.safetensors (deflated 8%)
  adding: faq_csv_model/checkpoint-6000/vocab.txt (deflated 53%)
  adding: faq_csv_model/checkpoint-6000/optimizer.pt (deflated 13%)
  adding: faq_csv_model/checkpoint-6000/scheduler.pt (deflated 61%)
  adding: faq_csv_model/checkpoint-6000/tokenizer_config.json (deflated 75%)
  adding: faq_csv_model/checkpoint-6000/tokenizer.json (deflated 71%)
  adding: faq_csv_model/checkpoint-17500/ (stored 0%)
  adding: faq_csv_model/checkpoint-17500/rng_state.pth (deflated 26%)
  ad

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
from google.colab import drive
drive.mount('/content/drive')
!cp -r /content/faq_csv_model /content/drive/MyDrive/

Mounted at /content/drive
cp: cannot stat '/content/faq_csv_model': No such file or directory


In [4]:
import os, shutil

In [5]:
base_dir = "/content/drive/MyDrive/faq_csv_model"
latest_checkpoint = "checkpoint-17552"
final_dest = "/content/drive/MyDrive/faq_folder/final_model"

In [6]:
src = os.path.join(base_dir, latest_checkpoint)
shutil.copytree(src, final_dest)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/faq_csv_model/checkpoint-17552'