In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from huggingface_hub import notebook_login

In [2]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:

squad = load_dataset("squad", split="train[:5000]")
squad = squad.train_test_split(test_size=0.2)
print(squad["train"][0])

{'id': '56cfd6f8234ae51400d9bf75', 'title': 'IPod', 'context': 'Since October 2004, the iPod line has dominated digital music player sales in the United States, with over 90% of the market for hard drive-based players and over 70% of the market for all types of players. During the year from January 2004 to January 2005, the high rate of sales caused its U.S. market share to increase from 31% to 65% and in July 2005, this market share was measured at 74%. In January 2007 the iPod market share reached 72.7% according to Bloomberg Online.', 'question': "What did the iPod's US market share peak at in 2005?", 'answers': {'text': ['74%'], 'answer_start': [390]}}


In [4]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [5]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [6]:
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator(return_tensors="tf")




In [8]:
from transformers import create_optimizer

batch_size = 16
num_epochs = 2
total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [9]:
from transformers import TFAutoModelForQuestionAnswering

model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it

In [10]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_squad["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_squad["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [11]:
import tensorflow as tf

model.compile(optimizer=optimizer)

In [12]:
from transformers.keras_callbacks import PushToHubCallback

callback = PushToHubCallback(
    output_dir="replicating_tutorial",
    tokenizer=tokenizer,
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
c:\Users\eyalw\Desktop\Cours\Study_Project\repo\detection_fraude\chat_bot_eyal\replicating_tutorial is already a clone of https://huggingface.co/Hoodog/replicating_tutorial. Make sure you pull the latest changes with `repo.git_pull()`.


In [13]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[callback])

Epoch 1/3

Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x27f4e3ee790>

In [72]:
question = "Is a house a fruit?"
context = "Vegetables are edible plants that are often consumed for their nutritional value. They include a variety of plant parts such as roots, stems, leaves, and fruits."
#context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [73]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model="replicating_tutorial")
question_answerer(question=question, context=context)

Some layers from the model checkpoint at replicating_tutorial were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at replicating_tutorial and are newly initialized: ['dropout_359']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'score': 0.40915682911872864, 'start': 0, 'end': 10, 'answer': 'Vegetables'}

In [60]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("replicating_tutorial")
inputs = tokenizer(question, context, return_tensors="tf")

In [61]:
from transformers import TFAutoModelForQuestionAnswering

model = TFAutoModelForQuestionAnswering.from_pretrained("replicating_tutorial")
outputs = model(**inputs)

Some layers from the model checkpoint at replicating_tutorial were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at replicating_tutorial and are newly initialized: ['dropout_259']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [62]:
answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])

In [63]:
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'france'