MACHINE LEARNING SYSTEM

PIPELINE:

Train extractive model(DeBerta) since it gave best results on squad qs-ans dataset->

Train machine translation model helsinki on opus/iitb en-fr dataset->

user asks the qs->

send qs to Deberta->

get ans->

send answer to translator->

output french answer.

In [1]:
!pip install transformers datasets evaluate accelerate



In [2]:
import torch
from datasets import load_dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator
import numpy as np

# -------------------------
# 1. Configuration
# -------------------------
MODEL ="microsoft/deberta-v3-base"

MAX_LENGTH = 512
DOC_STRIDE = 128
BATCH_SIZE = 4
NUM_EPOCHS = 2
OUTPUT_DIR = "./qa_model_DEBERT/"

In [3]:
# 2. Load Dataset
# -------------------------
from datasets import load_dataset

dataset = load_dataset("squad")
train_dataset = dataset['train'].select(range(3000))
val_dataset = dataset['validation'].select(range(300))




# -------------------------
# 3. Load Evaluation Metric
# -------------------------
metric = evaluate.load("squad")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# 4. Preprocessing Function
# -------------------------
def preprocess_function(examples, tokenizer):
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=MAX_LENGTH,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    #offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(tokenized_examples["offset_mapping"]):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id) if tokenizer.cls_token_id in input_ids else 0

        sequence_id = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            token_start_index = 0
            while sequence_id[token_start_index] != 1:
                token_start_index += 1

            token_end_index = len(input_ids) - 1
            while sequence_id[token_end_index] != 1:
                token_end_index -= 1

            if offsets[token_start_index][0] > end_char or offsets[token_end_index][1] < start_char:
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)

                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples



In [5]:
    print(f"\n===== Training {MODEL} =====\n")
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    model = AutoModelForQuestionAnswering.from_pretrained(MODEL)

    tokenized_train = train_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    tokenized_val = val_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)

    training_args = TrainingArguments(
        output_dir=f"{OUTPUT_DIR}/{MODEL}",
        #evaluation_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=NUM_EPOCHS,
        weight_decay=0.01,
        save_total_limit=1,
        logging_steps=10,
        save_strategy="epoch",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=default_data_collator
    )

    # Train
    trainer.train()

    # Evaluate
    print(f"Evaluating {MODEL}...")
    raw_predictions = trainer.predict(tokenized_val)
    start_logits, end_logits = raw_predictions.predictions

    def postprocess_qa_predictions(examples, features, raw_predictions, tokenizer, n_best_size=20, max_answer_length=30):
        all_start_logits, all_end_logits = raw_predictions
        example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
        features_per_example = {}
        for i, feature in enumerate(features):
            example_id = feature["id"]
            if example_id not in features_per_example:
                features_per_example[example_id] = []
            features_per_example[example_id].append(i)

        predictions = {}
        for example_id, feature_indices in features_per_example.items():
            context = examples[example_id_to_index[example_id]]["context"]
            min_null_score = None
            valid_answers = []

            for feature_index in feature_indices:
                start_logits_i = all_start_logits[feature_index]
                end_logits_i = all_end_logits[feature_index]
                offset_mapping = features[feature_index]["offset_mapping"]

                start_indexes = np.argsort(start_logits_i)[-1: -n_best_size - 1: -1].tolist()
                end_indexes = np.argsort(end_logits_i)[-1: -n_best_size - 1: -1].tolist()
                for start_index in start_indexes:
                    for end_index in end_indexes:
                        if start_index >= len(offset_mapping) or end_index >= len(offset_mapping):
                            continue
                        if offset_mapping[start_index] is None or offset_mapping[end_index] is None:
                            continue
                        if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                            continue
                        start_char = offset_mapping[start_index][0]
                        end_char = offset_mapping[end_index][1]
                        valid_answers.append({"score": start_logits_i[start_index] + end_logits_i[end_index],
                                              "text": context[start_char:end_char]})
            if len(valid_answers) > 0:
                best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
                predictions[example_id] = best_answer["text"]
            else:
                predictions[example_id] = ""

        return predictions

    # Convert features to list of dicts for postprocessing
    val_features = tokenized_val
    examples = val_dataset
    predictions = postprocess_qa_predictions(examples, val_features, (start_logits, end_logits), tokenizer)

    # Prepare references
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in val_dataset]

    # Compute metric
    em_f1 = metric.compute(predictions=[{"id": k, "prediction_text": v} for k, v in predictions.items()],
                           references=references)
    print(f"{MODEL} EM: {em_f1['exact_match']:.2f}, F1: {em_f1['f1']:.2f}")




===== Training microsoft/deberta-v3-base =====



Some weights of DebertaV2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Step,Training Loss
10,5.3656
20,4.6604
30,4.2836
40,3.92
50,3.5192
60,3.0357
70,2.6168
80,2.147
90,1.4858
100,1.55


Evaluating microsoft/deberta-v3-base...


microsoft/deberta-v3-base EM: 84.00, F1: 88.60


In [6]:
# Save final DeBERTa model
save_path = f"{OUTPUT_DIR}/deberta-v3-base-final"
trainer.save_model(save_path)
trainer.tokenizer.save_pretrained(save_path)
print(f"✅ Model saved at {save_path}")


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


✅ Model saved at ./qa_model_DEBERT//deberta-v3-base-final


In [7]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

MODEL_PATH = f"{OUTPUT_DIR}/deberta-v3-base-final"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_PATH)

qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)


Device set to use cuda:0


In [8]:
context = "The Eiffel Tower was built in 1889 and is located in Paris, France."
question = "When was the Eiffel Tower built?"
result = qa_pipeline(question=question, context=context)
print(result)


{'score': 0.9978575706481934, 'start': 29, 'end': 34, 'answer': ' 1889'}


TRANSLATOR

In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
import torch

In [10]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("opus_books", "en-fr")

# Original train dataset
train_dataset = dataset["train"]

# Create a validation split (e.g., 5% of the train set)
split = train_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = split['train']
val_dataset = split['test']

print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(val_dataset)}")

Train size: 120730
Validation size: 6355


In [11]:
train_dataset = train_dataset.select(range(3000))  # first 2000 examples
val_dataset = val_dataset.select(range(300))

In [12]:
MODEL_NAME = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)



In [13]:
# ===== 3. Tokenization =====
def preprocess(batch):
    # Columns in opus_books: 'translation' dict containing 'en' and 'de'
    inputs = tokenizer([t["en"] for t in batch["translation"]],
                       truncation=True, padding="max_length", max_length=64)
    targets = tokenizer([t["fr"] for t in batch["translation"]],
                        truncation=True, padding="max_length", max_length=64)
    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["labels"] = targets.input_ids
    return batch

train_tokenized = train_dataset.map(preprocess, batched=True)
val_tokenized = val_dataset.map(preprocess, batched=True)

In [14]:
args = Seq2SeqTrainingArguments(
    output_dir="./opus-mt-en-fr-finetuned",
    eval_steps=50,
    save_steps=100,
    logging_steps=20,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
)


In [15]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [16]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Seq2SeqTrainer(


In [17]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
[34m[1mwandb[0m: Currently logged in as: [33mdeshpandesamruddhicharudatta[0m ([33mdeshpandesamruddhicharudatta-nationak-institue-of-techno[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
20,6.3219
40,4.2071
60,3.9186
80,3.055
100,3.4803
120,3.855
140,3.4229
160,2.8764
180,3.0331
200,3.1999




TrainOutput(global_step=6000, training_loss=2.0961401974360148, metrics={'train_runtime': 2761.2931, 'train_samples_per_second': 2.173, 'train_steps_per_second': 2.173, 'total_flos': 101695094784000.0, 'train_loss': 2.0961401974360148, 'epoch': 2.0})

In [18]:
trainer.save_model("./opus-mt-en-de-finetuned")
tokenizer.save_pretrained("./opus-mt-en-de-finetuned")

('./opus-mt-en-de-finetuned/tokenizer_config.json',
 './opus-mt-en-de-finetuned/special_tokens_map.json',
 './opus-mt-en-de-finetuned/vocab.json',
 './opus-mt-en-de-finetuned/source.spm',
 './opus-mt-en-de-finetuned/target.spm',
 './opus-mt-en-de-finetuned/added_tokens.json')

In [19]:
model.to("cuda" if torch.cuda.is_available() else "cpu")

examples = [
    "Hello, How are you",
    "I love pizza",
    "Virat Kohli is the best cricketer of all time",
]

for text in examples:
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    output_tokens = model.generate(**inputs, num_beams=4, max_length=64)
    translated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    print(f"Input: {text}")
    print(f"Translation: {translated_text}")
    print("-" * 50)

Input: Hello, How are you
Translation: Où, Howez-vous
--------------------------------------------------
Input: I love pizza
Translation: Je suis riais...
--------------------------------------------------
Input: Virat Kohli is the best cricketer of all time
Translation: Virat Kohli est le meilleurable cieux de toujours
--------------------------------------------------


In [22]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForSeq2SeqLM
import torch

# ----------------------------
# 1. Load your saved models
# ----------------------------
# QA model (DeBERTa)

qa_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)  # replace with your DeBERTa model path
qa_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_PATH)

# Translator model (English -> French)
trans_tokenizer = AutoTokenizer.from_pretrained("./opus-mt-en-de-finetuned")
trans_model = AutoModelForSeq2SeqLM.from_pretrained("./opus-mt-en-de-finetuned")

# ----------------------------
# 2. User input
# ----------------------------
context = input("Enter context (text passage):\n")
question = input("Enter your question:\n")

# ----------------------------
# 3. Get QA answer in English
# ----------------------------
inputs = qa_tokenizer.encode_plus(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = qa_model(**inputs)

start_scores = outputs.start_logits
end_scores = outputs.end_logits

# Get the most likely start/end token
start_index = torch.argmax(start_scores)
end_index = torch.argmax(end_scores)

answer_tokens = inputs["input_ids"][0][start_index:end_index+1]
answer_en = qa_tokenizer.decode(answer_tokens, skip_special_tokens=True)
print("\nAnswer in English:", answer_en)

# ----------------------------
# 4. Translate answer to French
# ----------------------------
trans_inputs = trans_tokenizer.encode(answer_en, return_tensors="pt")
with torch.no_grad():
    translated = trans_model.generate(trans_inputs, max_length=50)

answer_fr = trans_tokenizer.decode(translated[0], skip_special_tokens=True)
print("Answer in French:", answer_fr)


Enter context (text passage):
The Great Wall of China is a historic fortification built along northern China. It stretches over 13,000 miles and is considered one of the most impressive architectural feats in human history.
Enter your question:
Why is the Great Wall of China famous?

Answer in English: historic fortification built along northern China
Answer in French: était labord-huis de lautre China
