In [2]:
!pip install datasets

import torch
from transformers import BertTokenizer, BertForQuestionAnswering
from datasets import Dataset
import pandas as pd

# Function to generate synthetic data
def generate_synthetic_data():
    data = [
        {"question": "What is machine learning?", "context": "Machine learning is a field of computer science that uses statistical techniques to give computer systems the ability to learn from data."},
        {"question": "Explain artificial intelligence.", "context": "Artificial intelligence is the simulation of human intelligence processes by machines, especially computer systems."},
        {"question": "How does deep learning work?", "context": "Deep learning is a subset of machine learning where artificial neural networks, algorithms inspired by the human brain, learn from large amounts of data."},
        {"question": "মেশিন লার্নিং কী?", "context": "মেশিন লার্নিং হল কম্পিউটার বিজ্ঞানের একটি ক্ষেত্র যা ডেটা থেকে শিক্ষা নেওয়ার জন্য পরিসংখ্যানগত পদ্ধতি ব্যবহার করে।"},
        {"question": "কৃত্রিম বুদ্ধিমত্তা কী?", "context": "কৃত্রিম বুদ্ধিমত্তা মেশিন, বিশেষত কম্পিউটার সিস্টেম দ্বারা মানব বুদ্ধিমত্তা প্রক্রিয়াকরণের অনুকরণ।"},
        {"question": "ডিপ লার্নিং কীভাবে কাজ করে?", "context": "ডিপ লার্নিং হল মেশিন লার্নিংয়ের একটি উপশাখা যেখানে মানব মস্তিষ্ক অনুপ্রাণিত কৃত্রিম নিউরাল নেটওয়ার্ক, বৃহত পরিমাণে ডেটা থেকে শিক্ষা নেয়।"}
    ]
    return pd.DataFrame(data)

# Load dataset
df = generate_synthetic_data()
dataset = Dataset.from_pandas(df)

# Load model and tokenizer
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Function to get answers from the model
def get_answer(question, context):
    inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits
    answer_start = torch.argmax(answer_start_scores)  # Index of the start of the answer
    answer_end = torch.argmax(answer_end_scores) + 1  # Index of the end of the answer

    # Ensure the answer is within the string and remove special tokens
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
    return answer.strip()

# Display the model's answers to each question
for index, row in df.iterrows():
    print(f"Question: {row['question']}")
    print(f"Answer: {get_answer(row['question'], row['context'])}\n")

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Question: What is machine learning?
Answer: is machine learning ? [SEP] Machine learning is a field of computer science that uses statistical techniques to give computer systems the ability to learn from data .

Question: Explain artificial intelligence.
Answer: [SEP]

Question: How does deep learning work?
Answer: 

Question: মেশিন লার্নিং কী?
Answer: ##েওয়ার জন্য পরিসংখ্যানগত পদ্ধতি ব্যবহার করে । [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA