In [1]:
import requests
import xml.etree.ElementTree as ET

def fetch_arxiv_papers(query="artificial intelligence", max_results=100):
    url = f"http://export.arxiv.org/api/query?search_query={query}&start=0&max_results={max_results}"
    response = requests.get(url)
    root = ET.fromstring(response.content)
    papers = []
    for entry in root.findall("{http://www.w3.org/2005/Atom}entry"):
        title = entry.find("{http://www.w3.org/2005/Atom}title").text
        summary = entry.find("{http://www.w3.org/2005/Atom}summary").text
        papers.append({"title": title, "summary": summary})
    return papers

papers = fetch_arxiv_papers()
for paper in papers:
    print(f"Title: {paper['title']}\nSummary: {paper['summary']}\n")


Title: The Governance of Physical Artificial Intelligence
Summary:   Physical artificial intelligence can prove to be one of the most important
challenges of the artificial intelligence. The governance of physical
artificial intelligence would define its responsible intelligent application in
the society.


Title: Does an artificial intelligence perform market manipulation with its own
  discretion? -- A genetic algorithm learns in an artificial market simulation
Summary:   Who should be charged with responsibility for an artificial intelligence
performing market manipulation have been discussed. In this study, I
constructed an artificial intelligence using a genetic algorithm that learns in
an artificial market simulation, and investigated whether the artificial
intelligence discovers market manipulation through learning with an artificial
market simulation despite a builder of artificial intelligence has no intention
of market manipulation. As a result, the artificial intelligence di

In [2]:
# Remove unnecessary chaaacters from the summary
def clean_text(text):
    # Return the text only if it's not None, otherwise return an empty string
    return " ".join(text.split()) if text is not None else ""

for paper in papers:
    paper["title"] = clean_text(paper["title"])
    paper["summary"] = clean_text(paper["summary"])

In [3]:
print(f"Title: {paper['title']}\nSummary: {paper['summary']}\n")

Title: A Definition of Artificial Intelligence
Summary: In this paper we offer a formal definition of Artificial Intelligence and this directly gives us an algorithm for construction of this object. Really, this algorithm is useless due to the combinatory explosion. The main innovation in our definition is that it does not include the knowledge as a part of the intelligence. So according to our definition a newly born baby also is an Intellect. Here we differs with Turing's definition which suggests that an Intellect is a person with knowledge gained through the years.



Gnerate QA pairs

In [4]:
qa_pairs = []
for paper in papers:
    question = f"What is the research focus of the paper titled '{paper['title']}'?"
    answer = paper["summary"] if paper["summary"] else "No summary available"
    qa_pairs.append({"question": question, "answer": answer})

for qa_pair in qa_pairs:
    print(f"Question: {qa_pair['question']}\nAnswer: {qa_pair['answer']}\n")

Question: What is the research focus of the paper titled 'The Governance of Physical Artificial Intelligence'?
Answer: Physical artificial intelligence can prove to be one of the most important challenges of the artificial intelligence. The governance of physical artificial intelligence would define its responsible intelligent application in the society.

Question: What is the research focus of the paper titled 'Does an artificial intelligence perform market manipulation with its own discretion? -- A genetic algorithm learns in an artificial market simulation'?
Answer: Who should be charged with responsibility for an artificial intelligence performing market manipulation have been discussed. In this study, I constructed an artificial intelligence using a genetic algorithm that learns in an artificial market simulation, and investigated whether the artificial intelligence discovers market manipulation through learning with an artificial market simulation despite a builder of artificial 

Save data in a json file

In [5]:
import json
with open("arxiv_qa_dataset.json", "w") as f:
    json.dump(qa_pairs, f, indent=4)

In [6]:
# Load the data from the JSON file
with open("arxiv_qa_dataset.json", "r") as f:
    data = json.load(f)

# Print some entries to check the data
for entry in data[:5]:  # Print first 5 entries
    print(f"Question: {entry['question']}\nAnswer: {entry['answer']}\n")

Question: What is the research focus of the paper titled 'The Governance of Physical Artificial Intelligence'?
Answer: Physical artificial intelligence can prove to be one of the most important challenges of the artificial intelligence. The governance of physical artificial intelligence would define its responsible intelligent application in the society.

Question: What is the research focus of the paper titled 'Does an artificial intelligence perform market manipulation with its own discretion? -- A genetic algorithm learns in an artificial market simulation'?
Answer: Who should be charged with responsibility for an artificial intelligence performing market manipulation have been discussed. In this study, I constructed an artificial intelligence using a genetic algorithm that learns in an artificial market simulation, and investigated whether the artificial intelligence discovers market manipulation through learning with an artificial market simulation despite a builder of artificial 

In [7]:
# data

Preprocess the data set

In [8]:
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")
# tokenized_dataset = [
#     tokenizer(
#         (question if question else "No question provided") + tokenizer.sep_token + 
#         (answer if answer else "No answer provided"), 
#         truncation=True
#     )
#     for qa in qa_pairs
#     for question, answer in [(qa["question"], qa["answer"])]
# ]


In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")

# Initialize the tokenized dataset list
tokenized_dataset = []

# Iterate through the QA pairs
for qa in qa_pairs:
    question = qa["question"]
    answer = qa["answer"]

    # Debug: Check the type of question and answer
    if not isinstance(question, str):
        print(f"Warning: Expected a string for 'question', but got {type(question)}")
        question = str(question) if question else "No question provided"
    
    if not isinstance(answer, str):
        print(f"Warning: Expected a string for 'answer', but got {type(answer)}")
        answer = str(answer) if answer else "No answer provided"
    
    # Concatenate question and answer safely
    combined_input = question + str(tokenizer.sep_token) + answer
    
    # Tokenize the combined input and append to the dataset
    tokenized_dataset.append(tokenizer(combined_input, truncation=True))

# Debug: Check some of the tokenized dataset entries
print(tokenized_dataset[:2])


  from .autonotebook import tqdm as notebook_tqdm


[{'input_ids': [3838, 374, 279, 3412, 5244, 315, 279, 5567, 24849, 364, 785, 80589, 315, 27379, 58194, 21392, 69990, 4064, 39253, 20443, 11229, 646, 12118, 311, 387, 825, 315, 279, 1429, 2989, 11513, 315, 279, 20443, 11229, 13, 576, 34086, 315, 6961, 20443, 11229, 1035, 6979, 1181, 8480, 24514, 3766, 304, 279, 8232, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, {'input_ids': [3838, 374, 279, 3412, 5244, 315, 279, 5567, 24849, 364, 21468, 458, 20443, 11229, 2736, 3081, 33686, 448, 1181, 1828, 30922, 30, 1177, 362, 18929, 12111, 46210, 304, 458, 20443, 3081, 19038, 69990, 4064, 15191, 1265, 387, 11430, 448, 11752, 369, 458, 20443, 11229, 16380, 3081, 33686, 614, 1012, 14078, 13, 758, 419, 3920, 11, 358, 20346, 458, 20443, 11229, 1667, 264, 18929, 12111, 429, 46210, 304, 458, 20443, 3081, 19038, 11, 323, 26219, 3425, 279, 20443, 11229, 51014, 3081, 33686,

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")
tokenized_dataset = [
    tokenizer(question + tokenizer.sep_token + answer, truncation=True)
    for qa in qa_pairs
    for question, answer in [(qa["question"], qa["answer"])]
]


In [10]:
# # Print the first few tokenized inputs for verification
# for i, tokenized_item in enumerate(tokenized_dataset[:5]):  # Print the first 5 tokenized entries
#     print(f"Example {i + 1}:")
#     print("Original Question:", qa_pairs[i]["question"])
#     print("Original Answer:", qa_pairs[i]["answer"])
#     print("Tokenized:", tokenized_item)
#     print("\n")


Split the data set

In [11]:
from sklearn.model_selection import train_test_split

# Splitting tokenized data into train and test sets
train_tokenized, test_tokenized = train_test_split(tokenized_dataset, test_size=0.2, random_state=42)

# Now, you can use train_tokenized and test_tokenized for training and evaluation


Prepare the data set to feed

Create a pytorch data set

In [28]:
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import torch

# Custom Dataset class for tokenized data
class QADataset(Dataset):
    def __init__(self, tokenized_data):
        self.tokenized_data = tokenized_data

    def __len__(self):
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.tokenized_data[idx]['input_ids']),
            'attention_mask': torch.tensor(self.tokenized_data[idx]['attention_mask']),
            'labels': torch.tensor(self.tokenized_data[idx]['input_ids']),  # For causal language modeling, labels are usually input_ids
        }



In [29]:
train_dataset = QADataset(train_tokenized)
test_dataset = QADataset(test_tokenized)

Fine tune the model

In [32]:
import torch
print(torch.__version__)

2.6.0+cpu


In [3]:
# # For the acceleration of the training process, 
# from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B-Instruct")

# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=4,
#     num_train_epochs=3,
#     save_total_limit=2,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     tokenizer=tokenizer,
# )

# trainer.train()


In [39]:
import torch
import transformers

print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")


PyTorch version: 2.6.0+cpu
Transformers version: 4.49.0


Create a tensorflow data set

In [None]:
# import tensorflow as tf

# # Function to convert the tokenized data into a TensorFlow Dataset
# def create_tf_dataset(tokenized_dataset):
#     input_ids = [item['input_ids'] for item in tokenized_dataset]
#     attention_masks = [item['attention_mask'] for item in tokenized_dataset]
    
#     # The labels are the same as input_ids for language modeling
#     labels = input_ids  # Use input_ids as labels for language modeling tasks
    
#     # Convert the data into TensorFlow Dataset format
#     tf_dataset = tf.data.Dataset.from_tensor_slices(
#         {
#             "input_ids": input_ids,
#             "attention_mask": attention_masks,
#             "labels": labels,  # Add labels here
#         }
#     )
#     return tf_dataset

In [2]:
# # Create the TensorFlow dataset
# train_dataset = create_tf_dataset(train_tokenized)
# test_dataset = create_tf_dataset(test_tokenized)

# # Shuffle and batch the dataset
# train_dataset = train_dataset.shuffle(1000).batch(4)  # Adjust batch size as needed

Try tokenizing again

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")

def tokenize_qa_pairs(qa_pairs):
    questions = [qa["question"] for qa in qa_pairs]
    answers = [qa["answer"] for qa in qa_pairs]
    
    # Tokenize the questions and answers
    tokenized = tokenizer(
        questions,
        answers,
        padding=True,
        truncation=True,
        max_length=512,  # You can adjust max_length based on your model and memory
        return_tensors="np",  # Use numpy format for TensorFlow
    )
    
    return tokenized

tokenized_qa = tokenize_qa_pairs(data)
print(tokenized_qa.keys())  # Check the

dict_keys(['input_ids', 'attention_mask'])


In [13]:
from sklearn.model_selection import train_test_split

# Split the data into train and validation sets (80% train, 20% validation)
input_ids_train, input_ids_val, attention_mask_train, attention_mask_val, labels_train, labels_val = train_test_split(
    tokenized_qa["input_ids"],
    tokenized_qa["attention_mask"],
    tokenized_qa["input_ids"],  # For language models, labels are typically the same as input_ids
    test_size=0.2,  # 20% for validation
    random_state=42
)

# Check the sizes of the splits
print(f"Training set size: {len(input_ids_train)}")
print(f"Validation set size: {len(input_ids_val)}")

Training set size: 80
Validation set size: 20


In [14]:
import tensorflow as tf
def create_tf_dataset(input_ids, attention_mask, labels):
    tf_dataset = tf.data.Dataset.from_tensor_slices({
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    })
    return tf_dataset

# Create TensorFlow datasets for training and validation
train_dataset = create_tf_dataset(input_ids_train, attention_mask_train, labels_train)
val_dataset = create_tf_dataset(input_ids_val, attention_mask_val, labels_val)

# Shuffle and batch the datasets
train_dataset = train_dataset.shuffle(1000).batch(4).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(4).prefetch(tf.data.AUTOTUNE)

# Check a batch from the training dataset
for batch in train_dataset.take(1):
    print(batch)

# Check a batch from the validation dataset
for batch in val_dataset.take(1):
    print(batch)


{'input_ids': <tf.Tensor: shape=(4, 326), dtype=int32, numpy=
array([[  3838,    374,    279, ..., 151643, 151643, 151643],
       [  3838,    374,    279, ..., 151643, 151643, 151643],
       [  3838,    374,    279, ..., 151643, 151643, 151643],
       [  3838,    374,    279, ..., 151643, 151643, 151643]])>, 'attention_mask': <tf.Tensor: shape=(4, 326), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>, 'labels': <tf.Tensor: shape=(4, 326), dtype=int32, numpy=
array([[  3838,    374,    279, ..., 151643, 151643, 151643],
       [  3838,    374,    279, ..., 151643, 151643, 151643],
       [  3838,    374,    279, ..., 151643, 151643, 151643],
       [  3838,    374,    279, ..., 151643, 151643, 151643]])>}
{'input_ids': <tf.Tensor: shape=(4, 326), dtype=int32, numpy=
array([[  3838,    374,    279, ..., 151643, 151643, 151643],
       [  3838,    374,    279, ..., 151643, 151643, 1516

careate tf datset

In [12]:
# import tensorflow as tf

# # Create TensorFlow dataset from tokenized data
# def create_tf_dataset(tokenized_data):
#     input_ids = tokenized_data["input_ids"]
#     attention_mask = tokenized_data["attention_mask"]
#     labels = input_ids  # For language modeling, labels are usually the same as input_ids
    
#     # Create TensorFlow Dataset
#     tf_dataset = tf.data.Dataset.from_tensor_slices({
#         "input_ids": input_ids,
#         "attention_mask": attention_mask,
#         "labels": labels,
#     })
    
#     return tf_dataset

# # Create TensorFlow dataset for training
# train_dataset = create_tf_dataset(tokenized_qa)

# # Check a batch of the dataset
# for batch in train_dataset.take(1):
#     print(batch)

In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the tokenizer and model
model_name = "Qwen/Qwen2.5-3B-Instruct"  # Replace with the actual model name you are using
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 2/2 [01:08<00:00, 34.50s/it]


In [16]:
# Save the model and tokenizer locally
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")


('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\vocab.json',
 './saved_model\\merges.txt',
 './saved_model\\added_tokens.json',
 './saved_model\\tokenizer.json')

In [21]:
from auto_gptq import AutoGPTQForCausalLM, AutoGPTQTokenizer
from transformers import AutoTokenizer


ModuleNotFoundError: No module named 'auto_gptq'

Loading the model with 4bit quantization

In [20]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Path to save quantized model
quantized_model_path = "./quantized_model"

# Load the model and tokenizer
model_name = "Qwen/Qwen2.5-3B-Instruct"  # Replace with the actual model name you are using
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Configure for 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",  # NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.float16  # Use float16 for computation
)

# Load and quantize the model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",  # Automatically map model layers to available devices (e.g., GPU/CPU)
)

# Save the quantized model
model.save_pretrained(quantized_model_path)
tokenizer.save_pretrained(quantized_model_path)

print(f"Quantized model saved at {quantized_model_path}")


PackageNotFoundError: No package metadata was found for bitsandbytes

bitsandbytesconfig use


In [18]:
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# model_name = "Qwen/Qwen2.5-3B-Instruct"

# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# # Set 4-bit quantization configuration
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",  # NormalFloat4 (nf4) quantization
#     bnb_4bit_compute_dtype="float16",  # Use float16 for computation
#     bnb_4bit_use_double_quant=True  # Optional: double quantization
# )

# # Load the model with 4-bit quantization
# model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)


In [1]:
# from transformers import BitsAndBytesConfig

# quantization_config = BitsAndBytesConfig(load_in_4bit=True)
# model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config)

Save the model in gg

use auto_gptq

In [19]:
# from transformers import AutoTokenizer
# from auto_gptq.modeling import AutoGPTQForCausalLM
# from auto_gptq.quantization import QuantizationConfig

# # from auto_gptq import AutoGPTQForCausalLM, QuantizationConfig

# # Define model name


peft library from hugging face

In [20]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType

# Load the model and tokenizer
# model_name = "Qwen/Qwen2.5-3B-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)

# Apply PEFT quantization (you can adjust settings for your model)
peft_model = get_peft_model(model, LoraConfig(task_type=TaskType.CAUSAL_LM))

# Save the quantized model
peft_model.save_pretrained("quantized_model_peft")


ModuleNotFoundError: No module named 'peft'

use gptq