<a href="https://colab.research.google.com/github/SteelRaven7dev/question-generating-ai/blob/main/INJAZ_V1_LLaMA_2_13B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers accelerate torch datasets # using LLaMA 13B now, use llama 3.2 later

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:

In [None]:
from transformers import LlamaForCausalLM, LlamaTokenizer

# Load the tokenizer and the model weights.
model_name = "meta-llama/Llama-2-13b-hf"  # Or 'Llama-2-7b-hf'
tokenizer = LlamaTokenizer.from_pretrained(model_name)
# The Llama-2 tokenizer defines no padding token, so any call that pads
# (e.g. padding="max_length") would raise. Reuse EOS as the pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = LlamaForCausalLM.from_pretrained(model_name)

In [None]:
from datasets import load_dataset
# NOTE: the real dataset still has to be created.
# Until then the data is read from CSV files (train.csv / test.csv).
dataset = load_dataset(
    'csv',
    data_files={'train': 'train.csv', 'test': 'test.csv'},
)
train_dataset, test_dataset = (dataset[split] for split in ('train', 'test'))

# Show the first training row as a quick sanity check.
print(train_dataset[0])

In [None]:
# Example syllabus training data (generated with ChatGPT).
# Each record pairs a syllabus point with an exam question and its mark scheme.
training_data = [
    dict(
        syllabus="Photosynthesis: Conversion of light energy to chemical energy.",
        question="Explain the role of light in photosynthesis.",
        mark_scheme="Light provides the energy required to drive the process of photosynthesis, particularly the light-dependent reactions.",
    ),
    dict(
        syllabus="Newton's Laws of Motion: Inertia, force, and acceleration.",
        question="State Newton's first law of motion.",
        mark_scheme="An object at rest stays at rest and an object in motion stays in motion unless acted upon by an external force.",
    ),
]

In [None]:
def tokenize_with_syllabus(examples, max_length=512, tok=None):
    """Encode one training record as a single causal-LM training sequence.

    The syllabus and question form the prompt and the mark scheme is the text
    the model should learn to produce. A causal LM needs ``labels`` to line up
    position-by-position with ``input_ids``, so prompt and answer are joined
    into one sequence; prompt and padding positions are set to -100 in
    ``labels`` so they are excluded from the loss.

    Args:
        examples: Mapping with string values under the keys 'syllabus',
            'question' and 'mark_scheme' (a single record, not a batch).
        max_length: Fixed length of every returned list. Longer sequences are
            truncated on the right, shorter ones are right-padded. Increase it
            for longer texts. If the prompt alone fills ``max_length`` no
            answer token survives and the record contributes nothing to the
            loss.
        tok: Tokenizer to use; defaults to the notebook-level ``tokenizer``.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels', each a list of
        exactly ``max_length`` ints.
    """
    tok = tokenizer if tok is None else tok

    prompt_text = (
        f"Syllabus: {examples['syllabus']}\n"
        f"Question: {examples['question']}\n"
        "Mark scheme:"
    )
    # The prompt keeps the tokenizer's special tokens (e.g. BOS); the answer must
    # not receive a second BOS, and gets an explicit EOS so the model learns to stop.
    prompt_ids = list(tok(prompt_text)['input_ids'])
    answer_ids = list(tok(examples['mark_scheme'], add_special_tokens=False)['input_ids'])
    if tok.eos_token_id is not None:
        answer_ids.append(tok.eos_token_id)

    input_ids = (prompt_ids + answer_ids)[:max_length]
    labels = ([-100] * len(prompt_ids) + answer_ids)[:max_length]
    attention_mask = [1] * len(input_ids)

    # Pad by hand so this works even when the tokenizer has no pad token.
    pad_id = tok.pad_token_id
    if pad_id is None:
        pad_id = tok.eos_token_id if tok.eos_token_id is not None else 0
    n_pad = max_length - len(input_ids)
    input_ids += [pad_id] * n_pad
    labels += [-100] * n_pad
    attention_mask += [0] * n_pad

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Tokenise every example in the training data.
tokenized_data = list(map(tokenize_with_syllabus, training_data))

In [None]:
import torch  # the class below uses torch.*, which no earlier cell imports


class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset over a sequence of pre-tokenised records.

    Each record is a mapping holding integer lists under 'input_ids' and
    'labels', and optionally 'attention_mask'.

    TODO: improve efficiency by allowing for parallel GPU compute.
    """

    def __init__(self, tokenized_data):
        """Keep a reference to the records; nothing is copied or converted here."""
        self.tokenized_data = tokenized_data

    def __len__(self):
        """Return the number of records."""
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        """Return record ``idx`` as a dict of int64 tensors.

        'input_ids' and 'labels' are always present; 'attention_mask' is
        included only when the record carries one.
        """
        record = self.tokenized_data[idx]
        item = {
            'input_ids': torch.tensor(record['input_ids'], dtype=torch.long),
            'labels': torch.tensor(record['labels'], dtype=torch.long),
        }
        if 'attention_mask' in record:
            item['attention_mask'] = torch.tensor(record['attention_mask'], dtype=torch.long)
        return item
# Rebinds train_dataset (earlier bound to the CSV 'train' split) to the in-memory
# tokenised examples; this is the object later handed to the Trainer.
train_dataset = CustomDataset(tokenized_data)

In [None]:
# Print every tokenised example (numbered from 1) so the encoded ids can be inspected.
for example_number, record in enumerate(tokenized_data, start=1):
    print(f"Example {example_number}:")
    print("Input IDs:", record['input_ids'])
    print("Labels:", record['labels'])
    print("\n")

In [None]:
from transformers import Trainer, TrainingArguments
import torch  # needed for the CUDA check below; not imported by any earlier cell

# Training configuration - change with care.
# Author's note: roughly 20-30 epochs is probably best; beyond ~50 the gains fall off sharply.
training_args = TrainingArguments(
    output_dir='./results',              # directory for model checkpoints
    num_train_epochs=3,
    per_device_train_batch_size=2,       # adjust to the available GPU memory
    gradient_accumulation_steps=1,       # batches accumulated per optimizer step
    # Per-epoch evaluation and load_best_model_at_end are intentionally left off:
    # no eval_dataset is passed to the Trainer, and load_best_model_at_end
    # requires the save and evaluation strategies to match, so the previous
    # combination was rejected before training could start.
    logging_dir='./logs',                # where logs are stored
    logging_steps=10,
    fp16=torch.cuda.is_available(),      # mixed precision only when a CUDA device is present
    save_total_limit=2,                  # keep only the latest 2 checkpoints
    dataloader_num_workers=4,            # number of data-loading sub-processes
)
# Initialise the trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

In [None]:
trainer.train() # start training the model

In [None]:
def generate_question(prompt, max_length=50):
    """Generate text for ``prompt`` with the notebook-level model and tokenizer.

    Args:
        prompt: Text to condition the generation on.
        max_length: Upper bound on the total sequence length in tokens
            (prompt tokens included).

    Returns:
        The decoded first returned sequence with special tokens stripped.
    """
    # The model may sit on a GPU (for example after training), so the encoded
    # inputs must be moved to the same device before calling generate().
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Unpack the whole encoding so the attention mask is passed along with the ids.
    outputs = model.generate(**inputs, max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Using the photosynthesis example from before.
subject_topic = "Explain the process of photosynthesis."
generated_question = generate_question(subject_topic)
print(f"Generated Question: {generated_question}")

In [None]:
def generate_markscheme(prompt, max_length=150):
    """Generate mark-scheme text for ``prompt`` with the notebook-level model and tokenizer.

    Args:
        prompt: Text to condition the generation on (here, a generated question).
        max_length: Upper bound on the total sequence length in tokens
            (prompt tokens included).

    Returns:
        The decoded first returned sequence with special tokens stripped.
    """
    # The model may sit on a GPU (for example after training), so the encoded
    # inputs must be moved to the same device before calling generate().
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Unpack the whole encoding so the attention mask is passed along with the ids.
    outputs = model.generate(**inputs, max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example mark scheme for the question generated above.
generated_markscheme = generate_markscheme(generated_question)
print(f"Generated Mark Scheme: {generated_markscheme}")