<a href="https://colab.research.google.com/github/ScreanReaderAI/ScreenReaderAIBackend/blob/main/JAWS_Command.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets accelerate huggingface_hub
!huggingface-cli login


from transformers import GPT2TokenizerFast, Trainer, TrainingArguments, GPT2LMHeadModel, DataCollatorForLanguageModeling
from torch.nn.utils.rnn import pad_sequence
import torch
from torch.utils.data import Dataset
import pandas as pd

# Step 1: Define the DataFrame with your dataset
data = {
    "Category": ["Web Browsing", "Document Editing", "File Management"],
    "Software": ["Chrome", "Word", "Windows Explorer"],
    "Question": [
        "How do I open a new tab in Chrome?",
        "How do I format text in Word?",
        "How do I create a new folder?"
    ],
    "Step-by-Step Instructions": [
        "Press CTRL+T to open a new tab.",
        "Press ALT+H to access the Home ribbon, then F for font options.",
        "Press CTRL+SHIFT+N to create a new folder."
    ]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Step 2: Prepare the Data
class JAWSDataset(Dataset):
    def __init__(self, input_ids, labels):
        self.input_ids = input_ids
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'labels': self.labels[idx]
        }

def prepare_data(df):
    # Load GPT-2 tokenizer
    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

    # Ensure the pad token is properly set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    input_ids_list = []
    labels_list = []

    # Tokenize the data
    for _, row in df.iterrows():
        input_text = row['Question']
        output_text = row['Step-by-Step Instructions']

        input_ids = tokenizer.encode(input_text, add_special_tokens=True)
        output_ids = tokenizer.encode(output_text, add_special_tokens=True)

        input_ids_list.append(torch.tensor(input_ids, dtype=torch.long))
        labels_list.append(torch.tensor(output_ids, dtype=torch.long))

    # Pad the sequences so they all have the same length
    input_ids_padded = pad_sequence(input_ids_list, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels_padded = pad_sequence(labels_list, batch_first=True, padding_value=-100)  # Padding label with -100 for loss calculation

    return input_ids_padded, labels_padded, tokenizer

# Step 3: Prepare Dataset and Trainer
if __name__ == "__main__":
    input_ids_padded, labels_padded, tokenizer = prepare_data(df)
    dataset = JAWSDataset(input_ids_padded, labels_padded)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        logging_dir="./logs",
        logging_steps=10,  # Log every 10 steps
        save_steps=50      # Save checkpoint every 50 steps
    )

    # Load pre-trained GPT-2 model
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Define a data collator that will dynamically pad the inputs and labels for each batch
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False  # We are not using masked language modeling, as we are training GPT-2
    )

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator
    )

    # Start training
    trainer.train()


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:

In [None]:
        !pip install transformers datasets accelerate huggingface_hub
!huggingface-cli login


from transformers import GPT2TokenizerFast, Trainer, TrainingArguments, GPT2LMHeadModel, DataCollatorForLanguageModeling
from torch.nn.utils.rnn import pad_sequence
import torch
from torch.utils.data import Dataset
import pandas as pd

# Step 1: Define the DataFrame with your dataset
data = {
    "Category": ["Web Browsing", "Document Editing", "File Management"],
    "Software": ["Chrome", "Word", "Windows Explorer"],
    "Question": [
        "How do I open a new tab in Chrome?",
        "How do I format text in Word?",
        "How do I create a new folder?"
    ],
    "Step-by-Step Instructions": [
        "Press CTRL+T to open a new tab.",
        "Press ALT+H to access the Home ribbon, then F for font options.",
        "Press CTRL+SHIFT+N to create a new folder."
    ]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Step 2: Prepare the Data
class JAWSDataset(Dataset):
    def __init__(self, input_ids, labels):
        self.input_ids = input_ids
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'labels': self.labels[idx]
        }

def prepare_data(df):
    # Load GPT-2 tokenizer
    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

    # Ensure the pad token is properly set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    input_ids_list = []
    labels_list = []

    # Tokenize the data
    for _, row in df.iterrows():
        input_text = row['Question']
        output_text = row['Step-by-Step Instructions']

        input_ids = tokenizer.encode(input_text, add_special_tokens=True)
        output_ids = tokenizer.encode(output_text, add_special_tokens=True)

        input_ids_list.append(torch.tensor(input_ids, dtype=torch.long))
        labels_list.append(torch.tensor(output_ids, dtype=torch.long))

    # Pad the sequences so they all have the same length
    input_ids_padded = pad_sequence(input_ids_list, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels_padded = pad_sequence(labels_list, batch_first=True, padding_value=-100)  # Padding label with -100 for loss calculation

    return input_ids_padded, labels_padded, tokenizer

# Step 3: Prepare Dataset and Trainer
if __name__ == "__main__":
    input_ids_padded, labels_padded, tokenizer = prepare_data(df)
    dataset = JAWSDataset(input_ids_padded, labels_padded)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        logging_dir="./logs",
        logging_steps=10,  # Log every 10 steps
        save_steps=50      # Save checkpoint every 50 steps
    )

    # Load pre-trained GPT-2 model
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Define a data collator that will dynamically pad the inputs and labels for each batch
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False  # We are not using masked language modeling, as we are training GPT-2
    )

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator
    )

    # Start training
    model.save_pretrained("./trained_jaws_model")




In [3]:
# Save the trained model and tokenizer
model.save_pretrained("/content/finai_model")
tokenizer.save_pretrained("/content/finai_model")

# Compress the saved model files into a .zip
!zip -r finai_model.zip /content/finai_model

# Download the zipped model file to your local computer
from google.colab import files
files.download("finai_model.zip")


  adding: content/finai_model/ (stored 0%)
  adding: content/finai_model/merges.txt (deflated 53%)
  adding: content/finai_model/config.json (deflated 52%)
  adding: content/finai_model/model.safetensors


zip error: Interrupted (aborting)


FileNotFoundError: Cannot find file: finai_model.zip