Experiment 1: Simply choosing MCQ of either pick or place

Experiment 2a: Generating an MCQ option with random objects across the dataset

Experiment 2a: Generating an MCQ option with objects that are within the frame

Training 1: Setting up a code to train the model

^Begin by focusing on the Phi3 model with its speed 
Within my dataset, the smallest is with 4 frames, select the first 4 frames (prefer earlier prediction)

In [1]:
import os
import pandas as pd
import requests
from io import BytesIO
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoModelForCausalLM, AutoProcessor
from torchvision import transforms
from PIL import Image
import torch.optim as optim
import random
import wandb
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torchvision.transforms.functional import resize, to_pil_image



In [2]:
class AriaDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length, image_size):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.tokenizer.padding_side = 'left'
        self.max_length = max_length
        self.image_size = image_size

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]

        # Validate row data
        if not all(col in row for col in ['action', 'annotation', 'action_class', 'Folder name']):
            print(f"Invalid row data at index {idx}: {row}")
            return None
        
        messages = [{"role": "user", "content": f"<|image_1|><|image_2|><|image_3|><|image_4|>Analyze this sequence of frames where the red spot shows "
            f"the user's eye gaze. Identify whether the user is performing a pick or place task. Output '0' if it is pick or '1' if it is place.<|end|>"
            }]
        
        prompt = processor.tokenizer.apply_chat_template(
          messages, 
          tokenize=False, 
          add_generation_prompt=True
        )

        folder_path = os.path.join("/home/uril/hot3d/hot3d/dataset/mcq/all_frames", row['Folder name'])

        # Load images
        img = []
        try:
            files = sorted(os.listdir(folder_path))  # Ensure consistent ordering
            for file in files:
                if file.endswith('.jpg'):
                    image_path = os.path.join(folder_path, file)
                    image = Image.open(image_path).convert("RGB").resize((self.image_size, self.image_size))
                    img.append(image)
                    if len(img) == 4:
                        break  # Only take the first 4 images
        except Exception as e:
            print(f"Error accessing folder {folder_path}: {e}")
            return None
    
        if len(img) < 4:
            print(f"Insufficient images in folder {folder_path}, found {len(img)} images.")
            return None
            
        #pixel_values = torch.stack([processor.image_processor(image) for image in img])  # Add sequence dimension
        #image_sizes = torch.tensor([image.size for image in img])  # Size for each image
        try:
            pixel_values = torch.stack([processor.image_processor(image)["pixel_values"].squeeze(0) for image in img])  # Extract tensors
            image_sizes = torch.tensor([image.size for image in img])  # Store original sizes
        except Exception as e:
            print(f"Error processing images: {e}")
            return None

        inputs = processor(
            prompt,
            images=img,
            return_tensors="pt",
            padding=True
        )
        inputs['attention_mask'] = inputs['attention_mask']
        inputs['input_ids'] = inputs['input_ids']
        inputs['pixel_values'] = inputs['pixel_values']
        inputs['image_sizes'] = inputs['image_sizes']
        inputs['label'] = torch.tensor(row['action_class'], dtype=torch.long)
        
        return inputs

dataset_path = '/home/uril/hot3d/hot3d/dataset/mcq/pick_place_classification.csv'
df = pd.read_csv(dataset_path)


# Initialize processor and tokenizer
model_id = "microsoft/Phi-3.5-vision-instruct" 
processor = AutoProcessor.from_pretrained(model_id, 
  trust_remote_code=True, 
  num_crops=4
) 
tokenizer = processor.tokenizer

# Split dataset into training and validation sets
train_size = int(0.9 * len(df))
val_size = len(df) - train_size
train_indices, val_indices = random_split(range(len(df)), [train_size, val_size])
train_indices = train_indices.indices
val_indices = val_indices.indices
train_df = df.iloc[train_indices]
val_df = df.iloc[val_indices]


# Create dataset and dataloader
train_dataset = AriaDataset(train_df, tokenizer, max_length=256, image_size=128)
val_dataset = AriaDataset(val_df, tokenizer, max_length=256, image_size=128)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)





In [3]:
torch.manual_seed(3)

run = wandb.init(project="pick-place-phi3")


# Initialize model
model = AutoModelForCausalLM.from_pretrained(
  model_id, 
  device_map="cuda", 
  trust_remote_code=True, 
  torch_dtype="auto", 
  _attn_implementation='flash_attention_2'    
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
# Training loop
num_epochs = 10
eval_interval = 150  # Evaluate every 'eval_interval' steps
loss_scaling_factor = 1000.0  # Variable to scale the loss by a certain amount
save_dir = './saved_models'
step = 0
accumulation_steps = 64  # Accumulate gradients over this many steps


if not os.path.exists(save_dir):
    os.makedirs(save_dir)


best_val_loss = float('inf')
best_model_path = None


# Select 10 images from the validation set for logging
num_log_samples = 2
log_indices = random.sample(range(len(val_dataset)), num_log_samples)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mttyh[0m ([33mttyh-ucla[0m). Use [1m`wandb login --relogin`[0m to force relogin


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def evaluate(model, val_loader, device, tokenizer, step, log_indices, max_samples=None):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for i, batch in enumerate(val_loader):
            if max_samples and i >= max_samples:
                break
            if batch is None:
                continue
            
            action = batch['label'][0].long().to(device)
            del batch['label']
            inputs = inputs[0].to("cuda:0") 

            generate_ids = run_model(inputs, generation_args, model, processor)
            predicted_text = processor.batch_decode(generate_ids, 
              skip_special_tokens=True, 
              clean_up_tokenization_spaces=False)[0] 
            criterion = nn.CrossEntropyLoss()

            loss = criterion(predicted_text, str(action)) 

            total_loss += loss.item()
            total_correct += (predictions == action).sum().item()
            total_samples += action.size(0)

    avg_loss = total_loss / len(val_loader)
    accuracy = total_correct / total_samples

    wandb.log({
        "Validation Loss": avg_loss,
        "Validation Accuracy": accuracy,
        "Step": step
    })

    model.train()
    return avg_loss, accuracy


In [5]:
def run_model(inputs, generation_args, model, processor):
    generate_ids = model.generate(**inputs, 
      eos_token_id=processor.tokenizer.eos_token_id, 
      **generation_args
    )
    
    # remove input tokens 
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    return generate_ids


In [13]:
model.train()
for epoch in range(num_epochs):  # Number of epochs
    total_train_loss = 0
    total_correct = 0
    total_samples = 0
    batch_count = 0

    for batch in train_loader:
        step += 1
        if batch is None:  # Skip if the batch is None
            continue

        
        action = batch['label'][0].long().to(device)
        del batch['label']
        batch['attention_mask']=batch['attention_mask'][0]
        batch['input_ids']=batch['input_ids'][0]
        batch['pixel_values']=batch['pixel_values'][0]
        batch['image_sizes']=batch['image_sizes'][0]
        inputs = batch.to("cuda:0") 
        
        generation_args = { 
            "max_new_tokens": 1000, 
            "temperature": 0.0, 
            "do_sample": False, 
        } 
        print(inputs['attention_mask'].shape)
        print(inputs['input_ids'].shape)
        print(inputs['pixel_values'].shape)
        print(inputs['image_sizes'].shape)
        generate_ids = run_model(inputs, generation_args, model, processor)
        predicted_text = processor.batch_decode(generate_ids, 
          skip_special_tokens=True, 
          clean_up_tokenization_spaces=False)[0] 

        print('predicted_text: ', predicted_text)
        print('action: ', action)
        
        criterion = nn.CrossEntropyLoss()
        loss = criterion(torch.tensor(float(predicted_text)), torch.tensor(float(action))) 

        total_loss += loss.item()
        total_correct += (predictions == action).sum().item()
        total_samples += action.size(0)

        total_loss.backward()
        if (step % accumulation_steps) == 0:
            for param in model.parameters():
                if param.grad is not None:
                    param.grad /= accumulation_steps
            optimizer.step()
            optimizer.zero_grad()

        total_train_loss += total_loss.item()
        batch_count += 1

        wandb.log({
            "Batch Loss": total_loss.item(),
            "Batch Accuracy": correct_predictions / action.size(0),
            "Step": step
        })

        print(f"Epoch: {epoch}, Step: {step}, Batch Loss: {total_loss.item()}, Batch Accuracy: {correct_predictions / action.size(0)}")

        if step % eval_interval == 0:
            val_loss = evaluate(model, val_loader, device, tokenizer=tokenizer, log_indices=log_indices, step=step)
            wandb.log({
                "Validation Loss": val_loss,
                "Step": step
            })
            print(f"Step: {step}, Validation Loss: {val_loss}")

            # Save the best model
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_path = os.path.join(save_dir, f"best_model")
                model.save_pretrained(best_model_path, safe_serialization=False)
                tokenizer.save_pretrained(best_model_path)

    # Calculate average training loss for the epoch
    avg_train_loss = total_train_loss / batch_count if batch_count > 0 else float('inf')
    train_accuracy = total_correct / total_samples if total_samples > 0 else 0


    wandb.log({
        "Epoch": epoch,
        "Average Training Loss": avg_train_loss,
        "Training Accuracy": train_accuracy,
    })

    print(f"Epoch: {epoch}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}")

    if best_model_path:
        run.log_model(
            path=best_model_path,
            name="phi3-v-aria",
            aliases=["best"],
        )

wandb.finish()

torch.Size([1, 3091])
torch.Size([1, 3091])
torch.Size([4, 5, 3, 336, 336])
torch.Size([4, 2])
predicted_text:  1
action:  tensor(1, device='cuda:0')


IndexError: Dimension specified as 1 but tensor has no dimensions

In [20]:
response = processor.batch_decode(generate_ids.sequences, 
  skip_special_tokens=True, 
  clean_up_tokenization_spaces=False)[0] 


OverflowError: out of range integral type conversion attempted

In [21]:
print(generate_ids)
print(generate_ids['past_key_values'])

GenerateDecoderOnlyOutput(sequences=tensor([[    1, 32010, 29871,  ..., 29896, 32007, 32000]], device='cuda:0'), scores=None, logits=None, attentions=None, hidden_states=None, past_key_values=((tensor([[[[-2.7344e-01, -3.8452e-03, -1.4709e-02,  ...,  2.1680e-01,
           -5.4688e-01, -3.3594e-01],
          [ 1.1035e-01,  1.9629e-01, -2.6953e-01,  ...,  2.3633e-01,
           -4.6875e-01, -3.0469e-01],
          [ 5.7031e-01,  2.8711e-01, -3.3984e-01,  ...,  1.8555e-01,
           -1.5000e+00, -5.8594e-01],
          ...,
          [ 5.7031e-01,  2.9688e-01,  2.7930e-01,  ...,  1.2500e-01,
           -1.4922e+00, -5.8594e-01],
          [ 4.9316e-02,  3.0078e-01,  2.3926e-01,  ..., -2.1484e-01,
           -5.8203e-01, -1.1377e-01],
          [-5.8594e-01, -4.9023e-01, -6.1035e-02,  ..., -9.1553e-03,
            1.9043e-01,  1.2598e-01]],

         [[ 1.0156e-01, -2.0801e-01, -2.5781e-01,  ...,  3.7994e-03,
            3.4424e-02,  1.6797e-01],
          [ 1.0938e-01, -2.9492e-01, -3.

In [58]:

# Forward pass
model.eval()

generation_args = { 
    "max_new_tokens": 1000, 
    "temperature": 0.0, 
    "do_sample": False, 
} 


with torch.no_grad():
    generate_ids = run_model(inputs, generation_args, model, processor)
    
# Decode predictions
predicted_text = processor.batch_decode(generate_ids, 
      skip_special_tokens=True, 
      clean_up_tokenization_spaces=False)[0] 

# Display results
print("Ground Truth Action:", ground_truth_action)
print("Predicted Text:", predicted_text)



Ground Truth Action: tensor(0)
Predicted Text: 1


Did not work:


In [2]:
class AriaDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length, image_size):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.tokenizer.padding_side = 'left'
        self.max_length = max_length
        self.image_size = image_size

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]

        # Validate row data
        if not all(col in row for col in ['action', 'annotation', 'action_class', 'Folder name']):
            print(f"Invalid row data at index {idx}: {row}")
            return None
        
        text = (
            f"<|user|>\n<|image_1|><|image_2|><|image_3|><|image_4|>Analyze this sequence of frames where the red spot shows "
            f"the user's eye gaze. Identify whether the user is performing a pick or place task. Output '0' if it is pick or '1' if it is place.<|end|>"
            f"<|assistant|>\n {row['action_class']}<|end|>"
        )

        folder_path = os.path.join("/home/uril/hot3d/hot3d/dataset/mcq/all_frames", row['Folder name'])

        # Tokenize text
        encodings = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length)

        # Load images
        img = []
        try:
            files = os.listdir(folder_path)
            for file in sorted(files)[:4]:  # Sort to ensure consistent ordering
                if file.endswith('.jpg'):
                    try:
                        image_path = os.path.join(folder_path, file)
                        image = Image.open(image_path).convert("RGB").resize((self.image_size, self.image_size))
                        img.append(np.array(image))
                    except Exception as e:
                        print(f"Error opening image {file} in {folder_path}: {e}")
        except (FileNotFoundError, IOError) as e:
            print(f"Error accessing folder {folder_path}: {e}")
            return None

        if len(img) < 4:
            print(f"Insufficient images in folder {folder_path}, found {len(img)} images.")
            return None

        encodings['pixel_values'] = torch.tensor(np.stack(img)).permute(0, 3, 1, 2)  # Convert to tensor
        encodings['task'] = torch.tensor(row['action_class'], dtype=torch.long)

        return {
            key: torch.tensor(val, dtype=torch.long) if isinstance(val, list) else torch.tensor(val)
            for key, val in encodings.items()
        }



In [3]:
dataset_path = '/home/uril/hot3d/hot3d/dataset/mcq/pick_place_classification.csv'
df = pd.read_csv(dataset_path)


# Initialize processor and tokenizer
model_id = "microsoft/Phi-3.5-vision-instruct" 
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
tokenizer = processor.tokenizer

# Split dataset into training and validation sets
train_size = int(0.9 * len(df))
val_size = len(df) - train_size
train_indices, val_indices = random_split(range(len(df)), [train_size, val_size])
train_indices = train_indices.indices
val_indices = val_indices.indices
train_df = df.iloc[train_indices]
val_df = df.iloc[val_indices]


# Create dataset and dataloader
train_dataset = AriaDataset(train_df, tokenizer, max_length=256, image_size=128)
val_dataset = AriaDataset(val_df, tokenizer, max_length=256, image_size=128)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)





In [4]:
def evaluate(model, val_loader, device, tokenizer, step, log_indices, max_samples=None):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for i, batch in enumerate(val_loader):
            if max_samples and i >= max_samples:
                break
            if batch is None:
                continue

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            labels = input_ids.clone().detach()
            action = batch['task'].long().to(device)

            outputs = model(
                input_ids=input_ids, 
                attention_mask=attention_mask, 
                pixel_values=pixel_values, 
                labels=labels
            )
            loss = outputs.loss
            logits = outputs.logits[:, -1, :]
            predictions = torch.argmax(logits, dim=-1)

            total_loss += loss.item()
            total_correct += (predictions == action).sum().item()
            total_samples += action.size(0)

    avg_loss = total_loss / len(val_loader)
    accuracy = total_correct / total_samples

    wandb.log({
        "Validation Loss": avg_loss,
        "Validation Accuracy": accuracy,
        "Step": step
    })

    model.train()
    return avg_loss, accuracy


In [5]:
torch.manual_seed(3)

run = wandb.init(project="pick-place-phi3")


# Initialize model
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda", trust_remote_code=True, torch_dtype="auto")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
# Training loop
num_epochs = 10
eval_interval = 150  # Evaluate every 'eval_interval' steps
loss_scaling_factor = 1000.0  # Variable to scale the loss by a certain amount
save_dir = './saved_models'
step = 0
accumulation_steps = 64  # Accumulate gradients over this many steps


if not os.path.exists(save_dir):
    os.makedirs(save_dir)


best_val_loss = float('inf')
best_model_path = None


# Select 10 images from the validation set for logging
num_log_samples = 2
log_indices = random.sample(range(len(val_dataset)), num_log_samples)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mttyh[0m ([33mttyh-ucla[0m). Use [1m`wandb login --relogin`[0m to force relogin


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def evaluate(model, val_loader, device, tokenizer, step, log_indices, max_samples=None, ):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    log_images = []
    log_gt_texts = []
    log_pred_texts = []
    table = wandb.Table(columns=["Image", "Ground Truth Text", "Predicted Text"])

    with torch.no_grad():
        for i, batch in enumerate(val_loader):
            if max_samples and i >= max_samples:
                break


            if batch is None:  # Skip if the batch is None
                continue


            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            labels = input_ids.clone().detach()
            action = batch['task'].item()


            outputs = model(
                input_ids=input_ids, 
                attention_mask=attention_mask, 
                pixel_values=pixel_values, 
                labels=labels
            )
            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits[:, -1, :]
            predictions = torch.argmax(outputs.logits, dim=-1)
            print('logits: ', logits)
            print('predictions: ', predictions)
            
            correct_predictions = (predictions == action).sum().item()
            total_correct += correct_predictions
            total_samples += action.size(0)
            
            # Log images, ground truth texts, and predicted texts
            if i in log_indices:
                log_images.append(pixel_values.cpu().squeeze().numpy())
                log_gt_texts.append(tokenizer.decode(labels[0], skip_special_tokens=True))
                log_pred_texts.append(tokenizer.decode(predictions[0], skip_special_tokens=True))


                # Convert image to PIL format
                pil_img = to_pil_image(resize(torch.from_numpy(log_images[-1]).permute(2, 0, 1), (336, 336))).convert("RGB")
                
                # Add data to the table
                table.add_data(wandb.Image(pil_img), log_gt_texts[-1], log_pred_texts[-1])


                # Log the table incrementally
    
    wandb.log({"Evaluation Results step {}".format(step): table, "Step": step})
    avg_loss = total_loss / (i + 1)  # i+1 to account for the loop index
    val_accuracy = total_correct / total_samples if total_samples > 0 else 0

    wandb.log({
        "Validation Loss": avg_loss,
        "Validation Accuracy": val_accuracy,
        "Step": step
    })
    model.train()
    return avg_loss, val_accuracy

model.train()
for epoch in range(num_epochs):  # Number of epochs
    total_train_loss = 0
    total_correct = 0
    total_samples = 0
    batch_count = 0

    for batch in train_loader:
        step += 1
        if batch is None:  # Skip if the batch is None
            continue

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        labels = input_ids.clone().detach()
        action = batch['task'].float().to(device)

        outputs = model(
            input_ids=input_ids, 
            attention_mask=attention_mask, 
            pixel_values=pixel_values, 
            labels=labels
        )
        loss = outputs.loss
        total_loss = loss
        logits = outputs.logits[:, -1, :] 
        predictions = torch.argmax(outputs.logits, dim=-1)            

        correct_predictions = (predictions == action).sum().item()
        total_correct += correct_predictions
        total_samples += action.size(0)
        
        total_loss.backward()
        if (step % accumulation_steps) == 0:
            for param in model.parameters():
                if param.grad is not None:
                    param.grad /= accumulation_steps
            optimizer.step()
            optimizer.zero_grad()

        total_train_loss += total_loss.item()
        batch_count += 1

        wandb.log({
            "Batch Loss": total_loss.item(),
            "Batch Accuracy": correct_predictions / action.size(0),
            "Step": step
        })

        print(f"Epoch: {epoch}, Step: {step}, Batch Loss: {total_loss.item()}, Batch Accuracy: {correct_predictions / action.size(0)}")

        if step % eval_interval == 0:
            val_loss = evaluate(model, val_loader, device, tokenizer=tokenizer, log_indices=log_indices, step=step)
            wandb.log({
                "Validation Loss": val_loss,
                "Step": step
            })
            print(f"Step: {step}, Validation Loss: {val_loss}")

            # Save the best model
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_path = os.path.join(save_dir, f"best_model")
                model.save_pretrained(best_model_path, safe_serialization=False)
                tokenizer.save_pretrained(best_model_path)

    # Calculate average training loss for the epoch
    avg_train_loss = total_train_loss / batch_count if batch_count > 0 else float('inf')
    train_accuracy = total_correct / total_samples if total_samples > 0 else 0


    wandb.log({
        "Epoch": epoch,
        "Average Training Loss": avg_train_loss,
        "Training Accuracy": train_accuracy,
    })

    print(f"Epoch: {epoch}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}")

    if best_model_path:
        run.log_model(
            path=best_model_path,
            name="phi3-v-aria",
            aliases=["best"],
        )

wandb.finish()


  key: torch.tensor(val, dtype=torch.long) if isinstance(val, list) else torch.tensor(val)


Epoch: 0, Step: 1, Batch Loss: 10.193565368652344, Batch Accuracy: 0.0
Epoch: 0, Step: 2, Batch Loss: 10.191393852233887, Batch Accuracy: 0.0
Epoch: 0, Step: 3, Batch Loss: 10.191393852233887, Batch Accuracy: 0.0
Epoch: 0, Step: 4, Batch Loss: 10.191393852233887, Batch Accuracy: 0.0
Epoch: 0, Step: 5, Batch Loss: 10.193565368652344, Batch Accuracy: 0.0
Epoch: 0, Step: 6, Batch Loss: 10.191393852233887, Batch Accuracy: 0.0
Epoch: 0, Step: 7, Batch Loss: 10.193565368652344, Batch Accuracy: 0.0
Epoch: 0, Step: 8, Batch Loss: 10.191393852233887, Batch Accuracy: 0.0
Epoch: 0, Step: 9, Batch Loss: 10.191393852233887, Batch Accuracy: 0.0
Epoch: 0, Step: 10, Batch Loss: 10.193565368652344, Batch Accuracy: 0.0
Epoch: 0, Step: 11, Batch Loss: 10.193565368652344, Batch Accuracy: 0.0
Epoch: 0, Step: 12, Batch Loss: 10.193565368652344, Batch Accuracy: 0.0
Epoch: 0, Step: 13, Batch Loss: 10.191393852233887, Batch Accuracy: 0.0
Epoch: 0, Step: 14, Batch Loss: 10.191393852233887, Batch Accuracy: 0.0
E

In [None]:
#Checking for a single pass to see what the output is since the accuracy is 0. 
# Forward pass
model.eval()
with torch.no_grad():
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        pixel_values=pixel_values,
    )

# Process logits to get predictions
logits = outputs.logits[:, -1, :]  # Get last token's logits
predictions = torch.argmax(logits, dim=-1)

# Decode predictions
predicted_text = tokenizer.decode(predictions[0], skip_special_tokens=True)

# Display results
print("Ground Truth Action:", ground_truth_action)
print("Predicted Text:", predicted_text)

# Optional: Display the input image
image_tensor = torch.tensor(example['pixel_values'])
for img in image_tensor:
    image = ToPILImage()(img)
    image.show()


In [31]:
predictions = torch.argmax(outputs.logits, dim=-1)
predicted_text = tokenizer.decode(predictions[0], skip_special_tokens=True)

In [32]:
predicted_text

"is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is isícioício1\nnouser|description|>|image_2|>|image_3|>|image_4|>\nalyze the image of images from each first dot is a pro's hand,e direction Theify the the user is looking a task- a action.\n 'pick' if the' aing '1' if it is place.\n \n"

In [None]:
#EVALUATION FROM THE WEBSITE WHICHH I HAVE YET TO TRY TO IMPLEMENT
import weave
import os
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
from PIL import Image
import requests
from io import BytesIO
import base64
from pathlib import Path
import wandb 


# Initialize Weights & Biases run
run = wandb.init(project='burberry-product-price-prediction')
artifact = run.use_artifact('byyoung3/model-registry/phi3-v-burberry:v0', type='model')
artifact_dir = artifact.download()
print(f"Artifact downloaded to: {artifact_dir}")


model_id = "microsoft/Phi-3-vision-128k-instruct" 


try:
    model = AutoModelForCausalLM.from_pretrained(
        artifact_dir, 
        torch_dtype=torch.float16, 
        attn_implementation="flash_attention_2",
        trust_remote_code=True
    )
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
except Exception as e:
    print(f"Error loading model or processor: {e}")
    raise


# Ensure the model is on the correct device
device = 'cuda'
model.to(device)
# Function to run inference on a single image
@weave.op
def run_inference(image_url: str) -> dict:
    try:
        prompt = "<|user|>\n<|image_1|>What is shown in this image?<|end|><|assistant|>\n"        
        # Load image
        image = Image.open(requests.get(image_url, stream=True).raw)
        inputs = processor(prompt, [image], return_tensors="pt").to(device)
        generation_args = { 
            "max_new_tokens": 500, 
            "temperature": 0.0, 
            "do_sample": False, 
        } 


        generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args) 


        # Remove input tokens 
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response_text = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 


        return {
            "predicted_text": response_text,
            "image": image
        }


    except Exception as e:
        print(f"Error during inference: {e}")
        raise


# Initialize Weave project
weave.init('burberry-product-price-prediction')


# Example usage
image_url = "https://assets.burberry.com/is/image/Burberryltd/1C09D316-7A71-472C-8877-91CEFBDB268A?$BBY_V3_SL_1$&wid=1501&hei=1500"
try:
    result = run_inference(image_url)
    print("Predicted Text:", result['predicted_text'])
except Exception as e:
    print(f"Error running inference: {e}")
