In [None]:
# prompt: load in 10 images locally, then compute the cosine similarity between their embeddings and the text embedding for "white cat, black dog" after loading in CLIP. These values will be the x axis. Then compute the cosine similarity with the images and the text embedding for "white dog, black cat", this will be the y axis. Then create a scatterplot where each point corresponds to one image. if the image name has wcbd in it, make the points red. if the image name has wdbc in it, make it blue

from transformers import CLIPProcessor, CLIPModel
import torch
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
import os

# Images are read from the current working directory; point `image_dir`
# elsewhere if they live in a sub-folder.
candidate_files = [f"img{k}_wcbd.png" for k in range(1,11)] + [f"img{k}_wdbc.png" for k in range(1,11)]
image_dir = os.getcwd()

# `images` and `image_files` are kept index-aligned: a file that fails to load
# is dropped from BOTH lists. Downstream plotting colours point i by
# image_files[i], so a skipped file must not shift the remaining names.
images = []
image_files = []
for file_name in candidate_files:
    try:
        # Image.open is lazy and keeps the file handle open; copying inside the
        # context manager decodes the pixels now and releases the handle.
        with Image.open(os.path.join(image_dir, file_name)) as img:
            loaded = img.copy()
    except Exception as e:
        # Best-effort loading: report the problem and continue with the rest.
        print(f"Error loading image {file_name}: {e}")
        continue
    images.append(loaded)
    image_files.append(file_name)

def plot_clip_final_scatter(images, texts, image_names=None):
  """Scatter-plot CLIP image/text cosine similarities for two prompts.

  Each image becomes one point: x is its cosine similarity with ``texts[0]``,
  y its cosine similarity with ``texts[1]``. Points are coloured by file name:
  red if the name contains "wcbd", blue if it contains "wdbc", gray otherwise.

  Args:
      images: list of PIL images to embed.
      texts: sequence whose first two entries are the x-axis and y-axis prompts.
      image_names: optional list of file names, index-aligned with ``images``,
          used only for colouring. Defaults to the notebook-level
          ``image_files`` list.

  Raises:
      ValueError: if fewer than two prompts are supplied.

  Note:
      The CLIP weights are loaded on every call.
  """
  if len(texts) < 2:
    raise ValueError("texts must contain two prompts (x axis, y axis).")
  if image_names is None:
    image_names = image_files  # notebook-level list built by the loading cell

  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32').to(device)
  clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

  x_axis = []
  y_axis = []

  with torch.no_grad():
    # The prompt embeddings do not depend on the image, so compute them once.
    # Indexing by position (not by string equality) keeps both axes filled
    # even when the two prompts are identical.
    text_embs = []
    for text in texts[:2]:
      text_inputs = clip_processor(text=text, return_tensors="pt").to(device)
      text_embs.append(clip_model.get_text_features(**text_inputs).cpu().numpy())

    for img in images:
      image_inputs = clip_processor(images=img, return_tensors="pt").to(device)
      image_emb = clip_model.get_image_features(**image_inputs).cpu().numpy()
      x_axis.append(cosine_similarity(image_emb, text_embs[0])[0][0])
      y_axis.append(cosine_similarity(image_emb, text_embs[1])[0][0])

  # One colour per plotted point; points without a name fall back to gray.
  colors = []
  for idx in range(len(x_axis)):
    name = image_names[idx] if idx < len(image_names) else ""
    if "wcbd" in name:
      colors.append('red')
    elif "wdbc" in name:
      colors.append('blue')
    else:
      colors.append('gray')  # Default color for other images

  # Create scatter plot
  plt.figure(figsize=(8, 6))
  plt.scatter(x_axis, y_axis, c=colors)
  plt.xlabel(f"Cosine Similarity with '{texts[0]}'")
  plt.ylabel(f"Cosine Similarity with '{texts[1]}'")
  plt.title("Image Similarity to Text Embeddings")

  plt.show()


In [None]:
# prompt: write a function that takes in a list of 20 PIL images, then imshows the first 10 with a red outline, and then the second 10 with a blue outline.

from PIL import Image, ImageDraw

def plot_images_with_outlines(images):
    """
    Show 20 images in a 2x10 grid, framing the first ten in red and the last ten in blue.

    The frame is drawn on a copy of each image, so the inputs are not modified.
    Its thickness is 1/100 of the shorter image side, with a minimum of 1 pixel.

    Args:
        images: A list of exactly 20 PIL Images.

    Raises:
        ValueError: If the list does not contain exactly 20 images.
    """
    if len(images) != 20:
        raise ValueError("The input list must contain exactly 20 PIL Images.")

    fig, axes = plt.subplots(2, 10, figsize=(20, 4))
    flat_axes = axes.ravel()

    for idx, (ax, picture) in enumerate(zip(flat_axes, images)):
        # Work on a copy so the caller's image stays untouched.
        framed = picture.copy()
        w, h = framed.size

        border_color = "red" if idx < 10 else "blue"
        # Scale the frame with the image; change the divisor for thicker/thinner frames.
        border_px = max(int(min(w, h) / 100), 1)

        ImageDraw.Draw(framed).rectangle(
            [(0, 0), (w - 1, h - 1)], outline=border_color, width=border_px
        )

        ax.imshow(framed)
        ax.axis('off')

    plt.tight_layout()
    plt.show()


In [1]:
import torch
from PIL import Image
import matplotlib.pyplot as plt
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer, CLIPImageProcessor
import numpy as np
import torch.nn.functional as F
import os
import torch.nn as nn



class CrossModalConvNetwork(nn.Module):
    """Score every (image, text) pair from its patch-by-token interaction map.

    Each pair's ``img_seq x text_seq`` map is treated as a one-channel image,
    passed through two 3x3 convolutions (padding 1, so the spatial size is
    preserved) with ``hidden_dim // 32`` channels, flattened, and reduced to a
    single scalar by a two-layer MLP.

    Args:
        img_seq: number of rows of the interaction map (image sequence length).
        text_seq: number of columns of the interaction map (text sequence length).
        hidden_dim: width of the hidden linear layer; also fixes the number of
            convolution channels as ``hidden_dim // 32``.
        dropout_prob: dropout probability used by all four dropout layers.
    """

    def __init__(self, img_seq, text_seq, hidden_dim=128, dropout_prob=0.5):
        super().__init__()
        channels = hidden_dim // 32

        # Two 3x3 convolutions over the (img_seq, text_seq) interaction map.
        self.conv1 = nn.Conv2d(1, channels, kernel_size=(3, 3), padding=1)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=(3, 3), padding=1)

        # Channel-wise dropout after each convolution.
        self.dropout1 = nn.Dropout2d(p=dropout_prob)
        self.dropout2 = nn.Dropout2d(p=dropout_prob)

        # MLP head: flattened conv features -> hidden_dim -> one score.
        self.fc1 = nn.Linear(channels * text_seq * img_seq, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

        # Element-wise dropout after each linear layer (including the output).
        self.dropout3 = nn.Dropout(p=dropout_prob)
        self.dropout4 = nn.Dropout(p=dropout_prob)

    def forward(self, x):
        """Return one score per pair.

        Args:
            x: tensor of shape (n_images, n_texts, img_seq, text_seq).

        Returns:
            Tensor of shape (n_images, n_texts, 1).
        """
        n_img, n_txt = x.size(0), x.size(1)
        n_pairs = n_img * n_txt

        # Fold the two batch axes together and add the single input channel:
        # (n_pairs, 1, img_seq, text_seq).
        feats = x.reshape(n_pairs, 1, x.size(2), x.size(3))

        # conv -> ReLU -> dropout, twice; shape stays (n_pairs, C, img_seq, text_seq).
        for conv, drop in ((self.conv1, self.dropout1), (self.conv2, self.dropout2)):
            feats = drop(torch.relu(conv(feats)))

        # Flatten channels and both sequence axes: (n_pairs, C * img_seq * text_seq).
        feats = feats.view(n_pairs, -1)

        hidden = self.dropout3(torch.relu(self.fc1(feats)))  # (n_pairs, hidden_dim)
        # Dropout is applied to the final score as well; it is a no-op in eval mode.
        scores = self.dropout4(self.fc2(hidden))  # (n_pairs, 1)

        return scores.view(n_img, n_txt, 1)


def standardize(tensor, eps=1e-12):
    """Shift and scale a tensor to zero mean and unit standard deviation.

    The mean and (unbiased) standard deviation are taken over ALL elements.

    Args:
        tensor: floating-point tensor of any shape.
        eps: lower bound applied to the standard deviation. It only takes
            effect when the deviation is zero or undefined (constant or
            single-element input), in which case the result is all zeros
            instead of NaN/inf. For any input whose deviation exceeds ``eps``
            the result is unchanged.

    Returns:
        Tensor of the same shape as ``tensor``.
    """
    mean = tensor.mean()
    # std() is 0 for constant input and NaN for a single element; map both to
    # `eps` so the division below stays finite.
    std = torch.nan_to_num(tensor.std(), nan=0.0, posinf=float("inf")).clamp_min(eps)
    return (tensor - mean) / std


def do_everything(indiv_imgs, texts, batch_size=5, checkpoint_path=None, device=None):
    """Score every (image, text) pair with the pretrained task network on top of CLIP.

    For each batch of images the function builds the patch-by-token cosine
    similarity map between CLIP ViT-B/16 image patch embeddings and CLIP text
    token embeddings, overwrites the columns of "special" words with a learned
    per-word column, standardizes the map and feeds it to
    ``CrossModalConvNetwork``.

    Args:
        indiv_imgs: list of image file paths.
        texts: list of prompts; every image is scored against every prompt.
        batch_size: number of images per forward pass (default 5). The map is
            standardized per batch, so the batch composition influences the
            scores.
        checkpoint_path: path of the task-network checkpoint; defaults to the
            path this notebook was developed with.
        device: torch device string; defaults to "cuda" when available,
            otherwise "cpu".

    Returns:
        A list with one tensor per batch, each of shape
        (images_in_batch, len(texts)). Tensors are computed under
        ``torch.no_grad()`` and carry no autograd graph.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.

    Note:
        The special-word embedding table is randomly initialised here and is
        not restored from the checkpoint, so callers seed torch beforehand to
        get repeatable results when a prompt contains a special word.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1.")
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    if checkpoint_path is None:
        checkpoint_path = r"D:\Ideal_CLIP\Ideal-CLIP-DCSM-private\pretrained_models\dcsm_obja_ckpt.pt"

    # STEP 1:
    # Load the pre-trained CLIP model, tokenizer and image processor.
    clip_name = "openai/clip-vit-base-patch16"
    model = CLIPModel.from_pretrained(clip_name).eval().to(device)
    tokenizer = CLIPTokenizer.from_pretrained(clip_name)
    img_processor = CLIPImageProcessor.from_pretrained(clip_name)

    test_special_words = [
        "above",
        "below",
        "left",
        "right",
        "small", "big", "not", "no", "without", "but", "absent"]

    image_seq = 14 * 14 + 1  # 14x14 patches plus the class token
    text_seq = 30
    hidden_dim = 128

    # One learned column (length image_seq) per special word. Created before the
    # task network so the random initialisation order matches earlier runs.
    external_logic_token_embedding = nn.Embedding(
        len(test_special_words), image_seq
    ).to(device)
    with torch.no_grad():
        external_logic_token_embedding.weight = nn.Parameter(
            (F.normalize(external_logic_token_embedding.weight, p=2, dim=1) + 1) / 3
        )

    task_network = CrossModalConvNetwork(
        img_seq=image_seq,
        text_seq=text_seq,
        hidden_dim=hidden_dim,
        dropout_prob=0,
    ).to(device)

    # NOTE(security): weights_only=False unpickles arbitrary objects; only load
    # checkpoints from a trusted source. map_location lets a checkpoint saved on
    # GPU load on a CPU-only machine.
    loaded = torch.load(checkpoint_path, map_location=device, weights_only=False)
    task_network.load_state_dict(loaded["model_state_dict"])
    task_network.eval()

    output_stack = []
    with torch.no_grad():
        # Token-level text embeddings are identical for every image batch, so
        # they are computed once.
        text_tokens = tokenizer(
            text=texts,
            padding="max_length",
            max_length=text_seq,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        text_features = model.text_model(**text_tokens).last_hidden_state
        text_features = model.text_projection(text_features)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        text_features = text_features.unsqueeze(0)
        # Shape: (1, n_texts, tseq, embed_dim)

        # Columns to overwrite, as (text index, column index, replacement column).
        # NOTE(review): the column index is the word's position in the
        # whitespace-split prompt, whereas the map's columns are tokenizer
        # positions; kept as-is to stay consistent with the checkpoint --
        # confirm against the training code.
        replacements = []
        for k, sentence in enumerate(texts):
            words = sentence.split(" ")
            for word_id, sp_w in enumerate(test_special_words):
                if sp_w not in words:
                    continue
                special_location = words.index(sp_w)
                if special_location >= text_seq:
                    # Beyond the padded/truncated length: nothing to overwrite.
                    continue
                column = external_logic_token_embedding(
                    torch.tensor([word_id], device=device)
                )  # Shape: (1, image_seq)
                replacements.append((k, special_location, column))

        for start in range(0, len(indiv_imgs), batch_size):
            images = []
            for path in indiv_imgs[start:start + batch_size]:
                # convert() returns a new in-memory image, so the file handle
                # can be closed right away.
                with Image.open(path) as handle:
                    images.append(handle.convert("RGB"))

            # Patch-level image embeddings projected into the joint space.
            vision_inputs = img_processor(images=images, return_tensors="pt").to(device)
            last_hidden_state = model.vision_model(**vision_inputs).last_hidden_state
            # Shape: [batch_size, num_patches+1, hidden_size]
            lh1 = model.vision_model.post_layernorm(last_hidden_state)
            lh2 = model.visual_projection(lh1)
            image_features = lh2 / torch.norm(lh2, p=2, dim=-1, keepdim=True)
            image_features = image_features.unsqueeze(1)
            # Shape: (batch_size, 1, iseq, embed_dim)

            # Cosine similarity of every patch with every token, for every
            # (image, text) pair: (n_images, n_texts, iseq, tseq).
            cossim_mat = torch.einsum(
                "bqie,lpte->bpit", image_features, text_features
            )

            # Broadcast each special-word column over the image dimension.
            for k, special_location, column in replacements:
                cossim_mat[:, k, :, special_location] = column

            cossim_mat = standardize(cossim_mat)

            # squeeze(-1) drops only the trailing score axis, so a batch with a
            # single image or a single prompt keeps its 2-D shape.
            task_output = task_network(cossim_mat).squeeze(-1)
            print(task_output)
            output_stack.append(task_output)

    return output_stack




: 

In [None]:
# NOTE(review): the first ten paths are absolute under /content while the last
# ten are relative to the working directory; they point to the same folder only
# when the notebook runs from /content -- confirm.
indiv_imgs = [f"/content/img{k}_wcbd.png" for k in range(1,11)] + [f"img{k}_wdbc.png" for k in range(1,11)]
texts = ["white dog, black cat", "black dog, white cat"]

# do_everything creates randomly initialised layers, so fix the seed first.
torch.manual_seed(15)
output_stack = do_everything(indiv_imgs, texts)

# Rows follow the order of indiv_imgs; column j is the score for texts[j].
output_tensor = torch.cat(output_stack, 0)
scores = output_tensor.detach().cpu().numpy()

# x = score for texts[1], y = score for texts[0]. Colours follow the convention
# of the CLIP baseline plot: wcbd images red, wdbc images blue.
plt.figure(figsize=(8, 6))
plt.scatter(scores[:10, 1], scores[:10, 0], c="red", label="wcbd")
plt.scatter(scores[10:, 1], scores[10:, 0], c="blue", label="wdbc")
# The plotted values are task-network outputs, not cosine similarities.
plt.xlabel("Task-network score ('white cat, black dog')")
plt.ylabel("Task-network score ('white dog, black cat')")
plt.title("Image Similarity to Text Embeddings")
plt.legend()  # without this the label= arguments above are never shown
plt.show()

In [None]:
# Join the per-batch score tensors along the image axis (one row per image).
output_tensor = torch.cat(output_stack, dim=0)

In [None]:
# Move the scores to NumPy once; rows 0-9 are the wcbd images, rows 10+ the
# wdbc images.
scores = output_tensor.detach().cpu().numpy()

# x = column 1, y = column 0 of the score matrix. Colours follow the convention
# of the CLIP baseline plot: wcbd images red, wdbc images blue.
plt.figure(figsize=(8, 6))
plt.scatter(scores[:10, 1], scores[:10, 0], c="red", label="wcbd")
plt.scatter(scores[10:, 1], scores[10:, 0], c="blue", label="wdbc")
# The plotted values are task-network outputs, not cosine similarities.
plt.xlabel("Task-network score ('white cat, black dog')")
plt.ylabel("Task-network score ('white dog, black cat')")
plt.title("Image Similarity to Text Embeddings")
plt.legend()  # without this the label= arguments above are never shown
plt.show()

## Spatial Location

### spatial -- synthetic

In [None]:
import random

# Fixed seed so the shuffled order is identical after Restart & Run All.
SHUFFLE_SEED = 15

# Each tuple is (x_a, row, x_b, row): two objects on the same row `jj`, the
# first at column `kk` and the second `adder` columns to its right, e.g.
# camel_0-0_frog_3-0 is "camel left of frog".
# NOTE(review): kk only takes the values 0 and 1, so the pair (2, 3) is never
# generated -- confirm this is intended.
coord_tuples = [
    (kk, jj, kk + adder, jj)
    for jj in range(4)
    for adder in range(1, 4)
    for kk in range(2)
    if kk + adder < 4
]

# A private Random instance keeps the shuffle reproducible without touching the
# global random state.
random.Random(SHUFFLE_SEED).shuffle(coord_tuples)
len(coord_tuples)