In [13]:
from transformers import AutoTokenizer, CLIPTextModel
from PIL import Image
import os
import numpy as np
import torch
import pandas as pd

In [6]:
# Directory that holds the ZuCo task-material CSV files.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
base_path = r"D:\Vijay\NYU\Spring_25\BDMLS\Project\dataset\ZuCo\2urht\osfstorage\task_materials"

# Load the sentence CSVs: files whose name starts with 'nr', excluding the
# '*control_questions.csv' files.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make the insertion order of `dfs` (and therefore the row order of anything
# built from it) reproducible across machines and filesystems.
# The sort is lexicographic: a hypothetical 'nr_10.csv' would come before 'nr_2.csv'.
all_paths = sorted(os.listdir(base_path))
dfs = {}

for file_name in all_paths:
    if file_name.startswith('nr') and not file_name.endswith('control_questions.csv'):
        # The files are ';'-separated and have no header row, so the column
        # names are supplied explicitly.
        df = pd.read_csv(
            os.path.join(base_path, file_name),
            sep=';',
            header=None,
            names=['sent_id', 'par_id', 'sent', 'control_flag'],
        )
        dfs[file_name] = df

In [7]:
# Show which file names were loaded (the keys of `dfs`).
dfs.keys()

dict_keys(['nr_1.csv', 'nr_2.csv', 'nr_3.csv', 'nr_4.csv', 'nr_5.csv', 'nr_6.csv', 'nr_7.csv'])

In [8]:
# Preview the first five rows of the frame loaded from 'nr_1.csv'.
dfs['nr_1.csv'].head(5)

Unnamed: 0,sent_id,par_id,sent,control_flag
0,1,1,"Henry Ford (July 30, 1863 - April 7, 1947) was...",
1,2,1,"Henry Ford, with eleven other investors and $2...",CONTROL
2,3,1,"On January 1, 1919, after unsuccessfully seeki...",
3,4,1,"Henry Ford, with his son Edsel, founded the Fo...",
4,5,1,"After this initial success, Ford left Edison I...",


In [15]:
# Stack the per-file frames row-wise into one frame; ignore_index=True replaces
# the per-file indices with a single fresh 0..n-1 index.
df = pd.concat(dfs.values(), ignore_index=True)

# Only the last expression of a cell is rendered, so display the
# (rows, columns) shape of the combined frame here.
df.shape

(370, 4)

In [14]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Using device:", device)

# Local directory where the downloaded checkpoint files are cached.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
cache_dir = r"D:\Vijay\NYU\Spring_25\BDMLS\Project\code\image_embedding_vit\clip_cache"

# One checkpoint id shared by the model and the tokenizer so the two cannot drift apart.
model_name = "openai/clip-vit-base-patch32"

# Place the model explicitly on `device` rather than via device_map='auto':
# the placement is then decided by the same variable that is printed above and
# used for the inputs, and loading no longer depends on automatic device mapping.
model = CLIPTextModel.from_pretrained(model_name, cache_dir=cache_dir).to(device)

# Despite its name, `processor` is the text tokenizer for the checkpoint.
processor = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

Using device: cuda


In [16]:
# Tokenize every sentence in a single batch: pad to the longest sequence in the
# batch and truncate sequences that exceed the tokenizer's maximum length.
sentences = df['sent'].tolist()

# Move the token tensors to the device the model's weights actually live on,
# instead of assuming it matches a separately computed device string.
inputs = processor(sentences, padding=True, truncation=True, return_tensors="pt").to(model.device)

# Inference only: disable gradient tracking, then bring the pooled output back
# to host memory as a NumPy array.
# NOTE(review): `pooler_output` is the text encoder's pooled output. If these
# vectors are meant to be compared with CLIP image embeddings, the projected
# text embeddings may be what is needed instead — confirm.
with torch.no_grad():
    outputs = model(**inputs)
    text_embeddings = outputs.pooler_output.cpu().numpy()

# Row i of the array must correspond to row i of `df`; fail loudly before
# writing a misaligned file.
assert text_embeddings.shape[0] == len(sentences), "embedding count does not match sentence count"

# Save the text embeddings to a file
save_path = r"D:\Vijay\NYU\Spring_25\BDMLS\Project\code\text_embedding"
os.makedirs(save_path, exist_ok=True)

np.save(os.path.join(save_path, 'text_embeddings.npy'), text_embeddings)

In [17]:
# Check the shape of the text embeddings array (one row per input sentence).
text_embeddings.shape

(370, 512)