# Data Prep

Code authored by: Shaw Talebi 

[Blog link](https://medium.com/towards-data-science/multimodal-rag-process-any-file-type-with-ai-e6921342c903) 
| [Video link](https://youtu.be/Y7pNmocrmi8)

### Imports

In [1]:
from bs4 import BeautifulSoup
import os
from functions import *

from PIL import Image
from transformers import CLIPProcessor, CLIPModel

from torch import cat, save

### Extract text and images

In [2]:
# Get all HTML files from raw directory.
# os.listdir() returns every entry in arbitrary order, so keep only the
# HTML files and sort them: this makes the order of the extracted content
# (and therefore of the embedding rows computed from it) reproducible.
filename_list = sorted(
    "raw/" + f
    for f in os.listdir('raw')
    if f.lower().endswith(('.html', '.htm'))
)

text_content_list = []
image_content_list = []
for filename in filename_list:

    with open(filename, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # both parsers are not defined in this notebook (presumably they come
    # from the `functions` star import); each result is appended item by item
    text_content_list.extend(parse_html_content(html_content))
    image_content_list.extend(parse_html_images(html_content))

In [3]:
# number of extracted text items, then number of extracted image items
print(len(text_content_list), len(image_content_list), sep="\n")

86
17


In [4]:
# Character budget for each text item. This counts characters, not tokens;
# tokenization is done later by the CLIP processor.
max_chars = 256

text_list = []
for content in text_content_list:
    # prefix the paragraph with its section header
    section = content['section'] + ": "
    # Slice the combined string instead of computing the paragraph budget as
    # 256 - len(section): that difference goes negative when the header alone
    # is longer than the budget, and a negative slice end would keep almost
    # the whole paragraph rather than truncating it.
    text_list.append((section + content['text'])[:max_chars])

# open every extracted image from the path recorded in its content entry
image_list = [Image.open(content['image_path']) for content in image_content_list]

In [5]:
# number of prepared text strings, then number of opened images
print(len(text_list), len(image_list), sep="\n")

86
17


### Compute embeddings using CLIP

In [6]:
# The model and its processor (text tokenization + image preprocessing) are
# both loaded from the same pretrained checkpoint.
clip_checkpoint = "openai/clip-vit-base-patch16"

model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

In [7]:
# pre-process text and images
# truncation=True clips any text whose token count exceeds the tokenizer's
# maximum length. The 256-character cap applied when building text_list
# bounds characters, not tokens, so without truncation an over-long input
# could reach the model and fail.
inputs = processor(
    text=text_list,
    images=image_list,
    return_tensors="pt",
    padding=True,
    truncation=True,
)

In [8]:
# compute embeddings with CLIP
# Inference only: run the forward pass with autograd disabled so no graph is
# built and the resulting embeddings do not carry requires_grad when saved.
from torch import no_grad

with no_grad():
    outputs = model(**inputs)

In [9]:
# pull the text and image embeddings out of the model output
text_embeddings, image_embeddings = outputs.text_embeds, outputs.image_embeds

In [10]:
# shape of the text embeddings, then shape of the image embeddings
print(text_embeddings.shape, image_embeddings.shape, sep="\n")

torch.Size([86, 512])
torch.Size([17, 512])


### Save Data

In [11]:
# save content lists as JSON
# Make sure the output directory exists first: save_to_json is not defined in
# this notebook, so whether it creates missing directories cannot be relied on.
os.makedirs('data', exist_ok=True)
save_to_json(text_content_list, output_file='data/text_content.json')
save_to_json(image_content_list, output_file='data/image_content.json')

In [12]:
# write each embedding tensor to its own .pt file
for tensor, path in (
    (text_embeddings, 'data/text_embeddings.pt'),
    (image_embeddings, 'data/image_embeddings.pt'),
):
    save(tensor, path)