Install dependencies

In [None]:
# Third-party packages used by the cells below.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install transformers pyTelegramBotAPI datasets open-clip-torch fashion-clip

Load PaliGemma

In [None]:
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image
import requests
from huggingface_hub import login
import torch
import telebot
from io import BytesIO
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'


# Log in to the Hugging Face Hub with the token taken from the HF_TOKEN environment variable.
login(token=os.getenv("HF_TOKEN"))

model_id = "google/paligemma-3b-mix-224"
device = "cuda" if torch.cuda.is_available() else "cpu"

# When a local custom_config.json exists it is handed to both loaders as
# `config`; otherwise both are loaded with their defaults for `model_id`.
if os.path.exists("./custom_config.json"):
  config_override = {"config": "./custom_config.json"}
else:
  config_override = {}

model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, **config_override)
if config_override:
  # Show which activation the overriding config selected for the text model.
  print(model.config.text_config.hidden_act)
processor = AutoProcessor.from_pretrained(model_id, **config_override)
model = model.to(device)


PaliGemma-related functions:

In [None]:
def process_image(image):
  """Ask the PaliGemma model to list and describe the clothes shown in `image`.

  Uses the module-level `processor`, `model` and `device`. Generation is
  greedy (`do_sample=False`) and capped at 100 new tokens.

  Returns the decoded answer, or "No clothes" when the decoded answer is empty.
  """
  prompt = "answer en List (and decribe detailed) the clothes on the picture?"

  model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
  prompt_token_count = model_inputs["input_ids"].shape[-1]

  with torch.inference_mode():
    output_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
    # Skip the first `prompt_token_count` positions so that only the tokens
    # after the prompt are decoded.
    answer_ids = output_ids[0][prompt_token_count:]
    answer = processor.decode(answer_ids, skip_special_tokens=True)
    if len(answer) == 0:
      return "No clothes"
    return answer
def analyse_url(url):
  """Download the image at `url` and return its clothing description.

  The image is resized to 448x448 and passed to `process_image`.

  Returns the description, or a fixed fallback message when the download,
  the image decoding or the captioning fails.
  """
  try:
    # A timeout keeps an unresponsive server from blocking the caller forever.
    response = requests.get(url, timeout=10)
    # Treat HTTP error statuses as failures instead of trying to decode an error page.
    response.raise_for_status()
    # response.content has any content-encoding (e.g. gzip) already decoded,
    # which the raw socket stream does not.
    image = Image.open(BytesIO(response.content)).resize((448,448))
    return process_image(image)
  except Exception:
    # Deliberately best-effort, but no longer swallows KeyboardInterrupt/SystemExit.
    return "Unable to get image by url. Try to upload it to bot directly."
def analyse_stream(stream):
  """Read an image from the file-like `stream` and return its clothing description.

  The image is resized to 448x448 and passed to `process_image`.

  Returns the description, or "Unable to get image." when opening, resizing
  or captioning fails.
  """
  try:
    image = Image.open(stream).resize((448,448))
    return process_image(image)
  except Exception:
    # Deliberately best-effort, but no longer swallows KeyboardInterrupt/SystemExit.
    return "Unable to get image."



Fetch database

In [None]:
# gdown provides the Google Drive download command used on the next line.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install gdown
!gdown "1igAuIEW_4h_51BG1o05WS0Q0-Cp17_-t&confirm=t"
# -o overwrites already extracted files without asking, so re-running this
# cell does not stop at unzip's interactive "replace?" question.
!unzip -o data

FashionCLIP operations:

In [None]:
from fashion_clip.fashion_clip import FashionCLIP
import pandas as pd
import numpy as np
from collections import Counter
from PIL import Image
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression

In [None]:
# Create the FashionCLIP wrapper and move its underlying model to the GPU when one is available.
fclip = FashionCLIP('fashion-clip')
fclip_device = "cuda" if torch.cuda.is_available() else "cpu"
fclip.model = fclip.model.to(fclip_device)

In [None]:
articles = pd.read_csv("data_for_fashion_clip/articles.csv")

# Keep a single article per distinct description.
subset = articles.drop_duplicates("detail_desc").copy()

# Discard articles whose product group is "Unknown".
in_unknown_group = subset["product_group_name"].isin(["Unknown"])
subset = subset[~in_unknown_group]

# FashionCLIP has a limit of 77 tokens. Whitespace-separated words are used
# as a cautious stand-in for tokens: only descriptions with more than 4 and
# fewer than 40 words are kept.
word_counts = subset["detail_desc"].apply(lambda description: len(str(description).split()))
subset = subset[(word_counts > 4) & (word_counts < 40)]

# Keep only product types that occur more than 10 times in what is left.
product_type_counts = Counter(subset["product_type_name"].tolist())
most_frequent_product_types = [
  product_type
  for product_type, occurrences in product_type_counts.items()
  if occurrences > 10
]
subset = subset[subset["product_type_name"].isin(most_frequent_product_types)]

# lots of data here, but we will just use only descriptions and a couple of other columns
#subset.head(3)

Process the whole database

In [None]:
# Every article's picture is expected at data_for_fashion_clip/<article_id>.jpg.
images = [f"data_for_fashion_clip/{article_id}.jpg" for article_id in subset["article_id"].tolist()]
texts = subset["detail_desc"].tolist()

# Embed all product pictures and all product descriptions.
image_embeddings = fclip.encode_images(images, batch_size=32)
text_embeddings = fclip.encode_text(texts, batch_size=32)

# Scale each row to unit L2 norm, so that a plain dot product between rows
# equals their cosine similarity.
image_norms = np.linalg.norm(image_embeddings, ord=2, axis=-1, keepdims=True)
text_norms = np.linalg.norm(text_embeddings, ord=2, axis=-1, keepdims=True)
image_embeddings = image_embeddings / image_norms
text_embeddings = text_embeddings / text_norms

In [None]:
def describe(id):
  """Build the caption for the article at integer position `id` of `subset`.

  The caption is the article's `detail_desc` followed by its
  `colour_group_name`. The parameter name shadows the `id` builtin; it is
  kept unchanged for existing callers.
  """
  description, colour = (
    subset[column].iloc[id] for column in ("detail_desc", "colour_group_name")
  )
  return "{} Color: {}.".format(description, colour)

def find_by_embedding(embedding):
  """Find the catalogue article whose image embedding best matches `embedding`.

  Returns a tuple `(image, description, path)`: the article picture scaled
  to a height of 224 pixels, the text from `describe`, and the path of the
  picture file.
  """
  target_height = 224

  # `embedding` is not normalised here; scaling it by a positive factor does
  # not change which row gives the largest dot product.
  scores = embedding.dot(image_embeddings.T)
  best_position = np.argmax(scores)

  article_id = subset["article_id"].iloc[best_position].tolist()
  picture_path = f"data_for_fashion_clip/{article_id}.jpg"

  picture = Image.open(picture_path)
  original_width, original_height = picture.size[0], picture.size[1]
  # Scale the width by the same factor as the height to keep the aspect ratio.
  scale = target_height / float(original_height)
  target_width = int(float(original_width) * float(scale))
  picture = picture.resize((target_width, target_height), Image.NEAREST)

  return picture, describe(best_position), picture_path

def find_by_text(text):
  """Embed `text` with FashionCLIP and return the best matching catalogue article.

  The return value is whatever `find_by_embedding` returns.
  """
  text_embedding = fclip.encode_text([text], batch_size=32)[0]
  return find_by_embedding(text_embedding)

def find_by_image(img):
  """Embed `img` with FashionCLIP and return the best matching catalogue article.

  The return value is whatever `find_by_embedding` returns.
  """
  image_embedding = fclip.encode_images([img], batch_size=32)[0]
  return find_by_embedding(image_embedding)
def find_by_url(url):
  """Download the image at `url` and return the best matching catalogue article.

  The image is resized to 448x448 and passed to `find_by_image`.

  Returns the `find_by_image` result on success. On any failure (invalid
  URL, network error, HTTP error status, undecodable image, ...) it returns
  a fixed message string instead.
  """
  # NOTE(review): `url` is user-supplied text, so this fetches arbitrary
  # addresses on the user's behalf; consider restricting allowed hosts/schemes.
  try:
    # A timeout keeps an unresponsive server from blocking the bot forever.
    response = requests.get(url, timeout=10)
    # Treat HTTP error statuses as failures instead of trying to decode an error page.
    response.raise_for_status()
    # response.content has any content-encoding (e.g. gzip) already decoded,
    # which the raw socket stream does not.
    image = Image.open(BytesIO(response.content)).resize((448,448))
    return find_by_image(image)
  except Exception:
    # Deliberately best-effort, but no longer swallows KeyboardInterrupt/SystemExit.
    return "Unable to get image by url. Try to upload it to bot directly."
def find_by_stream(stream):
  """Read an image from the file-like `stream` and return the best matching catalogue article.

  The image is resized to 448x448 and passed to `find_by_image`.

  Returns the `find_by_image` result on success, or the string
  "Unable to get image." when opening, resizing or searching fails.
  """
  try:
    image = Image.open(stream).resize((448,448))
    return find_by_image(image)
  except Exception:
    # Deliberately best-effort, but no longer swallows KeyboardInterrupt/SystemExit.
    return "Unable to get image."

In [None]:
# Smoke test: search by a text query, feed the picture of that match back
# into the image search, and display the description of the second match.
text_match = find_by_text("black style")
find_by_image(text_match[0])[1]

Run the Telegram bot

In [None]:
# The bot token is read from the TG_TOKEN environment variable.
telegram_token = os.getenv('TG_TOKEN')
bot = telebot.TeleBot(telegram_token)

@bot.message_handler(commands=["start"])
def start(m, res=False):
  """Answer the /start command with a short usage hint.

  `m` is the incoming message; `res` is accepted but not used.
  """
  usage_hint = 'Send a link to the picture or a file with picture for analysis. The bot will provide a description generated from the picture.'
  bot.send_message(m.chat.id, usage_hint)

@bot.message_handler(content_types=["text"])
def handle_text(message):
  """Handle a text message as an image URL, falling back to a text search.

  The text is first tried as a link to a picture. If that fails, the error
  message is sent, followed by a notice that the text is now being treated
  as a description, and the catalogue is searched by text instead. In both
  cases the description and the picture of the matched article are sent.
  """
  chat_id = message.chat.id
  found = find_by_url(message.text)
  # find_by_url returns a plain message string on failure and a tuple on
  # success; testing the type avoids comparing an image object with 'U'.
  if isinstance(found, str):
    bot.send_message(chat_id, found)
    # Russian: "Maybe this is a description rather than a link. Then here is what we found for you:"
    bot.send_message(chat_id, "Может быть, это описание, а не ссылка. Тогда мы нашли вам:")
    found = find_by_text(message.text)
    bot.send_message(chat_id, found[1])
  else:
    bot.send_message(chat_id, found[1])
    # Russian: "Here is what we found for you:"
    bot.send_message(chat_id, "Для вас мы нашли:")

  # Close the picture file once it has been sent instead of leaking the handle.
  with open(found[2], 'rb') as photo_file:
    bot.send_photo(chat_id, photo=photo_file)

@bot.message_handler(content_types=['document', 'photo'])
def handle_file(message):
  """Handle a picture sent either as a document or as a photo.

  Downloads the attachment, searches the catalogue for the best matching
  article, and replies with its description and picture. If the attachment
  cannot be processed, the error message from `find_by_stream` is sent.

  This single handler replaces two identically named functions with
  copy-pasted bodies, the second of which shadowed the first.
  """
  if message.content_type == 'photo':
    # A photo message carries several entries; the last one is used
    # (presumably the largest size; confirm against the Telegram Bot API).
    file_id = message.photo[-1].file_id
  else:
    file_id = message.document.file_id

  file_info = bot.get_file(file_id)
  downloaded_file = bot.download_file(file_info.file_path)
  found = find_by_stream(BytesIO(downloaded_file))

  # find_by_stream returns a plain message string on failure and a tuple on
  # success; testing the type avoids comparing an image object with 'U'.
  if isinstance(found, str):
    bot.send_message(message.chat.id, found)
    return

  bot.send_message(message.chat.id, found[1])
  # Russian: "Here is what we found for you:"
  bot.send_message(message.chat.id,"Для вас мы нашли:")
  # Close the picture file once it has been sent instead of leaking the handle.
  with open(found[2], 'rb') as photo_file:
    bot.send_photo(message.chat.id, photo=photo_file)

# Start long polling; this call blocks while the bot is running.
# `non_stop` is the current name of the parameter formerly spelled `none_stop`,
# which pyTelegramBotAPI keeps only as a deprecated alias.
bot.polling(non_stop=True, interval=0)