In [None]:
import subprocess

# Ask nvcc for its version banner and pull the number out of the
# comma-separated field that starts with "release" (its last word).
nvcc_report = subprocess.check_output(["nvcc", "--version"]).decode("UTF-8")
release_fields = [field for field in nvcc_report.split(", ") if field.startswith("release")]
CUDA_version = release_fields[0].split(" ")[-1]
print("CUDA version:", CUDA_version)

# Wheel suffix for each explicitly handled CUDA release; 10.2 uses no suffix
# and every other release falls back to the CUDA 11.0 build.
_WHEEL_SUFFIX_BY_CUDA = {
    "10.0": "+cu100",
    "10.1": "+cu101",
    "10.2": "",
}
torch_version_suffix = _WHEEL_SUFFIX_BY_CUDA.get(CUDA_version, "+cu110")

In [None]:
! pip install torch==1.7.1{torch_version_suffix} torchvision==0.8.2{torch_version_suffix} -f https://download.pytorch.org/whl/torch_stable.html ftfy regex

In [None]:
import numpy as np
import torch

print("Torch version:", torch.__version__)

In [None]:
!pip install exifread

In [None]:
! pip install imagehash

In [None]:
import imagehash

In [None]:
# Download URL of each available model file, keyed by architecture name.
MODELS = {
    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
}

In [None]:
! wget {MODELS["ViT-B/32"]} -O model.pt

In [None]:
# Load the downloaded TorchScript archive, move it to the GPU and put it in eval mode.
model = torch.jit.load("model.pt")
model = model.cuda().eval()

# Scalar settings stored on the model, converted to plain Python numbers.
input_resolution = model.input_resolution.item()
context_length = model.context_length.item()
vocab_size = model.vocab_size.item()

# Total number of elements across all parameter tensors.
parameter_count = np.sum([int(np.prod(p.shape)) for p in model.parameters()])

for label, value in (
    ("Model parameters:", f"{parameter_count:,}"),
    ("Input resolution:", input_resolution),
    ("Context length:", context_length),
    ("Vocab size:", vocab_size),
):
    print(label, value)

In [None]:
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from PIL import Image

# Image pipeline: bicubic resize to the model's input resolution, centre crop
# to a square of that size, then conversion to a tensor. Normalisation is not
# part of this pipeline.
_preprocess_steps = [
    Resize(input_resolution, interpolation=Image.BICUBIC),
    CenterCrop(input_resolution),
    ToTensor(),
]
preprocess = Compose(_preprocess_steps)

# Per-channel mean and standard deviation, kept on the GPU.
image_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073]).cuda()
image_std = torch.tensor([0.26862954, 0.26130258, 0.27577711]).cuda()

In [None]:
! pip install ftfy regex
! wget https://openaipublic.azureedge.net/clip/bpe_simple_vocab_16e6.txt.gz -O bpe_simple_vocab_16e6.txt.gz

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@title

import gzip
import html
import os
from functools import lru_cache

import ftfy
import regex as re


@lru_cache()
def bytes_to_unicode():
    """
    Build the byte -> printable-character table used by the BPE code.

    Returns a dict with one entry for each of the 256 byte values. Bytes in
    the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with the
    same code point. Every other byte (whitespace, control characters, ...)
    is mapped, in ascending byte order, to chr(256), chr(257), ... so that no
    byte is ever represented by a character the BPE code cannot handle.

    The dict's insertion order is significant: the directly mapped bytes come
    first, followed by the remapped ones.
    """
    printable = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    directly_mapped = set(printable)
    remapped = [byte for byte in range(2**8) if byte not in directly_mapped]

    byte_values = printable + remapped
    code_points = printable + [2**8 + offset for offset in range(len(remapped))]
    return {byte: chr(code_point) for byte, code_point in zip(byte_values, code_points)}


def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    Args:
        word: sequence of symbols (symbols being variable-length strings),
            typically a tuple.

    Returns:
        A set of ``(left, right)`` tuples, one for every pair of neighbouring
        symbols. A word with fewer than two symbols, including an empty one,
        yields an empty set instead of raising ``IndexError``.
    """
    # Pairing the word with itself shifted by one walks all neighbours and
    # naturally produces nothing for empty or single-symbol words.
    return set(zip(word, word[1:]))


def basic_clean(text):
    """Fix the text with ftfy, undo HTML escaping and trim surrounding whitespace."""
    repaired = ftfy.fix_text(text)
    # Unescape twice so that doubly escaped entities are fully decoded.
    unescaped = html.unescape(html.unescape(repaired))
    return unescaped.strip()


def whitespace_clean(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()


class SimpleTokenizer(object):
    """Byte-level BPE tokenizer built from a gzip-compressed merges file.

    Attributes set by ``__init__``:
        byte_encoder / byte_decoder: byte value <-> printable character tables.
        encoder / decoder: token string <-> integer id tables.
        bpe_ranks: merge pair -> rank (lower rank is merged first).
        cache: memo of tokens that have already been segmented.
        pat: compiled pattern that splits cleaned text into coarse tokens.
    """

    def __init__(self, bpe_path: str = "bpe_simple_vocab_16e6.txt.gz"):
        """Read the merge rules from ``bpe_path`` and build the lookup tables."""
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # Context manager so the gzip handle is closed as soon as the file is read.
        with gzip.open(bpe_path) as bpe_file:
            merges = bpe_file.read().decode("utf-8").split('\n')
        # Line 0 is skipped (presumably a header - verify against the file) and
        # the next 49152-256-2 = 48,894 lines are kept as merge rules.
        merges = merges[1:49152-256-2+1]
        merges = [tuple(merge.split()) for merge in merges]
        # Vocabulary order defines the token ids: the 256 byte symbols, the same
        # symbols with the end-of-word marker, one entry per merge, then the
        # two special tokens.
        vocab = list(self.byte_encoder.values())
        vocab = vocab + [v+'</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        # Pre-seeding the cache makes bpe() return the special tokens unchanged.
        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
        self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)

    def bpe(self, token):
        """Return the BPE segmentation of ``token`` as a space-separated string.

        The last symbol carries the ``</w>`` end-of-word marker. Results for
        tokens that go through the merge loop are memoised in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
        pairs = get_pairs(word)

        # A single-symbol word has nothing to merge.
        if not pairs:
            return token+'</w>'

        while True:
            # Pick the adjacent pair with the lowest rank; unknown pairs rank as infinity.
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    # tuple.index raises ValueError when `first` no longer
                    # occurs; copy the remainder unchanged.
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Clean and lower-case ``text`` and return its list of token ids."""
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            # Re-express the token's UTF-8 bytes with the printable byte alphabet.
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Turn token ids back into text; ``</w>`` markers become spaces.

        Byte sequences that are not valid UTF-8 are decoded with
        ``errors="replace"``.
        """
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
        return text


In [None]:
import exifread

In [None]:
import os
from PIL import Image

# Scan the dataset folder, drop perceptual duplicates and preprocess every
# unique picture. The lists below are index-aligned: entry i of each one
# describes the same image.
images = []                 # preprocessed tensors
not_proc_images = []        # matching PIL images in RGB mode
file_names = []             # base file names
dates = []                  # "YYYY-MM-DD" from EXIF, or "no_date"
hashes = {}                 # average hash -> first image seen with that hash
duplicates = []             # PIL images whose hash was already seen
duplicates_file_names = []  # base file names of those duplicates
for root, __, files in os.walk("/content/drive/MyDrive/dataset"):
    for f in files:
        # Accept .jpg / .jpeg regardless of letter case (e.g. camera files named *.JPG).
        if not f.lower().endswith((".jpg", ".jpeg")):
            continue
        image_path = os.path.join(root, f)
        # Converting to RGB loads the pixels (so the file can be closed right
        # away) and guarantees three channels; without it grayscale or CMYK
        # files would give tensors that cannot be stacked with RGB ones.
        with Image.open(image_path) as raw_image:
            im = raw_image.convert("RGB")
        temp_hash = imagehash.average_hash(im, 8)
        if temp_hash in hashes:
            duplicates.append(im)
            duplicates_file_names.append(f)
            continue
        file_names.append(f)
        hashes[temp_hash] = im
        with open(image_path, "rb") as file:
            tags = exifread.process_file(file, details=False, stop_tag="DateTimeOriginal")
        try:
            # Keep the first ten characters of the tag and swap ':' for '-'.
            date_path = str(tags["EXIF DateTimeOriginal"])[:10].replace(":", "-")
        except KeyError:
            # The file has no capture-date tag.
            date_path = "no_date"
        dates.append(date_path)
        not_proc_images.append(im)
        images.append(preprocess(im))

In [None]:
duplicates_file_names

In [None]:
len(file_names)

In [None]:
# Candidate captions each image is compared against.
# NOTE(review): "This is photo of a meme" lacks the article the other prompts
# have; left untouched because changing a prompt changes the model input.
text_descriptions = [
    "This is a photo of a cat",
    "This is a photo of a dog",
    "This is a selfie",
    "This is a photo of a group of people",
    "This is a photo of nature",
    "This is photo of a meme",
    "This is a photo of food",
    "This is a photo of notes",
    "This is a photo of clothes",
    "This is a photo of a car",
]

In [None]:
# Stack the preprocessed images into one GPU batch, then normalise every
# channel in place: subtract the mean, divide by the standard deviation.
image_input = (
    torch.tensor(np.stack(images))
    .cuda()
    .sub_(image_mean[:, None, None])
    .div_(image_std[:, None, None])
)

In [None]:
tokenizer = SimpleTokenizer()
sot_token = tokenizer.encoder['<|startoftext|>']
eot_token = tokenizer.encoder['<|endoftext|>']

# Wrap every encoded description in the start/end markers.
text_tokens = []
for desc in text_descriptions:
    text_tokens.append([sot_token, *tokenizer.encode(desc), eot_token])

# One zero-padded row of token ids per description.
text_input = torch.zeros(len(text_tokens), model.context_length, dtype=torch.long)
for row, tokens in enumerate(text_tokens):
    text_input[row, :len(tokens)] = torch.tensor(tokens)

text_input = text_input.cuda()
text_input.shape

In [None]:
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    # Encode both modalities and scale every embedding to unit length.
    image_features = model.encode_image(image_input).float()
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = model.encode_text(text_input).float()
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # Scaled image/text similarities, turned into a distribution over the prompts.
    similarity = 100.0 * image_features @ text_features.T
    text_probs = similarity.softmax(dim=-1)

    # Three most probable prompts per image, on the CPU.
    top_probs, top_labels = torch.topk(text_probs.cpu(), 3, dim=-1)

In [None]:
import matplotlib.pyplot as plt

# Results

In [None]:
import cv2

In [None]:
import os

In [None]:
# Write every unique image to results/<label>/date=<date>/<file name>.
for original_image, original_name, date, labels in zip(not_proc_images, file_names, dates, top_labels):
    # topk returns its results best-first, so column 0 is the most probable
    # prompt; the prompt's last word is used as the folder name.
    folder_name = text_descriptions[int(labels[0])].split(' ')[-1]
    path = f"/content/drive/MyDrive/results/{folder_name}/date={date}/"
    os.makedirs(path, exist_ok=True)
    file_name = f"{path}{original_name}"
    # OpenCV expects BGR channel order; convert("RGB") guarantees three
    # channels even if the stored image is grayscale or palette based.
    bgr_pixels = cv2.cvtColor(np.array(original_image.convert("RGB")), cv2.COLOR_RGB2BGR)
    # cv2.imwrite signals failure through its return value rather than an exception.
    if not cv2.imwrite(file_name, bgr_pixels):
        print("Could not write", file_name)

In [None]:
# Write the images flagged as duplicates to results/duplicates/<file name>.
path = "/content/drive/MyDrive/results/duplicates"
if duplicates:
    # Create the folder once, and only when there is something to store in it.
    os.makedirs(path, exist_ok=True)
for duplicate_name, duplicate_image in zip(duplicates_file_names, duplicates):
    file_name = os.path.join(path, duplicate_name)
    # OpenCV expects BGR channel order; convert("RGB") guarantees three channels.
    bgr_pixels = cv2.cvtColor(np.array(duplicate_image.convert("RGB")), cv2.COLOR_RGB2BGR)
    # cv2.imwrite signals failure through its return value rather than an exception.
    if not cv2.imwrite(file_name, bgr_pixels):
        print("Could not write", file_name)

In [None]:
# Each image takes two neighbouring panels: the picture and a bar chart of
# its top predictions. The grid keeps the original 40 rows for small sets and
# grows beyond that, because plt.subplot rejects an index larger than
# rows * columns (previously any run with more than 200 images failed).
n_cols = 10
n_rows = max(40, (2 * len(images) + n_cols - 1) // n_cols)

plt.figure(figsize=(50, 70))

for i, image in enumerate(images):
    plt.subplot(n_rows, n_cols, 2 * i + 1)
    # Tensors are channel-first; imshow wants the channel axis last.
    plt.imshow(image.permute(1, 2, 0))
    plt.axis("off")

    plt.subplot(n_rows, n_cols, 2 * i + 2)
    y = np.arange(top_probs.shape[-1])
    plt.grid()
    plt.barh(y, top_probs[i])
    # Put the most probable label at the top of the chart.
    plt.gca().invert_yaxis()
    plt.gca().set_axisbelow(True)
    # Label each bar with the last word of its prompt.
    plt.yticks(y, [text_descriptions[index].split(' ')[-1] for index in top_labels[i].numpy()])

plt.subplots_adjust(wspace=0.5)
plt.show()

In [None]:
!pip install flask_ngrok
!pip install flask_restplus
!pip install exifread

In [None]:
def _write_rgb_image(pil_image, destination):
    """Save an RGB PIL image to ``destination`` with OpenCV and report a failed write."""
    bgr_pixels = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    # cv2.imwrite signals failure through its return value rather than an exception.
    if not cv2.imwrite(destination, bgr_pixels):
        print("Could not write", destination)


def process_files(upload_dir="/content/uploaded", results_dir="/content/drive/MyDrive/results"):
    """Classify the uploaded JPEG files and sort them into result folders.

    Walks ``upload_dir``, sets aside images whose average hash was already
    seen, scores each remaining image against a fixed list of prompts with
    the global ``model`` and writes it to
    ``<results_dir>/<label>/date=<date>/<file name>``. Duplicates go to
    ``<results_dir>/duplicates``.

    Relies on notebook globals defined in other cells: ``model``,
    ``preprocess``, ``image_mean``, ``image_std`` and ``SimpleTokenizer``.

    Args:
        upload_dir: folder scanned recursively for ``.jpg`` / ``.jpeg`` files.
        results_dir: root folder that receives the sorted copies.

    Returns:
        None. Nothing is written when no image is found.
    """
    images = []
    not_proc_images = []
    file_names = []
    dates = []
    hashes = set()
    duplicates = []
    duplicates_file_names = []
    for root, __, files in os.walk(upload_dir):
        for f in files:
            # Accept .jpg / .jpeg regardless of letter case.
            if not f.lower().endswith((".jpg", ".jpeg")):
                continue
            image_path = os.path.join(root, f)
            # Converting to RGB loads the pixels (so the file can be closed)
            # and guarantees three channels for stacking and for OpenCV.
            with Image.open(image_path) as raw_image:
                im = raw_image.convert("RGB")
            temp_hash = imagehash.average_hash(im, 8)
            if temp_hash in hashes:
                duplicates.append(im)
                duplicates_file_names.append(f)
                continue
            hashes.add(temp_hash)
            file_names.append(f)
            with open(image_path, "rb") as file:
                tags = exifread.process_file(file, details=False, stop_tag="DateTimeOriginal")
            try:
                date_path = str(tags["EXIF DateTimeOriginal"])[:10].replace(":", "-")
            except KeyError:
                # The file has no capture-date tag.
                date_path = "no_date"
            dates.append(date_path)
            not_proc_images.append(im)
            images.append(preprocess(im))

    # Without images there are no duplicates either, and stacking an empty
    # list would raise, so stop here.
    if not images:
        return

    text_descriptions = [
        "This is a photo of a cat",
        "This is a photo of a dog",
        "This is a selfie",
        "This is a photo of a group of people",
        "This is a photo of nature",
        "This is photo of a meme",
        "This is a photo of food",
        "This is a photo of notes",
        "This is a photo of clothes",
        "This is a photo of a car",
    ]

    # Batch the images on the GPU and normalise each channel in place.
    image_input = torch.tensor(np.stack(images)).cuda()
    image_input -= image_mean[:, None, None]
    image_input /= image_std[:, None, None]

    # Tokenise the prompts into zero-padded rows of token ids.
    tokenizer = SimpleTokenizer()
    sot_token = tokenizer.encoder['<|startoftext|>']
    eot_token = tokenizer.encoder['<|endoftext|>']
    text_tokens = [[sot_token] + tokenizer.encode(desc) + [eot_token] for desc in text_descriptions]
    text_input = torch.zeros(len(text_tokens), model.context_length, dtype=torch.long)
    for i, tokens in enumerate(text_tokens):
        text_input[i, :len(tokens)] = torch.tensor(tokens)
    text_input = text_input.cuda()

    with torch.no_grad():
        image_features = model.encode_image(image_input).float()
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features = model.encode_text(text_input).float()
        text_features /= text_features.norm(dim=-1, keepdim=True)
        text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        top_probs, top_labels = text_probs.cpu().topk(3, dim=-1)

    for i in range(len(images)):
        # topk returns its results best-first; the last word of the best
        # prompt is used as the folder name.
        folder_name = text_descriptions[int(top_labels[i][0])].split(' ')[-1]
        path = os.path.join(results_dir, folder_name, f"date={dates[i]}")
        os.makedirs(path, exist_ok=True)
        _write_rgb_image(not_proc_images[i], os.path.join(path, file_names[i]))

    if duplicates:
        duplicates_dir = os.path.join(results_dir, "duplicates")
        os.makedirs(duplicates_dir, exist_ok=True)
        for duplicate_name, duplicate_image in zip(duplicates_file_names, duplicates):
            _write_rgb_image(duplicate_image, os.path.join(duplicates_dir, duplicate_name))

In [None]:
from flask import Flask
from flask_ngrok import run_with_ngrok
import os

app = Flask(__name__)
# NOTE(review): presumably makes app.run() also open an ngrok tunnel - confirm
# against the flask_ngrok documentation.
run_with_ngrok(app)
# Do not ship a fixed signing key in the notebook: take it from the
# FLASK_SECRET_KEY environment variable, or generate a random one per run.
app.secret_key = os.environ.get("FLASK_SECRET_KEY") or os.urandom(24).hex()

import werkzeug
werkzeug.cached_property = werkzeug.utils.cached_property
from flask_restplus import Api, Resource
from werkzeug.datastructures import FileStorage

import os

api = Api(app)
upload_parser = api.parser()
# `required=True` makes the parser reject a request without the upload with a
# 400 response instead of letting the handler fail on `None.save(...)`.
upload_parser.add_argument('file',
                           location='files',
                           type=FileStorage,
                           required=True,
                           help='Archive with the images to sort')


@api.route('/upload/')
@api.expect(upload_parser)
class File(Resource):
    """Upload endpoint: stores the archive, unpacks it and sorts its images."""

    def post(self):
        """Save the uploaded archive, then run the unpack and sort steps.

        Returns:
            A small JSON-serialisable dict confirming that processing ran.
        """
        args = upload_parser.parse_args()
        file = args.get('file')
        file.save('/content/zip_uploaded.zip')
        # Both helpers are defined in other notebook cells and must exist
        # before the first request arrives.
        preprocessing()
        process_files()
        return {'status': 'processed'}
                        ##HERE

In [None]:
import shutil
import os
#unzip
def preprocessing(archive_path='/content/zip_uploaded.zip', extract_dir='/content'):
    """Unpack the uploaded archive.

    Args:
        archive_path: archive to extract; defaults to the location the upload
            endpoint saves to.
        extract_dir: directory the archive is extracted into.

    NOTE(review): files left in ``extract_dir`` by an earlier upload are not
    removed - confirm whether that is intended.
    """
    shutil.unpack_archive(archive_path, extract_dir)

In [None]:
# Entry point: start the Flask application.
app.run()