# ***Check GPU connection***

In [None]:
# Smoke test: printing a fixed string confirms the kernel executes cells.
print("oke nice")

In [None]:
import torch

# Report the installed PyTorch build and whether CUDA is usable.
print(torch.__version__)
print(torch.cuda.is_available())

# Asking for a device name without a visible CUDA device raises, so only
# query it when CUDA is available and print an explicit message otherwise.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device visible - enable a GPU accelerator for this session.")

In [None]:
# Shell out to nvidia-smi to show the driver's view of the GPU(s).
!nvidia-smi

# ***Content***

## Clone dataset 

In [None]:
# gdown is used further down (drive_down) to fetch files from Google Drive.
!pip install gdown

In [None]:
# Clone the dataset repository into the current working directory (creates ./data-temp).
!git clone https://github.com/QuanHoangNgoc/data-temp.git

## Install

In [None]:
# Quiet install of the FAISS build that is imported as `faiss` in the next cell.
!pip -q install faiss-gpu

In [None]:
import os
from PIL import Image
import numpy as np
import faiss
# from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import random

## Check dataset and create image_files 

In [None]:
def get_imagelist(root_folder, glob="jpg"):
    """Recursively list files under ``root_folder`` whose extension equals ``glob``.

    The listing is fully deterministic: directories are visited in sorted
    order of their path and, within each directory, file names are sorted too.
    (Previously only the directory order was sorted, so the order of files
    inside a directory depended on the file system.)

    Args:
        root_folder: Directory to walk.
        glob: Extension to keep, without the leading dot (case-sensitive).

    Returns:
        list[str]: Full paths (``dirpath`` joined with the file name).
    """
    paths = []
    # os.walk gives no ordering guarantee, so sort explicitly by directory path.
    for dirpath, _dirnames, filenames in sorted(os.walk(root_folder), key=lambda entry: entry[0]):
        for file in sorted(filenames):
            fullpath = os.path.join(dirpath, file)
            # Keep only paths whose text after the last "." matches `glob`.
            if fullpath.split(".")[-1] != glob:
                continue
            paths.append(fullpath)
    return paths

In [None]:
image_folder = "/kaggle/working/data-temp"

# Index every .jpg below the dataset folder and show a short summary.
image_files = get_imagelist(image_folder)
print(len(image_files), image_files[:10])

# random.sample raises ValueError when asked for more items than exist,
# so cap the preview size at the number of images actually found.
selected_files = random.sample(image_files, min(5, len(image_files)))

plt.figure(figsize=(20, 5))

for i, file in enumerate(selected_files):
    # Context manager closes the file handle once the image has been drawn.
    with Image.open(file) as image:
        plt.subplot(1, 5, i + 1)
        plt.imshow(image)
        plt.axis("off")

plt.show()

## Create embeddings and meta 

In [None]:
# # create an embedding model
# model = SentenceTransformer('clip-ViT-B-32')

# CHUNK_SIZE = 256

# # process each chunk 
# def process_chunk(chunk):
#     images = []
#     for image_file in chunk: images.append(Image.open(image_file))

#     chunk_embeddings = model.encode(images)
#     return chunk_embeddings


# # create the embeddings 
# embeddings = []
# for i in range(0, len(image_files), CHUNK_SIZE):
#     print(i)
#     chunk = image_files[i:i + CHUNK_SIZE]
#     embeddings.extend(process_chunk(chunk))
    
# embeddings 

In [None]:
import gdown  

def drive_down(file_id, name_output_file):
    """Download a Google Drive file with gdown.

    Args:
        file_id: Drive file id, substituted into the ``uc?id=`` download URL.
        name_output_file: Local path the download is written to.
    """
    gdown.download(
        f'https://drive.google.com/uc?id={file_id}',
        name_output_file,
        quiet=False,  # keep gdown's progress output visible
    )

### Download embeddings

In [None]:
# Fetch the stored embedding matrix from Google Drive and load it with NumPy.
file_id = '1XdR4P7RyK68wGruH1Fw2GNyiNpRHYLpT' 
drive_down(file_id, "emb.npy")
embeddings = np.load('emb.npy')
# Last expression of the cell: displays the array shape.
embeddings.shape 

### Download metadata

In [None]:
import pickle

# Fetch the pickled metadata from Google Drive and load it.
file_id = '1B-QZqlyoLW8oc4lyi6_tnV7MoUDA84Z-'
drive_down(file_id, "meta.pkl")
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only run this on a download you trust.
with open('meta.pkl', 'rb') as file:
    meta = pickle.load(file)
# `meta` is later indexed by position (meta[i]) and enumerated to map each
# entry to its index, so it is presumably a list of file names - confirm.
print(type(meta), len(meta), meta[:10])

## Create database 

In [None]:
# build vector db 
def create_database(embeddings):
    """Build an inner-product FAISS index whose ids are the row positions.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dimension).

    Returns:
        A ``faiss.IndexIDMap`` wrapping ``faiss.IndexFlatIP`` in which row
        ``i`` of ``embeddings`` is stored under id ``i``.

    Raises:
        ValueError: If ``embeddings`` is empty or not 2-D.
    """
    # FAISS expects a C-contiguous float32 matrix.
    matrix = np.ascontiguousarray(embeddings, dtype='float32')
    if matrix.ndim != 2 or matrix.shape[0] == 0:
        raise ValueError(
            f"embeddings must be a non-empty 2-D array, got shape {matrix.shape}"
        )

    n_vectors, dimension = matrix.shape
    db = faiss.IndexIDMap(faiss.IndexFlatIP(dimension))

    # add_with_ids requires 64-bit integer ids; NumPy's default integer
    # width is platform dependent, so request int64 explicitly.
    ids = np.arange(n_vectors, dtype='int64')
    db.add_with_ids(matrix, ids)
    return db

In [None]:
# Build the FAISS index over all loaded embeddings; `db` is the global
# index that the search cells below query.
db = create_database(embeddings)
db 

## Prepare

In [None]:
# Extra packages installed before setting up BEiT-3; sentence-transformers is pinned to 2.2.2.
!pip install sentence-transformers==2.2.2
!pip install torchscale

### Set up the BEiT-3 repo

In [None]:
# Download the zipped BEiT3 code, unpack it into the working directory,
# and list the bundled checkpoints folder to confirm the archive unpacked.
file_id = '1Xf3XDVR59ONPNemQ0URS7-djDGssfXaN'
drive_down(file_id, "beit3.zip")
!unzip beit3.zip 
os.listdir('/kaggle/working/BEiT3/checkpoints')

In [None]:
# Download the checkpoint file that the next cell passes to load_model as "beit3.pth".
file_id = '1e8qULfZLu26e8-wHdtJxCC6wfG72IWMp'
drive_down(file_id, "beit3.pth")

### beit3 encoder model 

In [None]:
# Import the helper module shipped inside the unpacked BEiT3 folder.
import BEiT3.beit3 as module

# Use the GPU when available, otherwise fall back to CPU
# (`torch` comes from the GPU-check cell at the top of the notebook).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = "beit3.pth"
sentencepiece_model = os.path.join('/kaggle/working/BEiT3/checkpoints', 'beit3.spm')
# load_model's result is unpacked into three globals that the encode/search
# functions below pass back into module.encode_image / module.encode_text.
model, transform, tokenizer = module.load_model(device, checkpoint, sentencepiece_model)

### translate 

In [None]:
# Pinned googletrans release; provides the Translator used to translate queries to English.
!pip install googletrans==3.1.0a0

In [None]:
from googletrans import Translator
# Shared translator instance; encode() and search_on_db() call
# translator.translate(..., dest='en') on it.
translator = Translator()

## Process 

In [None]:
from collections import defaultdict  # kept for compatibility; no longer used by this cell

# Map each metadata file name to its row position in the embedding matrix.
# A plain dict is used on purpose: the previous defaultdict(list) returned []
# for unknown names (and inserted them), which hid lookup failures until a
# confusing shape error surfaced later.
index_namefile = {namefile: index for index, namefile in enumerate(meta)}

# float32 copy of the embeddings, used for re-ranking.
vectors = np.array(embeddings).astype('float32')
vectors.shape 

In [None]:
# Number of frame ids covered by one `stt` step in the two conversions below.
# NOTE(review): presumably one keyframe is kept every 25 video frames - confirm.
FRAME_STEP = 25


def stt_to_frameid(stt: int, step: int = FRAME_STEP):
    """Convert a 1-based ordinal ``stt`` into a frame id.

    Args:
        stt: 1-based ordinal (``stt == 1`` maps to frame id 0).
        step: Frame ids per ordinal step; defaults to ``FRAME_STEP`` (25).

    Returns:
        int: ``(stt - 1) * step``.
    """
    return (stt - 1) * step


def frameid_to_stt(frameid: int, step: int = FRAME_STEP):
    """Convert a frame id into its 1-based ordinal ``stt``.

    Frame ids that are not a multiple of ``step`` are floored to the ordinal
    whose frame id precedes them.

    Args:
        frameid: Frame id.
        step: Frame ids per ordinal step; defaults to ``FRAME_STEP`` (25).

    Returns:
        int: ``frameid // step + 1``.
    """
    return (frameid // step) + 1

In [None]:
def encode(component: str):
    """Turn one query component into a ``(1, dim)`` float32 embedding.

    The component is interpreted, in this order, as:

    1. an image path ending in ``.jpg``/``.jpeg``/``.png`` (any letter case),
       encoded with ``module.encode_image``;
    2. a ``"video,frame_id"`` reference whose second part parses as an
       integer, resolved to the stored vector of ``f"{video}_{stt}.jpg"``;
    3. free text, translated to English and encoded with
       ``module.encode_text``.

    Text that merely contains one comma (e.g. ``"a man, walking"``) is
    treated as free text instead of failing on ``int()``.

    Raises:
        KeyError: If a ``"video,frame_id"`` reference is not in
            ``index_namefile``.
    """
    # 1) Image file on disk.
    if component.lower().endswith((".jpg", ".jpeg", ".png")):
        # Close the file handle as soon as the embedding has been computed.
        with Image.open(component) as image:
            emb = module.encode_image(model, transform, image, device)
        return emb.astype("float32").reshape(1, -1)

    # 2) Reference to an already-indexed frame: "video,frame_id".
    parts = component.split(",")
    if len(parts) == 2:
        try:
            frame_id = int(parts[1])
        except ValueError:
            frame_id = None  # not a number -> ordinary text with a comma
        if frame_id is not None:
            video = parts[0].strip()
            namefile = f"{video}_{frameid_to_stt(frame_id)}.jpg"
            # Explicit membership test: a defaultdict would otherwise return
            # (and insert) an empty default instead of failing.
            if namefile not in index_namefile:
                raise KeyError(f"frame reference {component!r} -> {namefile!r} is not indexed")
            return vectors[index_namefile[namefile]].reshape(1, -1)

    # 3) Free text: translate to English, then embed.
    text = translator.translate(component, dest='en').text
    emb = module.encode_text(model, tokenizer, text, device)
    return emb.astype("float32").reshape(1, -1)
    
def norm_n2d(n2d):
    """Return a float32 copy of ``n2d`` with every row scaled to unit L2 norm.

    Rows whose norm is zero are returned as all-zero rows rather than
    producing NaN/inf.
    """
    rows = np.asarray(n2d, dtype='float32')
    lengths = np.linalg.norm(rows, axis=1, keepdims=True)
    unit_rows = np.zeros_like(rows)
    # Divide only where the norm is non-zero; other entries keep the 0 fill.
    np.divide(rows, lengths, out=unit_rows, where=(lengths != 0))
    return unit_rows

In [None]:
def compute_cosines(vectors_norm, emb_norm):
    """Dot each row of ``vectors_norm`` with the flattened ``emb_norm``.

    With unit-length inputs the result is the cosine similarity between
    every row and the query embedding.
    """
    query = emb_norm.reshape(-1)  # flatten (1, dim) to (dim,)
    return np.dot(vectors_norm, query)

def argsort(cosine_similarities):
    """Return the permutation that orders the scores from highest to lowest.

    The result has one index per input score (a full permutation).
    """
    ascending = np.argsort(cosine_similarities)
    # Reverse the ascending order to get best-first.
    return ascending[::-1]

In [None]:
def db_indices_to_files(db_indices, fn_name=">>>db_inds to files\n"):
    """Look up the ``meta`` entry for each database index, keeping order.

    Prints the first five entries prefixed with ``fn_name`` and returns the
    full list.
    """
    files = []
    for position in db_indices:
        files.append(meta[position])
    print(fn_name, files[:5])
    return files

def namefiles_to_resultlist(namefiles, fn_name=">>>to result_list\n"):
    """Convert ``"<video>_<stt>.jpg"`` names into ``[video, frame_id]`` rows.

    The name is split at its LAST underscore, the exact inverse of how
    ``encode`` builds ``f"{video}_{stt}.jpg"``. The video id is therefore
    recovered intact however many underscores it contains (the previous
    version always kept only the first two ``_``-separated parts).

    Args:
        namefiles: Iterable of file names.
        fn_name: Prefix for the debug print of the first five rows.

    Returns:
        list[list]: ``[video, frame_id]`` per input name, in input order.

    Raises:
        ValueError: If a name has no underscore or a non-integer ordinal.
    """
    result = []
    for name in namefiles:
        video, sep, tail = name.rpartition('_')
        if not sep:
            raise ValueError(f"expected '<video>_<stt>.<ext>', got {name!r}")
        # `tail` looks like "148.jpg"; the ordinal is the text before the first dot.
        stt = int(tail.split('.')[0])
        frame_id = stt_to_frameid(stt=stt)
        result.append([video, frame_id])

    print(fn_name, result[:5])
    return result

import csv 
def to_csv(result, path='data.csv'):
    """Write ``result`` (an iterable of rows) to ``path`` as CSV, overwriting it."""
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='') as handle:
        csv.writer(handle).writerows(result)

In [None]:
def search_on_db(des, db, top_k, fn_name=">>>search on db\n"):
    """Translate ``des`` to English, embed it and query ``db`` for neighbours.

    Args:
        des: Text description (any language accepted by the translator).
        db: Index exposing ``search(queries, k)``, e.g. from ``create_database``.
        top_k: Number of neighbours requested; must be positive.
        fn_name: Prefix for the debug prints.

    Returns:
        1-D array of database ids, best match first. It can be shorter than
        ``top_k``: FAISS pads missing results with the label ``-1`` when the
        index holds fewer than ``top_k`` vectors, and those placeholders are
        removed here so they can never be used as (negative) list indices.

    Raises:
        ValueError: If ``top_k`` is not positive.
    """
    top_k = int(top_k)
    if top_k <= 0:
        raise ValueError(f"top_k must be a positive integer, got {top_k}")

    text = translator.translate(des, dest='en').text
    emb = module.encode_text(model, tokenizer, text, device)
    emb = emb.astype("float32").reshape(1, -1)
    print(fn_name, text, emb.shape)

    rels, db_indices = db.search(emb, top_k)
    print(fn_name, rels[0][:5], db_indices[0][:5])

    hits = db_indices[0]
    return hits[hits >= 0]  # drop FAISS "no result" placeholders

In [None]:
def search_two_pharse(des, db, top_k=100, components: tuple=(), top_h=100, path='data.csv'):
    """Two-phase retrieval: database search, optional re-ranking, CSV export.

    Phase 1 fetches the ``top_k`` candidates for ``des`` from ``db``.
    Phase 2 (only when ``components`` is non-empty) re-scores every candidate
    as the product of its cosine similarities to each component embedding
    (see ``encode`` for accepted component formats) and sorts best-first.
    The first ``top_h`` candidates are converted to ``[video, frame_id]``
    rows and written to ``path``.

    Args:
        des: Text description used for the initial database search.
        db: Index passed through to ``search_on_db``.
        top_k: Number of candidates requested in phase 1.
        components: Strings used for re-ranking; empty skips phase 2.
        top_h: Number of rows kept in the output.
        path: Destination CSV file.
    """
    db_indices = np.asarray(search_on_db(des, db, top_k))
    # FAISS pads with -1 when it has fewer than top_k results; used as an
    # index, -1 would silently select the LAST vector / meta entry.
    db_indices = db_indices[db_indices >= 0]

    if len(components) > 0:
        vectors_norm = norm_n2d(vectors[db_indices])
        print(">>>component processing...")
        print("-", vectors_norm.shape)

        # NOTE(review): multiplying cosines means two negative similarities
        # yield a positive score - confirm that is the intended combination.
        scores = None
        for cpn in components:
            emb_norm = norm_n2d(encode(component=cpn))
            print("-", cpn + ":", emb_norm.shape)
            cosines = compute_cosines(vectors_norm=vectors_norm, emb_norm=emb_norm)
            scores = cosines if scores is None else scores * cosines

        per = argsort(scores)
        assert len(per) == len(db_indices)
        db_indices = db_indices[per]
        print("-", per[:5], db_indices[:5], scores[per[:5]])

    db_indices = db_indices[:top_h]
    files = db_indices_to_files(db_indices=db_indices)
    result = namefiles_to_resultlist(namefiles=files)
    to_csv(result=result, path=path)

In [None]:
# Sanity check: fancy-indexing three rows and normalising them should give
# the same shape. Only the last expression's value is displayed by the cell.
vectors[[0, 1, 2]].shape
norm_n2d(vectors[[0, 1, 2]]).shape

In [None]:
# Demo query: search with a Vietnamese description, then re-rank with one
# text component and one "video,frame_id" component.
des = "Một người Viet Nam"
# Take 1% of the collection as candidates, but never fewer than one:
# int(len(meta) * 0.01) is 0 for collections smaller than 100 entries.
top_k = max(1, int(len(meta) * 0.01))
top_h = 100

cpns = ("Viet nam", "L22_V027,3675")

search_two_pharse(des, db, top_k, cpns, top_h=top_h)

## Main script

In [None]:
import shutil, csv

def main_submit(in_folder, out_folder):
    """Run a search for every query file in ``in_folder``, one CSV per file.

    ``out_folder`` is deleted and recreated first. Each regular file in
    ``in_folder`` is read as UTF-8, its newlines are replaced by spaces, and
    the text is passed to ``search_two_pharse``. The result is written to
    ``out_folder`` under the same name with ``.txt`` replaced by ``.csv``.
    Files are processed in sorted name order; subdirectories are skipped.

    Args:
        in_folder: Directory containing the query text files.
        out_folder: Directory that receives the CSV files (recreated).
    """
    if os.path.exists(out_folder):
        shutil.rmtree(out_folder)
    os.makedirs(out_folder, exist_ok=True)
    # "\\/" prints the same "\/" as before without the invalid "\/" escape.
    print(f"-\\/- Remove and renew [{out_folder}]\n")

    # Sorted so the processing order (and the log) is reproducible.
    for namefile in sorted(os.listdir(in_folder)):
        in_path = os.path.join(in_folder, namefile)
        if not os.path.isfile(in_path):
            continue  # open() would fail on a directory

        # Explicit UTF-8: queries may contain non-ASCII (e.g. Vietnamese) text.
        with open(in_path, 'r', encoding='utf-8') as file:
            content = file.read()

        des = content.replace("\n", " ")
        path = os.path.join(out_folder, namefile.replace(".txt", ".csv"))

        search_two_pharse(des, db, top_k=100, top_h=100, path=path)
        print("\n")

In [None]:
# Answer every query file in the input pack, writing one CSV per query.
in_folder = '/kaggle/input/test-dot-2/pack2-groupA'
out_folder = '/kaggle/working/submit'
main_submit(in_folder, out_folder)

# Zip the result folder; the archive is named after the folder plus ".zip".
folder_to_compress = out_folder
output_zip_file = f"{out_folder}.zip"

# make_archive appends the extension itself, so pass the name without ".zip".
archive_base = output_zip_file.replace('.zip', '')
shutil.make_archive(archive_base, 'zip', folder_to_compress)