In [None]:
!pip install -q datasets -U
!pip install -q -U google-generativeai

In [None]:
from kaggle_secrets import UserSecretsClient
# Fetch both credentials through the kaggle_secrets client in a single pass;
# each secret label is the same as the variable name it is bound to.
user_secrets = UserSecretsClient()
API_key, HF_token = (
    user_secrets.get_secret(label) for label in ("API_key", "HF_token")
)

In [None]:
import google.generativeai as genai

In [None]:
# Authenticate the Gemini client with the key fetched from the secret store.
genai.configure(api_key=API_key)

# Store the Hugging Face token from inside the kernel process. Interpolating it
# into a shell command would put the secret on a command line and would break
# on tokens containing quote characters.
from huggingface_hub.hf_api import HfFolder
HfFolder.save_token(HF_token)

In [None]:
import google.generativeai as genai
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from datasets import Dataset
import copy

In [None]:
import time

def run_query(query, 
              max_output_tokens=16000, 
              temperature=0.5,
              max_attempts=3,
              retry_delay=10):
    """Send ``query`` to the ``gemini-pro`` model and return the reply text.

    Every attempt uses a fresh chat session with an empty history, so a failed
    attempt cannot influence the next one.

    Args:
        query: Prompt sent as the single message of the chat.
        max_output_tokens: Forwarded to ``GenerationConfig``.
        temperature: Forwarded to ``GenerationConfig``.
        max_attempts: Total number of tries before giving up (must be >= 1).
        retry_delay: Seconds to sleep between a failed try and the next one.

    Returns:
        The ``text`` attribute of the model response.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        Exception: Whatever the last attempt raised once all attempts failed.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    model = genai.GenerativeModel('gemini-pro')
    # All four harm categories are set to 'block_none', as in every call this
    # notebook makes.
    safety_settings = {
        'HARM_CATEGORY_HARASSMENT': 'block_none',
        'HARM_CATEGORY_HATE_SPEECH': 'block_none',
        'HARM_CATEGORY_SEXUALLY_EXPLICIT': 'block_none',
        'HARM_CATEGORY_DANGEROUS_CONTENT': 'block_none'
    }
    generation_config = genai.types.GenerationConfig(
        candidate_count=1,
        max_output_tokens=max_output_tokens,
        temperature=temperature)

    for attempt in range(1, max_attempts + 1):
        try:
            chat = model.start_chat(history=[])
            response = chat.send_message(
                query,
                safety_settings=safety_settings,
                generation_config=generation_config
            )
            # Reading .text stays inside the try so that a response without
            # usable text is retried like any other failure.
            return response.text
        except Exception as e:
            print(e)
            if attempt == max_attempts:
                # Out of attempts: surface the real error to the caller rather
                # than falling through with no response to return.
                raise
            time.sleep(retry_delay)

In [None]:
def main(conversations=None):
  """Translate every message of every conversation from English to Vietnamese.

  Args:
      conversations: Iterable of conversations, each a sequence of dicts with a
          'value' entry holding the text to translate. When omitted, the
          module-level ``processing_vi_conversations`` is used.

  Returns:
      A list with one entry per conversation, each entry being the list of
      translated strings in message order. The result is all-or-nothing: if any
      translation raises, the error is printed and an empty list is returned so
      that the caller never receives a partially translated batch.
  """
  if conversations is None:
    conversations = processing_vi_conversations

  # Configuring the client does not depend on the conversation, so do it once.
  genai.configure(api_key=API_key)

  list_of_conversations = []
  for conv in tqdm(conversations):
    msgs = [x['value'] for x in conv]

    list_of_prompts = [
      (
        "Please translate the following English string to Vietnamese, preserving the same format, output just the string:\n"
        f"    English string: {msg}\n"
        "    Translate to Vietnamese:"
      )
      for msg in msgs
    ]

    try:
      translated = [run_query(prompt) for prompt in list_of_prompts]
    except Exception as e:
      # Report the failure, then signal it to the caller with an empty result.
      print(f"Translation failed, discarding this batch: {e!r}")
      return []

    list_of_conversations.append(translated)

  return list_of_conversations


In [None]:
# Number of conversations handled per iteration of the batch loop.
steps = 100
en_dataset = load_dataset("Lin-Chen/ShareGPT4V", 'ShareGPT4V')
vi_dataset =  load_dataset("Oztobuzz/Processed_Vi_ShareGPT4V", 'default')

# Ids already present in the Vietnamese dataset. Membership is tested once per
# English row, so a set is used for the lookup instead of scanning the list each
# time (assumes ids are hashable, e.g. strings -- TODO confirm).
existed_vi_ids = vi_dataset["train"]['id']
existed_vi_id_set = set(existed_vi_ids)
not_existed_vi_conversations = [x for x in en_dataset["train"] if x['id'] not in existed_vi_id_set]
print(f'number of not_existed_vi_conversations: {len(not_existed_vi_conversations)}')
# Only the first 4500 not-yet-translated rows (starting from index 0) are kept.
not_existed_vi_conversations = not_existed_vi_conversations[0:4500]
print(f'number of not_existed_vi_conversations: {len(not_existed_vi_conversations)}')

# Column-wise buffer of finished rows waiting to be uploaded.
dataset_dict = {"id": [], "image": [], "en_conversations": [], "vi_conversations": []}

# Translate the pending conversations in slices of `steps` rows and upload each
# translated slice to the Hub under a config named after its first id.
for k in range(0, len(not_existed_vi_conversations), steps):
    print(f"Processing from step {k} ...")
    try:
        processing_vi = not_existed_vi_conversations[k:k+steps]
        # main() reads this module-level name. The deep copy keeps the English
        # originals intact when the translations are written into the copies.
        processing_vi_conversations = copy.deepcopy([x['conversations'] for x in processing_vi])
        list_of_conversations = main()
        if not list_of_conversations:
            # main() returns an empty list when the slice could not be translated.
            print(f"Step {k} is error, will move to next step")
            continue

        # Replace the text of every copied message with its translation.
        vi_conversations = []
        for i, conv in enumerate(processing_vi_conversations):
            for j, msg in enumerate(conv):
                msg['value'] = list_of_conversations[i][j]
            vi_conversations.append(conv)

        # Buffer the finished rows column by column.
        for sample, vi_conv in zip(processing_vi, vi_conversations):
            dataset_dict['en_conversations'].append(sample['conversations'])
            dataset_dict['vi_conversations'].append(vi_conv)
            dataset_dict['image'].append(sample['image'])
            dataset_dict['id'].append(sample['id'])

        # Upload whatever is buffered after every slice. Tying the upload to the
        # buffer rather than to a fixed multiple of 100 keeps it in step with
        # `steps` if that value is ever changed.
        if dataset_dict['id']:
            first_id = dataset_dict['id'][0]
            print(f"Pushing {first_id} to HF")
            # NOTE: this rebinds the module-level `vi_dataset` to the uploaded batch.
            vi_dataset = Dataset.from_dict(dataset_dict)
            vi_dataset.push_to_hub("Oztobuzz/Processed_Vi_ShareGPT4V", f'start_from_{first_id}', data_dir=f"data/start_from_{first_id}")
            # The buffer is cleared only after a successful upload, so rows from
            # a failed upload stay buffered and go out with the next one.
            dataset_dict = {"id": [], "image": [], "en_conversations": [], "vi_conversations": []}
    except Exception as e:
        # Best effort: report which slice failed and carry on with the next one.
        print(f"Step {k} failed: {e!r}")