In [None]:
# Install necessary libraries
!pip install deep_translator pandas google-cloud-aiplatform google-auth requests

import pandas as pd
from deep_translator import GoogleTranslator
from google.colab import drive
import re
import string
from google.colab import auth
from google.cloud import aiplatform
from vertexai.generative_models import GenerativeModel, Content, Part, GenerationConfig

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Sign this Colab session in to Google Cloud
auth.authenticate_user()

# Google Cloud project and region handed to Vertex AI
PROJECT_ID = 'YOUR-PROJECT_ID'  # Replace with your actual project ID
LOCATION = 'YOUR_LOCATION'    # Replace with the Google Cloud region you want to use
aiplatform.init(project=PROJECT_ID, location=LOCATION)

# Path of the CSV file to translate
input_file_path = 'FILE_PATH'

# Read the CSV and turn every missing cell into an empty string
df = pd.read_csv(input_file_path).fillna('')

# Cap both text columns at 4999 characters per cell
# NOTE(review): presumably chosen to stay below the translator's input limit -- confirm
for text_column in ('human', 'assistant'):
    df[text_column] = df[text_column].map(lambda cell: cell[:4999])

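# Languages translated with deep_translator's GoogleTranslator.
# Each key is passed to the translator as the target language; each value is
# the label written to the 'Language' column of the output.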
supported_languages_deep_translator = {
    "afrikaans": "[Afrikaans] (Afrikaans)",
    "albanian": "[Shqip] (Albanian)",
    "amharic": "[አማርኛ] (Amharic)",
    "arabic": "[العربية] (Arabic)",
    "armenian": "[Հայերեն] (Armenian)",
    "assamese": "[অসমীয়া] (Assamese)",
    "aymara": "[Aymara] (Aymara)",
    "azerbaijani": "[Azərbaycanca] (Azerbaijani)",
    "bambara": "[Bamanankan] (Bambara)",
    "basque": "[Euskara] (Basque)",
    "belarusian": "[Беларуская] (Belarusian)",
    "bengali": "[বাংলা] (Bengali)",
    "bhojpuri": "[भोजपुरी] (Bhojpuri)",
    "bosnian": "[Bosanski] (Bosnian)",
    "bulgarian": "[Български] (Bulgarian)",
    "catalan": "[Català] (Catalan)",
    "cebuano": "[Cebuano] (Cebuano)",
    "chichewa": "[Chinyanja] (Chichewa)",
    "chinese (simplified)": "[简体中文] (Chinese, Simplified)",
    "chinese (traditional)": "[繁體中文] (Chinese, Traditional)",
    "corsican": "[Corsu] (Corsican)",
    "croatian": "[Hrvatski] (Croatian)",
    "czech": "[Čeština] (Czech)",
    "danish": "[Dansk] (Danish)",
    "dhivehi": "[ދިވެހި] (Dhivehi)",
    "dogri": "[डोगरी] (Dogri)",
    "dutch": "[Nederlands] (Dutch)",
    "english": "[English] (English)",
    "esperanto": "[Esperanto] (Esperanto)",
    "estonian": "[Eesti] (Estonian)",
    "ewe": "[Eʋegbe] (Ewe)",
    "filipino": "[Filipino] (Filipino)",
    "finnish": "[Suomi] (Finnish)",
    "french": "[Français] (French)",
    "frisian": "[Frysk] (Frisian)",
    "galician": "[Galego] (Galician)",
    "georgian": "[ქართული] (Georgian)",
    "german": "[Deutsch] (German)",
    "greek": "[Ελληνικά] (Greek)",
    "guarani": "[Avañe'ẽ] (Guarani)",
    "gujarati": "[ગુજરાતી] (Gujarati)",
    "haitian creole": "[Kreyòl Ayisyen] (Haitian Creole)",
    "hausa": "[Hausa] (Hausa)",
    "hawaiian": "[ʻŌlelo Hawaiʻi] (Hawaiian)",
    "hebrew": "[עברית] (Hebrew)",
    "hindi": "[हिन्दी] (Hindi)",
    "hmong": "[Hmoob] (Hmong)",
    "hungarian": "[Magyar] (Hungarian)",
    "icelandic": "[Íslenska] (Icelandic)",
    "igbo": "[Asụsụ Igbo] (Igbo)",
    "ilocano": "[Ilokano] (Ilocano)",
    "indonesian": "[Bahasa Indonesia] (Indonesian)",
    "irish": "[Gaeilge] (Irish)",
    "italian": "[Italiano] (Italian)",
    "japanese": "[日本語] (Japanese)",
    "javanese": "[Basa Jawa] (Javanese)",
    "kannada": "[ಕನ್ನಡ] (Kannada)",
    "kazakh": "[Қазақша] (Kazakh)",
    "khmer": "[ខ្មែរ] (Khmer)",
    "kinyarwanda": "[Ikinyarwanda] (Kinyarwanda)",
    "konkani": "[कोंकणी] (Konkani)",
    "korean": "[한국어] (Korean)",
    "krio": "[Krio] (Krio)",
    "kurdish": "[Kurdî] (Kurdish, Kurmanji)",
    "kyrgyz": "[Кыргызча] (Kyrgyz)",
    "lao": "[ລາວ] (Lao)",
    "latin": "[Latina] (Latin)",
    "latvian": "[Latviešu] (Latvian)",
    "lingala": "[Lingála] (Lingala)",
    "lithuanian": "[Lietuvių] (Lithuanian)",
    "luganda": "[Luganda] (Luganda)",
    "luxembourgish": "[Lëtzebuergesch] (Luxembourgish)",
    "macedonian": "[Македонски] (Macedonian)",
    "maithili": "[मैथिली] (Maithili)",
    "malagasy": "[Malagasy] (Malagasy)",
    "malay": "[Bahasa Melayu] (Malay)",
    "malayalam": "[മലയാളം] (Malayalam)",
    "maltese": "[Malti] (Maltese)",
    "maori": "[Te Reo Māori] (Maori)",
    "marathi": "[मराठी] (Marathi)",
    "meiteilon (manipuri)": "[ꯃꯩꯇꯩꯂꯣꯟ] (Meiteilon, Manipuri)",
    "mizo": "[Mizo ṭawng] (Mizo)",
    "mongolian": "[Монгол] (Mongolian)",
    "myanmar": "[မြန်မာစာ] (Burmese)",
    "nepali": "[नेपाली] (Nepali)",
    "norwegian": "[Norsk] (Norwegian)",
    "odia (oriya)": "[ଓଡ଼ିଆ] (Odia, Oriya)",
    "oromo": "[Afaan Oromoo] (Oromo)",
    "pashto": "[پښتو] (Pashto)",
    "persian": "[فارسی] (Persian)",
    "polish": "[Polski] (Polish)",
    "portuguese": "[Português BR] (Portuguese BR)",
    "punjabi": "[ਪੰਜਾਬੀ] (Punjabi)",
    "quechua": "[Runa Simi I] (Central Quechua)",
    "romanian": "[Română] (Romanian)",
    "russian": "[Русский] (Russian)",
    "samoan": "[Gagana Sāmoa] (Samoan)",
    "sanskrit": "[संस्कृतम्] (Sanskrit)",
    "scots gaelic": "[Gàidhlig] (Scots Gaelic)",
    "sepedi": "[Sesotho sa Leboa] (Sepedi)",
    "serbian": "[Српски] (Serbian)",
    "sesotho": "[Sesotho] (Sesotho)",
    "shona": "[ChiShona] (Shona)",
    "sindhi": "[سنڌي] (Sindhi)",
    "sinhala": "[සිංහල] (Sinhala)",
    "slovak": "[Slovenčina] (Slovak)",
    "slovenian": "[Slovenščina] (Slovenian)",
    "somali": "[Soomaali] (Somali)",
    "spanish": "[Español] (Spanish)",
    "sundanese": "[Basa Sunda] (Sundanese)",
    "swahili": "[Kiswahili] (Swahili)",
    "swedish": "[Svenska] (Swedish)",
    "tajik": "[Тоҷикӣ] (Tajik)",
    "tamil": "[தமிழ்] (Tamil)",
    "tatar": "[Татарча] (Tatar)",
    "telugu": "[తెలుగు] (Telugu)",
    "thai": "[ไทย] (Thai)",
    "tigrinya": "[ትግርኛ] (Tigrinya)",
    "tsonga": "[Xitsonga] (Tsonga)",
    "turkish": "[Türkçe] (Turkish)",
    "turkmen": "[Türkmençe] (Turkmen)",
    "twi": "[Twi] (Twi)",
    "ukrainian": "[Українська] (Ukrainian)",
    "urdu": "[اردو] (Urdu)",
    "uyghur": "[ئۇيغۇرچە] (Uyghur)",
    "uzbek": "[Oʻzbek] (Uzbek)",
    "vietnamese": "[Tiếng Việt] (Vietnamese)",
    "welsh": "[Cymraeg] (Welsh)",
    "xhosa": "[isiXhosa] (Xhosa)",
    "yiddish": "[ייִדיש] (Yiddish)",
    "yoruba": "[Yorùbá] (Yoruba)",
    "zulu": "[isiZulu] (Zulu)"
}

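# Languages translated with the Gemini model.
# Each value is used both as the target-language description inside the Gemini
# prompt and as the label written to the 'Language' column of the output.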
supported_languages_gemini = {
    "azerbaijani_2": "[Azərbaycan dili] (Azerbaijani)",
    "javanese_2": "[ꦧꦱꦗꦮ] (Javanese)",
    "kazakh_2": "[Қазақ тілі] (Kazakh)",
    "kurdish_2": "[کوردی] (Kurdish, Sorani)",
    "mongolian_2": "[Монгол хэл] (Mongolian)",
    "myanmar_2": "[မြန်မာ] (Myanmar)",
    "portuguese_2": "[Português EU] (Portuguese EU)",
    "quechua_2": "[Runa Simi II] (Peripheral Quechua)",
    "sindhi_2": "[सिन्धी] (Sindhi)",
    "sundanese_2": "[ᮘᮞ ᮞᮥᮔ᮪ᮓ] (Sundanese)",
    "tswana": "[Setswana] (Tswana)",
    "tatar_2": "[татар теле] (Tatar)",
    "ichibemba": "[Ichibemba] (Bemba)",
    "dari": "[دری] (Dari - Afghanistan)",
    "dzongkha": "[རྫོང་ཁ] (Dzongkha)",
    "kirundi": "[Ikirundi] (Kirundi)",
    "chittagonian": "[চাটগাঁইয়া] (Chittagonian)",
    "sylheti": "[সিলেটি] (Sylheti)",
    "sardinian": "[Sardu] (Sardinian)",
    "wolof": "[Wollof] (Wolof)",
    "balochi": "[بلوچی] (Balochi)"
}

# Function to get the language value from the key in the language map
def get_language_value_from_key(lang_key):
    """Return the display label registered for ``lang_key``.

    The deep_translator table is consulted first, then the Gemini table.

    Args:
        lang_key: Language key to look up.

    Returns:
        The label stored under ``lang_key`` in the first table that contains
        it, or ``lang_key`` itself when neither table has it.
    """
    language_tables = (supported_languages_deep_translator, supported_languages_gemini)
    for table in language_tables:
        if lang_key in table:
            return table[lang_key]
    # Unknown key: hand it back unchanged
    return lang_key

# Combine both dictionaries into one ordered list: deep_translator languages
# first, then the Gemini-only languages
combined_languages = list(supported_languages_deep_translator) + list(supported_languages_gemini)

# List supported languages
print("Supported languages for translation:")
for lang in combined_languages:
    print(lang)

# Get starting language from user.
# Every key in both language tables is lowercase without surrounding
# whitespace, so normalise the reply; otherwise "French" or "french " would be
# rejected and the whole run aborted.
chosen_language = input("Please choose a language to start from the list above: ").strip().lower()

# Ensure the starting language is valid
if chosen_language not in combined_languages:
    raise ValueError(f"The chosen language '{chosen_language}' is not supported.")

# Find the starting index; translation later runs from this position to the
# end of combined_languages
start_index = combined_languages.index(chosen_language)

# Function to clean message text
# Deletion table for every ASCII punctuation character, built once instead of
# on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_message_text(message_text):
    """Return ``message_text`` with all ASCII punctuation removed.

    ``string.punctuation`` already contains the braces, brackets and double
    quote, so a single ``str.translate`` pass removes them together with the
    rest of the punctuation; no separate regex pass is needed.

    Args:
        message_text: Text to clean.

    Returns:
        The text without any character from ``string.punctuation``. Letters,
        digits, whitespace and non-ASCII characters are kept as they are.
    """
    return message_text.translate(_PUNCTUATION_TABLE)

# Function to translate using Gemini model
def translate_with_gemini(text, target_lang, model_name='gemini-1.5-flash-001', max_output_tokens=8192):
    """Translate ``text`` into ``target_lang`` with a Vertex AI Gemini model.

    Args:
        text: Source text to translate.
        target_lang: Description of the target language; it is interpolated
            into the prompt as-is.
        model_name: Identifier of the generative model to call.
        max_output_tokens: Upper bound on the number of generated tokens. The
            previous fixed value of 512 cut off translations of longer inputs.

    Returns:
        The model's reply with surrounding whitespace stripped, an empty string
        when ``text`` is blank, or ``None`` when the model could not be created
        or the request failed (the error is printed).
    """
    # Nothing to translate: skip the model call, which would otherwise reply
    # with invented content.
    if isinstance(text, str) and not text.strip():
        return ''

    prompt = f"You are a translator who provides precise, word-for-word translations. Always respond exclusively in the requested language and avoid any additional commentary or English. Translate the following text to {target_lang}: {text}"
    generation_config = GenerationConfig(
        temperature=0.5,
        max_output_tokens=max_output_tokens,
        top_p=0.9,
        top_k=40
    )

    try:
        # Created inside the try so that a failure here is reported through
        # the same return-None path as a failed request.
        gemini_model = GenerativeModel(model_name=model_name)
        response = gemini_model.generate_content(
            contents=[Content(role="user", parts=[Part.from_text(prompt)])],
            generation_config=generation_config
        )
        return response.text.strip()
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Function to translate using deep_translator
def translate_with_deep_translator(text, target_lang):
    """Translate ``text`` into ``target_lang`` through ``GoogleTranslator``.

    Args:
        text: Source text; its language is auto-detected (``source='auto'``).
        target_lang: Target language passed to ``GoogleTranslator``.

    Returns:
        Whatever ``GoogleTranslator.translate`` returns for ``text``.
    """
    return GoogleTranslator(source='auto', target=target_lang).translate(text)

# Translate and save for all languages starting from the chosen one.
# One CSV file is written per language.
max_retries = 3  # attempts per row before an empty translation is recorded

for lang in combined_languages[start_index:]:
    # Pick the backend. deep_translator receives the language key itself as
    # its target; Gemini receives the descriptive label, which ends up in its
    # prompt.
    if lang in supported_languages_deep_translator:
        use_gemini = False
        lang_code = lang
    else:
        # A key missing from both tables raises KeyError here instead of
        # silently reusing the previous iteration's backend and target.
        use_gemini = True
        lang_code = supported_languages_gemini[lang]
    translate = translate_with_gemini if use_gemini else translate_with_deep_translator

    # Label stored in the 'Language' column; identical for every row of this
    # language, so look it up once.
    language_value = get_language_value_from_key(lang)

    # Rows of the output file for the current language
    translated_data = []
    total_rows = len(df)

    # Translate each question and answer into the current language
    print(f"Translating to {lang}...")
    for index, row in df.iterrows():
        print(f"Translating row {index + 1}/{total_rows}...")

        # Recorded as-is when every attempt fails, so the output keeps one row
        # per input row.
        record = {'Question': '', 'Answer': '', 'Language': language_value}

        for attempt in range(1, max_retries + 1):
            try:
                translated_question = translate(row['human'], lang_code)
                translated_answer = translate(row['assistant'], lang_code)

                # translate_with_gemini signals failure by returning None
                if translated_question is None or translated_answer is None:
                    raise ValueError("Translation returned None.")
            except Exception as e:
                print(f"Error translating row {index + 1}: {e}")
                if attempt < max_retries:
                    print(f"Retrying... ({attempt}/{max_retries})")
            else:
                record = {'Question': translated_question, 'Answer': translated_answer, 'Language': language_value}
                break

        translated_data.append(record)

    print(f"Completed translation for {lang}")

    # Convert the list of dictionaries to a DataFrame
    translated_df = pd.DataFrame(translated_data)

    # Define the output file path with the current language. 'FILE_PATH' is a
    # placeholder prefix; the language key (reduced to letters, digits and
    # underscores) is appended so that each language gets its own file instead
    # of overwriting the previous one.
    safe_lang = re.sub(r'\W+', '_', lang).strip('_')
    output_file_path = f'FILE_PATH_{safe_lang}.csv'

    # Save the translated dataframe to a new CSV file
    translated_df.to_csv(output_file_path, index=False)

    print(f"Translation complete. The translated file is saved to {output_file_path}.")