In [1]:
# Google Cloud project ID; passed as `project_id` to the
# batch_translate_document(...) calls later in this notebook.
gcp_project_id = "fleet-pillar-408114"

In [2]:
import os
from google.cloud import translate_v3beta1 as translate

## Create a Service Account:

In the Cloud Console, navigate to "IAM & Admin" > "Service accounts."
Click "Create Service Account."
Enter a name for the service account, select the role(s) it needs (Cloud Translation API Editor, Dataplex Storage Data Writer, Storage Object Admin), and click "Continue."

### Generate Key:
After creating the service account, click on it in the "Service accounts" page.
Navigate to the "Keys" tab.
Click on "Add Key" and choose "JSON." This will download a JSON key file containing the necessary credentials.

In [3]:
# Path to the service-account JSON key file generated in the step above.
credential_path = "fleet-pillar-408114-fe78fbda4a81.json"

# Export the key location through the GOOGLE_APPLICATION_CREDENTIALS
# environment variable for the rest of this kernel session.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credential_path

## Download the dataset and save it locally

### Batch translation doesn't support CSV, so convert the files to XLSX

In [None]:
import pandas as pd

def convert_csv_to_xlsx(input_folder, output_folder):
    """Convert every CSV one level below ``input_folder`` into an Excel workbook.

    The layout ``input_folder/<subfolder>/<name>.csv`` is mirrored as
    ``output_folder/<subfolder>/<name>.xlsx``. Only the immediate subfolders of
    ``input_folder`` are scanned: files placed directly in ``input_folder`` and
    anything nested deeper are ignored, as are files whose name does not end
    in ``.csv``.

    Args:
        input_folder: Directory whose immediate subfolders contain CSV files.
        output_folder: Directory that receives the mirrored subfolders. It is
            created if it does not exist.

    Returns:
        None. The workbooks are written to disk as a side effect.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for folder_name in os.listdir(input_folder):
        folder_path = os.path.join(input_folder, folder_name)

        # Skip plain files sitting directly in the input folder.
        if not os.path.isdir(folder_path):
            continue

        csv_files = [name for name in os.listdir(folder_path) if name.endswith('.csv')]
        if not csv_files:
            continue

        # DataFrame.to_excel does not create missing parent directories, so the
        # mirrored subfolder has to exist before the first workbook is written.
        target_folder = os.path.join(output_folder, folder_name)
        os.makedirs(target_folder, exist_ok=True)

        for csv_file in csv_files:
            df = pd.read_csv(os.path.join(folder_path, csv_file))

            # Same base name, .xlsx extension.
            xlsx_file = os.path.splitext(csv_file)[0] + '.xlsx'
            df.to_excel(os.path.join(target_folder, xlsx_file), index=False)

if __name__ == "__main__":
    # Root directory whose subfolders hold the CSV files to convert.
    main_folder = "data"

    # Destination root for the generated .xlsx files.
    output_folder = "data_xlsx"

    # Run the CSV -> XLSX conversion over every subfolder.
    convert_csv_to_xlsx(input_folder=main_folder, output_folder=output_folder)


### Go to Cloud Storage and create two buckets: one to hold the input dataset and one to receive the translated output

Upload the downloaded dataset (converted to `.xlsx` in the previous step) to the input bucket.

## Translate documents (batch)

In [4]:
from google.cloud import translate_v3beta1 as translate

def batch_translate_document(
    input_uri: str,
    output_uri: str,
    project_id: str,
    timeout: int = 180,
    location: str = "us-central1",
    source_language_code: str = "en-US",
    target_language_codes=("hi",),
) -> translate.BatchTranslateDocumentResponse:
    """Batch translate documents stored in Google Cloud Storage.

    Starts a batch document translation operation, blocks until it finishes
    (or until ``timeout`` elapses), prints the total page count and returns
    the response.

    Args:
        input_uri: Google Cloud Storage location of the input. This can be a
            single file (for example, ``gs://translation-test/input.docx``) or
            a wildcard (for example, ``gs://translation-test/*``).
            Supported file types:
            https://cloud.google.com/translate/docs/supported-formats
        output_uri: Google Cloud Storage prefix under which the translated
            documents are written.
        project_id: The GCP project ID.
        timeout: Seconds to wait for the operation result before giving up.
        location: Region used to build the request parent
            (``projects/{project_id}/locations/{location}``).
        source_language_code: Language code of the input documents.
        target_language_codes: Iterable of language codes to translate into.

    Returns:
        The response of the completed batch translation operation.

    Raises:
        Whatever ``operation.result`` raises when the wait exceeds ``timeout``
        or the operation fails. NOTE(review): the notebook text says the
        output folder still gets populated after a timeout — confirm that
        the server-side job keeps running before relying on it.
    """
    client = translate.TranslationServiceClient()

    batch_document_input_configs = {
        "gcs_source": {"input_uri": input_uri},
    }
    batch_document_output_config = {
        "gcs_destination": {"output_uri_prefix": output_uri},
    }
    parent = f"projects/{project_id}/locations/{location}"

    operation = client.batch_translate_document(
        request={
            "parent": parent,
            "source_language_code": source_language_code,
            # Copy into a list so any iterable (tuple, list, ...) is accepted.
            "target_language_codes": list(target_language_codes),
            "input_configs": [batch_document_input_configs],
            "output_config": batch_document_output_config,
        }
    )

    print("Waiting for operation to complete...")
    response = operation.result(timeout)

    print(f"Total Pages: {response.total_pages}")

    return response


#### Call the function once per dataset folder (test, train, val), passing the Cloud Storage path of the input folder and the path of the output folder where the translations should be stored

#### The call will probably raise a timeout error, but after some time the translated files will still appear in the output folder.

In [None]:
# Translate the train folder.
batch_translate_document(
    input_uri="gs://mmlu/data/train/*",
    output_uri="gs://mmlu_output/data/train/",
    project_id=gcp_project_id,
)

In [None]:
# Translate the test folder. This cell was an exact copy of the train call,
# so the test data was never translated.
batch_translate_document(
    input_uri="gs://mmlu/data/test/*",
    output_uri="gs://mmlu_output/data/test/",
    project_id=gcp_project_id,
)

In [None]:
# Translate the val folder. This cell was an exact copy of the train call,
# so the val data was never translated.
batch_translate_document(
    input_uri="gs://mmlu/data/val/*",
    output_uri="gs://mmlu_output/data/val/",
    project_id=gcp_project_id,
)