In [98]:
import os
import csv
import time
import pandas as pd
from datasets import load_dataset
from concurrent.futures import ThreadPoolExecutor    # Concurrent execution using threads
from openai import OpenAI

In [99]:
# Prefer exporting OPENAI_API_KEY outside the notebook so the real key is never
# saved in a cell. setdefault() only installs the placeholder when the variable
# is not already set, so a key that is already exported is no longer overwritten.
os.environ.setdefault("OPENAI_API_KEY", "api_key")

In [4]:
# Dataset to convert. Supported values: "ai2_arc", "gsm8k", "lukaemon/mmlu".
dataset_name = 'ai2_arc'

In [5]:
# Per-dataset settings, selected by `dataset_name`:
#   possible_configs - config names that are loaded one by one
#   columns          - columns whose values are sent for translation
#   columns_asis     - columns copied to the output without translation
if dataset_name == "ai2_arc":
    possible_configs = [
        'ARC-Challenge',
        'ARC-Easy',
    ]
    # columns to translate
    columns = ['question', 'choices']
    # columns not to translate, to keep in converted dataset as is.
    columns_asis = ['id', 'answerKey']

elif dataset_name == "gsm8k":
    possible_configs = [
        "main",
        "socratic",
    ]
    # columns to translate
    columns = ['question', 'answer']
    # columns not to translate, to keep in converted dataset as is.
    columns_asis = []

elif dataset_name == "lukaemon/mmlu":
    possible_configs = [
        "high_school_european_history",
        "business_ethics",
        "clinical_knowledge",
        "medical_genetics",
        "high_school_us_history",
        "high_school_physics",
        "high_school_world_history",
        "virology",
        "high_school_microeconomics",
        "econometrics",
        "college_computer_science",
        "high_school_biology",
        "abstract_algebra",
        "professional_accounting",
        "philosophy",
        "professional_medicine",
        "nutrition",
        "global_facts",
        "machine_learning",
        "security_studies",
        "public_relations",
        "professional_psychology",
        "prehistory",
        "anatomy",
        "human_sexuality",
        "college_medicine",
        "high_school_government_and_politics",
        "college_chemistry",
        "logical_fallacies",
        "high_school_geography",
        "elementary_mathematics",
        "human_aging",
        "college_mathematics",
        "high_school_psychology",
        "formal_logic",
        "high_school_statistics",
        "international_law",
        "high_school_mathematics",
        "high_school_computer_science",
        "conceptual_physics",
        "miscellaneous",
        "high_school_chemistry",
        "marketing",
        "professional_law",
        "management",
        "college_physics",
        "jurisprudence",
        "world_religions",
        "sociology",
        "us_foreign_policy",
        "high_school_macroeconomics",
        "computer_security",
        "moral_scenarios",
        "moral_disputes",
        "electrical_engineering",
        "astronomy",
        "college_biology",
    ]
    # columns to translate
    columns = ['input', 'A', 'B', 'C', 'D']
    # columns not to translate, to keep in converted dataset as is.
    columns_asis = ['target']

else:
    # Fail here instead of later with a NameError (fresh kernel) or, worse,
    # silently reusing the settings left over from a previous dataset.
    raise ValueError(
        f"Unsupported dataset_name {dataset_name!r}; "
        "expected one of 'ai2_arc', 'gsm8k', 'lukaemon/mmlu'."
    )

In [6]:
# Load every config of the chosen dataset; dataset[k] corresponds to
# possible_configs[k]. Verification is skipped via ignore_verifications=True.
dataset = [
    load_dataset(dataset_name, config_name, ignore_verifications=True)
    for config_name in possible_configs
]

Found cached dataset parquet (C:/Users/karti/.cache/huggingface/datasets/parquet/ARC-Challenge-6372c4cac1109a78/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset parquet (C:/Users/karti/.cache/huggingface/datasets/parquet/ARC-Challenge-6372c4cac1109a78/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/3 [00:00<?, ?it/s]

In [92]:
# Shared client used by translate_gpt.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
client = OpenAI()


def translate_gpt(text: str):
    """Ask gpt-3.5-turbo to rewrite ``text`` as Hinglish.

    The request consists of four system messages carrying the romanization
    instructions, followed by ``text`` as the single user message.

    Args:
        text: The English text to convert.

    Returns:
        The message content of the first choice in the completion.
    """
    system_prompts = (
        "romanize the given text in english to hinglish.(written in english but sounding like hindi)",
        "how are you becomes - kaise ho ap",
        "numbers, nouns, verbs stay in English and rest is in Hindi. Do not give original text back",
        "do not translate to hindi, translate to hinglish",
    )
    messages = [{"role": "system", "content": prompt} for prompt in system_prompts]
    messages.append({"role": "user", "content": text})

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [93]:
# Smoke test: translate a stringified choices payload (texts plus labels).
sample_choices = '{ "text": [ "cooling of flowing magma.", "converging of crustal plates.", "deposition of river sediments.", "solution of carbonate minerals." ], "label": [ "A", "B", "C", "D" ] }'
res = translate_gpt(sample_choices)
print(res)

{ "text": [ "flowing magma ki thandi hona.", "crustal plates ki sanyojan.", "river sediments ki raksha.", "carbonate minerals ki hal" ], "label": [ "A", "B", "C", "D" ] }


In [100]:
# Smoke test: translate a plain question; the bare call displays the result.
sample_question = 'The end result in the process of photosynthesis is the production of sugar and oxygen. Which step signals the beginning of photosynthesis?'
translate_gpt(sample_question)

'The end result in the process of photosynthesis is sugar aur oxygen ki utpadan. Konsa kadam ishara deta hai photosynthesis ki shuruat ka?'

In [96]:
%%time

# Translate the selected columns of every split of every loaded config and
# append the rows to <config>/<split>.csv, relative to the working directory.
#
# Row layout in the CSV: the translated `columns` in order; for each name in
# `columns_asis`, 'id' is inserted at the front and any other name is appended.
max_rows = 3     # only the first `max_rows` values of each column are translated
batch_size = 1   # values handed to the pool per batch; each batch is fully
                 # collected before the next starts, so 1 keeps a single
                 # request in flight at a time

for i, config_name in enumerate(possible_configs):
    # `split_name` replaces the old loop variable `set`, which shadowed the
    # builtin for the rest of the kernel session.
    for split_name in dataset[i]:
        split_data = dataset[i][split_name]
        set_list = []

        for col in columns:
            values = [str(item[col]) for item in split_data][:max_rows]

            # `result` is rebuilt unconditionally for every column (the old
            # `__name__` guard could leave it undefined or stale).
            result = []
            with ThreadPoolExecutor(max_workers=16) as exe:
                for start in range(0, len(values), batch_size):
                    batch = values[start:start + batch_size]
                    # Executor.map yields results in input order.
                    result.extend(exe.map(translate_gpt, batch))

            set_list.append(result)

        # One output folder per configuration.
        os.makedirs(config_name, exist_ok=True)

        # set_list holds one list per column; transpose it into one list per row.
        transposed_data = list(map(list, zip(*set_list)))

        # Add the untranslated columns. Each column is read once instead of
        # once per row.
        for col in columns_asis:
            col_values = split_data[col]
            for row_idx, row in enumerate(transposed_data):
                position = 0 if col == 'id' else len(row)
                row.insert(position, col_values[row_idx])

        path = os.path.join(config_name, f'{split_name}.csv')

        # Append to a previously created csv file in case the full dataset was
        # not converted in one go. newline='' is required by the csv module;
        # without it an extra '\r' is written per row on Windows and embedded
        # newlines in fields are not preserved.
        with open(path, 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            # No header row is written; the file is read back with header=None.
            writer.writerows(transposed_data)


CPU times: total: 1.36 s
Wall time: 1min 10s


In [97]:
# `path` still points at the last CSV written by the translation loop.
print(path)

# The file has no header row, so the columns get integer labels.
df = pd.read_csv(filepath_or_buffer=path, header=None)

# Preview the first five rows.
df.head(5)

ARC-Easy\test.csv


Unnamed: 0,0,1,2,3
0,Mercury_7175875,Ek astronoom dekhta hai ki ek planet ek meteor...,"{'text': ['Planetary density will decrease.', ...",C
1,Mercury_SC_409171,एक समूह के इंजीनियर्स ने जानना चाहा कि भूकंप क...,'buildings will be built faster' -> 'buildings...,B
2,Mercury_SC_408547,The end result in the process of photosynthesi...,{'text': ['Chemical energy roots mein se graha...,C
