In [None]:
!pip install -q datasets
!pip install -q openai
!pip install -q runpod
!pip install -q aiohttp
!pip install -q nest_asyncio

In [2]:
import datasets

In [None]:
ds = datasets.load_dataset('Sunbird/salt', 'text-all')

In [5]:
df_train = ds['train'].to_pandas()

In [None]:
df_train.head()

In [None]:
df_train.shape

In [8]:
import os
import openai
import requests
import time
import getpass
from tqdm.auto import tqdm
import runpod

In [None]:
# Prompt for each credential without echoing it to the screen and store it
# in the process environment under the same name.
for _secret_name in ("OPENAI_API_KEY", "AUTH_TOKEN", "RUNPOD_ENDPOINT_ID", "RUNPOD_API_KEY"):
    os.environ[_secret_name] = getpass.getpass(f"Enter your {_secret_name}: ")

In [10]:
# RunPod configuration, read from the process environment.
runpod.api_key = os.environ.get("RUNPOD_API_KEY")
RUNPOD_ENDPOINT_ID = os.environ.get("RUNPOD_ENDPOINT_ID")

In [11]:
client = openai.OpenAI()

In [12]:
def get_preceeded_sentences(text, model="gpt-4o-mini"):
    """Ask the chat model for 1-2 sentences that could come before ``text``.

    Args:
        text: The sentence for which preceding context is requested.
        model: Name of the chat-completions model to query.

    Returns:
        The message content of the first choice in the model's response.
    """
    prompt_template = """
    Create 1-2 sentences which could have preceeded this sentence:

    {text}

    Don't return the original sentence
    """
    # Uses the module-level OpenAI `client`; the whole request is a single
    # user message built from the template.
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt_template.format(text=text)}],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

The dictionary below lists the language codes currently supported by the translate endpoint:

```python
language_codes = {
    "English": "eng",
    "Luganda": "lug",
    "Runyankole": "nyn",
    "Acholi": "ach",
    "Ateso": "teo",
    "Lugbara": "lgg"
}
```

In [14]:
def translate(text, source_language, target_language):
    """Translate ``text`` by calling the RunPod endpoint synchronously.

    Args:
        text: Text to translate; surrounding whitespace is stripped first.
        source_language: Language code of ``text`` (e.g. "eng").
        target_language: Language code to translate into (e.g. "lug").

    Returns:
        The value stored under "translated_text" in the endpoint's response,
        or None if the response has no such key.
    """
    endpoint = runpod.Endpoint(RUNPOD_ENDPOINT_ID)

    job_input = {
        "task": "translate",
        "source_language": source_language,
        "target_language": target_language,
        "text": text.strip(),
    }
    # Blocks until the endpoint returns (the call is given a 600 timeout).
    result = endpoint.run_sync({"input": job_input}, timeout=600)

    return result.get("translated_text")

In [172]:
def get_dataframe_with_specific_ids(df, ids):
    """Select the rows of ``df`` whose 'id' value appears in ``ids``.

    Args:
        df: DataFrame with an 'id' column.
        ids: Collection of id values to keep.

    Returns:
        The matching rows of ``df``, with their original index and order.
    """
    id_is_wanted = df['id'].isin(ids)
    return df.loc[id_is_wanted]

In [173]:
# The 21 consecutive ids 597..617 (range's upper bound is exclusive).
ids = list(range(597, 618))
new_df_train = get_dataframe_with_specific_ids(df_train, ids)

In [None]:
new_df_train.shape

In [None]:
text = ("To ensure a successful harvest, it's important to consider the ideal climate for planting. "
"Many gardeners find that certain vegetables thrive when exposed to higher temperatures.")
text

In [None]:
translate(text, "eng", "lug")

In [188]:
prompts_list = []   # {'id': 96, 'eng_prompt': 'xxxx', 'lug_prompt': 'xxx'}

In [None]:
len(prompts_list)

In [None]:
# For every training row: generate an English "preceding context" prompt, then
# translate it into each target language and append the record to prompts_list.
# iterrows() is a generator with no length, so tqdm needs an explicit total to
# show a percentage and ETA.
# To process only part of the frame, iterate over a slice instead, e.g.
# tqdm(df_train.iloc[1200:].iterrows(), total=len(df_train) - 1200).
for index, row in tqdm(df_train.iterrows(), total=len(df_train)):
  eng_prompt = get_preceeded_sentences(row['eng_text'])
  lug_prompt = translate(eng_prompt, 'eng', 'lug')
  ach_prompt = translate(eng_prompt, 'eng', 'ach')
  teo_prompt = translate(eng_prompt, 'eng', 'teo')
  lgg_prompt = translate(eng_prompt, 'eng', 'lgg')
  nyn_prompt = translate(eng_prompt, 'eng', 'nyn')
  prompts_list.append(
      {
          'id': row['id'], 'eng_prompt': eng_prompt, 'lug_prompt': lug_prompt,
          'ach_prompt': ach_prompt, 'teo_prompt': teo_prompt, 'lgg_prompt': lgg_prompt,
          'nyn_prompt': nyn_prompt
      }
  )
  # Pause one second between rows (presumably to stay under API rate limits).
  time.sleep(1)


In [None]:
prompts_list[:2]

In [None]:
len(prompts_list)

In [180]:
# Convert the accumulated list of prompt dicts into a DataFrame.

import pandas as pd

prompts_df = pd.DataFrame(prompts_list)


In [None]:
prompts_df.head()

In [None]:
prompts_df.tail()

In [None]:
prompts_df.shape

In [191]:
# Save this batch of prompts to its own CSV file (without the index column).
# NOTE(review): the batch number in the file name is hard-coded; change it per
# batch so earlier batch files are not overwritten.

prompts_df.to_csv('prompts_14.csv', index=False)


## Combining all prompt CSV files into a single prompts dataset CSV file

In [192]:
import pandas as pd
import glob

# Collect every per-batch CSV. Sorting makes the concatenation order
# independent of the order in which the filesystem lists the files.
csv_files = sorted(glob.glob('prompts_*.csv'))
if not csv_files:
    # pd.concat on an empty list fails with an unhelpful
    # "No objects to concatenate"; report the real problem instead.
    raise FileNotFoundError("No files matching 'prompts_*.csv' found in the current directory")

# Read each batch file into its own DataFrame
df_list = [pd.read_csv(filename) for filename in csv_files]

# Concatenate all DataFrames into a single DataFrame with a fresh 0..n-1 index
combined_prompts_df = pd.concat(df_list, ignore_index=True)

In [None]:
combined_prompts_df.shape

In [None]:
combined_prompts_df.head()

In [195]:
# Sort by id with a stable algorithm (the default quicksort is not stable), so
# rows sharing an id keep their concatenation order and a subsequent
# drop_duplicates(subset=['id']) keeps a predictable row.
combined_prompts_df_sorted = combined_prompts_df.sort_values(by=['id'], ascending=True, kind='mergesort')

In [None]:
combined_prompts_df_sorted.head()

In [None]:
combined_prompts_df_sorted.tail()

In [198]:
combined_prompts_df_sorted = combined_prompts_df_sorted.drop_duplicates(subset=['id'])

In [None]:
combined_prompts_df_sorted.shape

In [200]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('prompts_dataset', exist_ok=True)
combined_prompts_df_sorted.to_csv('prompts_dataset/prompts.csv', index=False)

### Confirming that the prompts dataset is complete and correct

In [156]:
df_train_ids = df_train["id"].to_list()

In [None]:
len(df_train_ids)

In [158]:
combined_ids = combined_prompts_df_sorted["id"].to_list()

In [None]:
len(combined_ids)

In [160]:
def find_difference(list1, list2):
    """Return the distinct elements of ``list1`` that do not occur in ``list2``.

    Both inputs are converted to sets, so duplicates are collapsed and the
    order of the returned list is not guaranteed.

    Args:
        list1: The list whose elements are kept.
        list2: The list whose elements are excluded.

    Returns:
        A list of the elements present in list1 but absent from list2.
    """
    only_in_first = set(list1).difference(set(list2))
    return list(only_in_first)

In [161]:
diff = find_difference(df_train_ids, combined_ids)

In [None]:
print(diff)

In [None]:
len(diff)

In [None]:
# diff is empty when every training id already has a prompt, so guard the
# index access instead of raising IndexError in the success case.
type(diff[0]) if diff else None

In [None]:
# Show the training rows whose ids are still missing from the prompts dataset
# (the function requires the list of ids as its second argument).
get_dataframe_with_specific_ids(df_train, diff)

## Working with Async (Just testing out async, didn't work as expected)

In [None]:
import asyncio
import time
import random
import aiohttp  # For making async HTTP requests
from openai import OpenAIError  # To handle OpenAI API errors
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Constants for retry logic
MAX_RETRIES = 5  # Maximum number of attempts per call
INITIAL_DELAY = 1  # Initial delay between retries (in seconds)
BACKOFF_FACTOR = 2  # Exponential backoff factor

In [None]:
# Asynchronous version of `get_preceeded_sentences`
async def get_preceeded_sentences_async(text, model="gpt-4o-mini"):
    """Async variant of the prompt generator, with retries and backoff.

    The blocking OpenAI call is run in a worker thread. On failure the call is
    retried up to MAX_RETRIES times, sleeping
    INITIAL_DELAY * BACKOFF_FACTOR ** attempt plus up to one second of random
    jitter between attempts.

    Args:
        text: The sentence for which preceding context is requested.
        model: Name of the chat-completions model to query.

    Returns:
        The message content of the first choice in the model's response.

    Raises:
        Exception: If every attempt fails. The last underlying error is
            attached as ``__cause__``.
    """
    prompt_template = """
    Create 1-2 sentences which could have preceeded this sentence:

    {text}

    Don't return the original sentence
    """
    prompt = prompt_template.format(text=text)
    messages = [{"role": "user", "content": prompt}]

    # The OpenAI client call is blocking, so it is executed in a thread.
    def make_openai_call():
        return client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0
        )

    for attempt in range(MAX_RETRIES):
        try:
            response = await asyncio.to_thread(make_openai_call)
            return response.choices[0].message.content
        except Exception as e:
            if attempt == MAX_RETRIES - 1:
                # Out of attempts: say so (not "Retrying...") and keep the
                # original error chained for debugging.
                print(f"OpenAI Error: {e}. Attempt {attempt + 1} of {MAX_RETRIES}. Giving up.")
                raise Exception(f"get_preceeded_sentences failed after {MAX_RETRIES} attempts.") from e
            print(f"OpenAI Error: {e}. Attempt {attempt + 1} of {MAX_RETRIES}. Retrying...")
            delay = INITIAL_DELAY * (BACKOFF_FACTOR ** attempt) + random.uniform(0, 1)
            await asyncio.sleep(delay)

In [None]:
# Asynchronous version of `translate`
async def translate_async(text, source_language, target_language):
    """Async variant of ``translate``, with retries and backoff.

    The blocking RunPod ``run_sync`` call is run in a worker thread. On failure
    the call is retried up to MAX_RETRIES times, sleeping
    INITIAL_DELAY * BACKOFF_FACTOR ** attempt plus up to one second of random
    jitter between attempts.

    Args:
        text: Text to translate; surrounding whitespace is stripped first.
        source_language: Language code of ``text`` (e.g. "eng").
        target_language: Language code to translate into (e.g. "lug").

    Returns:
        The value stored under "translated_text" in the endpoint's response,
        or None if the response has no such key.

    Raises:
        Exception: If every attempt fails. The last underlying error is
            attached as ``__cause__``.
    """
    endpoint = runpod.Endpoint(RUNPOD_ENDPOINT_ID)
    data = {
        "input": {
            "task": "translate",
            "source_language": source_language,
            "target_language": target_language,
            "text": text.strip(),
        }
    }

    # `run_sync` blocks, so it is executed in a thread via asyncio.to_thread.
    def run_translation_sync():
        return endpoint.run_sync(data, timeout=600)

    for attempt in range(MAX_RETRIES):
        try:
            response = await asyncio.to_thread(run_translation_sync)
            return response.get("translated_text")
        except Exception as e:
            if attempt == MAX_RETRIES - 1:
                # Out of attempts: say so (not "Retrying...") and keep the
                # original error chained for debugging.
                print(f"Runpod Error: {e}. Attempt {attempt + 1} of {MAX_RETRIES}. Giving up.")
                raise Exception(f"translate failed after {MAX_RETRIES} attempts.") from e
            print(f"Runpod Error: {e}. Attempt {attempt + 1} of {MAX_RETRIES}. Retrying...")
            delay = INITIAL_DELAY * (BACKOFF_FACTOR ** attempt) + random.uniform(0, 1)
            await asyncio.sleep(delay)


In [None]:
# The rest of your code remains the same, with `get_preceeded_sentences_async` and `translate_async` replacing the original synchronous functions
async def process_row_async(row, session):
    """Build the prompt record for one dataset row.

    Generates the English prompt from ``row['eng_text']`` and then translates
    it into all target languages concurrently.

    Args:
        row: A row providing 'id' and 'eng_text'.
        session: Accepted for interface compatibility; not used here.

    Returns:
        A dict with 'id', 'eng_prompt' and one '<code>_prompt' entry per target
        language, or None if any step raised (the error is printed).
    """
    target_codes = ('lug', 'ach', 'teo', 'lgg', 'nyn')
    try:
        eng_prompt = await get_preceeded_sentences_async(row['eng_text'])

        # One translation request per target language, awaited together.
        translations = await asyncio.gather(
            *(translate_async(eng_prompt, 'eng', code) for code in target_codes)
        )

        record = {'id': row['id'], 'eng_prompt': eng_prompt}
        for code, translated in zip(target_codes, translations):
            record[f'{code}_prompt'] = translated
        return record

    except Exception as e:
        print(f"Failed to process row {row['id']}: {e}")
        return None  # Signals a failed row to the caller

In [None]:
async def process_rows(df):
    """Process every row of ``df`` concurrently and collect the successes.

    Args:
        df: DataFrame whose rows are passed to ``process_row_async``.

    Returns:
        A list of the non-None results of ``process_row_async``, in row order.
    """
    # The session is handed to process_row_async for each row.
    async with aiohttp.ClientSession() as session:
        # Advance the bar when a row finishes; wrapping the loop that merely
        # creates the coroutines would complete the bar instantly.
        with tqdm(total=len(df)) as progress:

            async def process_and_report(row):
                result = await process_row_async(row, session)
                progress.update(1)
                return result

            # gather returns results in the order the coroutines were passed
            results = await asyncio.gather(
                *(process_and_report(row) for _, row in df.iterrows())
            )

    # Filter out failed rows (None values)
    return [res for res in results if res is not None]

In [None]:
def run_translation_process(df_train, start=2029, stop=2039):
    """Run the async pipeline over a positional slice of ``df_train``.

    Args:
        df_train: DataFrame of training rows.
        start: First row position to process (inclusive). Defaults to the
            previously hard-coded 2029.
        stop: Row position to stop at (exclusive). Defaults to the previously
            hard-coded 2039.

    Returns:
        The list of prompt records returned by ``process_rows``.
    """
    # NOTE(review): inside a notebook the event loop is already running, so
    # run_until_complete presumably relies on nest_asyncio.apply() having been
    # called first - confirm before reusing this outside the notebook.
    loop = asyncio.get_event_loop()
    return loop.run_until_complete(process_rows(df_train.iloc[start:stop]))

In [None]:
# Run the function
prompts_list = run_translation_process(df_train)