<a href="https://colab.research.google.com/github/SunbirdAI/salt/blob/main/notebooks/data_preparation/pronoun_modification_for_english_targets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create English target text by converting pronouns

In some local languages, pronouns do not specify gender by default. When translating into English, we therefore need to train the models not to invent a gender when none has been specified in the source text. This notebook creates English target text by making pronoun gender ambiguous unless it can be inferred from the context.

In [None]:
#!git clone https://github.com/sunbirdai/salt.git
#!pip install -qr salt/requirements.txt
!pip install -q openai
!pip install -q datasets

In [None]:
import multiprocessing
from multiprocessing.pool import ThreadPool
import re

import datasets
import numpy as np
import openai
import pandas as pd
import pydantic
from tqdm.notebook import tqdm
import yaml

In [None]:
from google.colab import userdata

# Shared OpenAI client used by the helper functions below. The API key is read
# from the Colab secrets store under the name 'OPENAI_API_KEY'.
client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

In [None]:
#@title Helper functions
def ambiguate_gender(s):
  '''Replace gendered English pronouns with gender-neutral placeholders.

  Matching is case-insensitive and restricted to whole words, so e.g. "the"
  and "here" are left untouched.

  Args:
    s: the text to process.

  Returns:
    A copy of `s` in which every gendered pronoun has been replaced by its
    upper-case placeholder (HE_SHE, HIM_HER, ...). If `s` contains no
    gendered pronoun it is returned unchanged.
  '''

  substitutions = {
      'he': 'HE_SHE',
      'she': 'HE_SHE',
      'his': 'HIS_HER',
      'hers': 'HIS_HERS',
      'her': 'HIM_HER_HIS',
      'him': 'HIM_HER',
      'himself': 'HIMSELF_HERSELF',
      'herself': 'HIMSELF_HERSELF',
      'he\'s': 'HES_SHES',
      'she\'s': 'HES_SHES',
  }

  # Apply the longest pronouns first. The apostrophe in "he's" is a word
  # boundary, so if the plain 'he'/'she' rules ran first they would rewrite
  # "he's" to "HE_SHE's" and the contraction rules could never match.
  # The placeholders themselves are never re-matched by a later rule because
  # the underscore counts as a word character for `\b`.
  for word in sorted(substitutions, key=len, reverse=True):
    pattern = r'\b' + re.escape(word) + r'\b'
    s = re.sub(pattern, substitutions[word], s, flags=re.IGNORECASE)

  return s

# Structured-output schema passed as `response_format` to the chat completion
# call in `modify_pronouns`; the rewritten sentence is read back from `text`.
# (Documented with comments rather than a class docstring so that nothing is
# added to the class itself.)
class TextResponse(pydantic.BaseModel):
    text: str  # the sentence returned by the model

def modify_pronouns(text):
  '''Rewrite a sentence so that its pronouns do not invent a gender.

  The gendered pronouns in `text` are first swapped for placeholders
  (HE_SHE etc.) by `ambiguate_gender`; the chat model is then asked to turn
  the placeholder version back into a readable sentence.

  Args:
    text: the English sentence to process.

  Returns:
    The sentence produced by the model. The original `text` is returned
    unchanged when it contains no pronouns to replace, when the model
    gives no parsed answer (any refusal message is printed), or when the
    request raises an exception (the exception and `text` are printed).
  '''

  placeholder_text = ambiguate_gender(text)

  # No pronoun was replaced, so there is nothing to ask the model about.
  if placeholder_text == text:
    return text

  system_prompt = """
**Task:** Correct the provided list of sentences by replacing placeholders
(HE_SHE, HIS_HER, HIM_HER_HIS) with appropriate pronouns, making it readable.
Try to infer the gender from the context, otherwise use "he/she", "they", or
"theirs" as appropriate. Also clean up any formatting or spelling issues.

**Examples:**
Input: HE_SHE gave HIM_HER_HIS pen to the girl so HE_SHE could do HIS_HER homework.
Output: He/she gave their pen to the girl so she could do her homework.
# 'the girl' inferred as female.

Input: The chairman said HE_SHE would call tomorrow.
Output: The chairman said he would call tomorrow.
# 'chairman' inferred as male.

Input: are u with HIM_HER?
Output: Are you with him/her?
# gender can't be inferred here.

Input: HIS_HER brother came home.
Output: His/her brother came home.
# gender can't be inferred here
"""

  try:
    reply = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": placeholder_text},
        ],
        response_format=TextResponse,
    ).choices[0].message

    if reply.parsed:
      return reply.parsed.text

    if reply.refusal:
      print(reply.refusal)
  except Exception as err:
    # Best effort: report the failure and keep the original sentence.
    print(err, text)

  return text

def modify_with_index(input):
  '''Run `modify_pronouns` on an indexed sentence, keeping the index.

  Lets results that arrive out of order be matched back to their row.

  Args:
    input: (index, text) tuple.

  Returns:
    (index, modified_text) tuple.
  '''
  (row_index, source_text) = input
  rewritten = modify_pronouns(source_text)
  return row_index, rewritten

In [None]:
# Try the helper on a single sentence; the cell output shows the returned text.
modify_pronouns('She was on the way to give birth.')

In [None]:
# Where the data is read from and where the processed copy is published.
SOURCE_REPO = 'Sunbird/external-translation-datasets'
SOURCE_CONFIG = 'mt560_ibo.parquet'
TARGET_REPO = 'jq/external-translation-data'
TARGET_CONFIG = 'mt560_ibo'
SPLIT = 'train'

# Load the chosen split into a DataFrame.
ds = datasets.load_dataset(SOURCE_REPO, SOURCE_CONFIG)
split = SPLIT
df = ds[split].to_pandas()

# A row needs converting when pronoun substitution changes its English text.
rows_to_alter = df['eng_text'].apply(ambiguate_gender).ne(df['eng_text'])
num_to_process = rows_to_alter.sum()
print(f"{num_to_process} of {len(df)} examples to be converted.")

In [None]:
# Pair each sentence with its row position: results come back out of order and
# must be written to the row they came from.
source_texts = list(df['eng_text'])
inputs = list(enumerate(source_texts))

# Each task is a network-bound API request, so worker threads provide the
# concurrency. Threads share the single `client` and the functions defined in
# this notebook directly, instead of forking the kernel into 50 processes
# (which requires picklable tasks and duplicates whatever state the client
# already holds at fork time).
with ThreadPool(50) as p:
  completions = list(
      tqdm(p.imap_unordered(modify_with_index, inputs), total=len(inputs)))

# Every row position occurs exactly once in `completions`, so each entry is
# overwritten below; start from the untouched source text.
completions_ordered = list(source_texts)

for i, text in completions:
  if text:
    completions_ordered[i] = text
  else:
    # Empty result: keep the original sentence. It is looked up by position,
    # matching how `i` was assigned, rather than by DataFrame index label.
    print(f'Falling back to eng_text for row {i}')
    completions_ordered[i] = source_texts[i]

df['eng_target_text'] = completions_ordered
df.rename(columns={'eng_text': 'eng_source_text'}, inplace=True)

In [None]:
# Show the rows whose English text was changed by pronoun substitution
# (`rows_to_alter` was computed when the dataset was loaded).
df[rows_to_alter]

In [None]:
# Convert the processed frame back into a dataset and publish it publicly
# under the target repo/config.
ds = datasets.Dataset.from_pandas(df)
ds.push_to_hub(
    TARGET_REPO, config_name=TARGET_CONFIG, private=False, split=SPLIT)

# Create instruction text

Use only the versions of datasets with corrected pronouns as above.

In [None]:
def row_to_instruction(row):
  '''Format one translation pair as an instruction example.

  Args:
    row: mapping with 'lug_text' (source sentence) and 'eng_target_text'
      (English target sentence) entries.

  Returns:
    A string with a "Translate to English" header followed by "# Input" and
    "# Output" sections.
  '''
  source_text = row['lug_text']
  target_text = row['eng_target_text']
  return (
      'Translate to English\n'
      '\n'
      '# Input\n'
      f'{source_text}\n'
      '\n'
      '# Output\n'
      f'{target_text}\n'
  )

In [None]:
repo = 'jq/external_mt_datasets_with_eng_target'

# SALT dataset
ds = datasets.load_dataset(
    'jq/salt_with_eng_target', 'text-all', split='train')
df = ds.to_pandas()
examples = list(df.apply(row_to_instruction, axis=1))
print(len(examples))

# External datasets. Each config appears exactly once: a repeated entry would
# be loaded again by the loop below and add the same examples a second time
# ('lafand-en-lug-combined.parquet' was previously listed twice).
configs = [
  'lafand-en-lug-combined.parquet',
  'ai4d.parquet',
  'bt_from-eng-google.parquet',
  'flores200.parquet',
  'mozilla_110.parquet',
  'tico19.parquet',
]

# Append the instruction text of every external config, printing the running
# total after each one.
for config in configs:
  ds = datasets.load_dataset(repo, config, split='train')
  df = ds.to_pandas()
  examples.extend(list(df.apply(row_to_instruction, axis=1)))
  print(len(examples))

In [None]:
# Collect all instruction strings into a single-column frame.
df = pd.DataFrame({'instruction': examples})

In [None]:
# Publish the instruction examples as a private dataset config.
ds = datasets.Dataset.from_pandas(df)
ds.push_to_hub(
    "Sunbird/sunflower-data",
    config_name='translation-to-eng-corrected',
    private=True,
    split='train',
)