In [1]:
pip install openai

Collecting openai
  Downloading openai-0.27.2-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.2
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import os
import openai
import pandas as pd

# Read the key from the OPENAI_API_KEY environment variable so it never has to
# be committed with the notebook; the original placeholder stays as a fallback
# so the cell behaves as before when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "OPEN_AI_KEY")

# Seed Generation

In [None]:
# Seed Generation


# German
def gen_seed_de():
  """Ask gpt-3.5-turbo for German seed words and return the distinct ones.

  One chat completion is requested per word class ('Substantive', then
  'Verben'); each reply is split on commas.

  Returns:
    numpy.ndarray: the distinct seed words in order of first appearance,
    with surrounding whitespace removed and empty entries dropped.
  """
  seeds = []
  for word_class in ('Substantive', 'Verben'):
    response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=[
          {"role": "system", "content": "Generieren Sie 600 einzigartige zufällige {}, die jeweils durch ein Komma getrennt sind".format(word_class)}
      ]
    )
    reply = response.choices[0].message.content
    # Strip every item: "a, b, a" would otherwise yield " a" next to "a" and
    # slip past the de-duplication below.
    seeds.extend(item.strip() for item in reply.split(','))
  # A trailing comma or an empty reply produces '' entries; they are not seeds.
  seeds = [seed for seed in seeds if seed]
  # A Series (instead of DataFrame(...)[0]) also copes with an empty list,
  # returning an empty array rather than raising KeyError.
  return pd.Series(seeds, dtype=object).unique()

# Galician
def gen_seed_gl():
  """Ask gpt-3.5-turbo for Galician seed words and return the distinct ones.

  One chat completion is requested per word class ('substantivos', then
  'verbos'); each reply is split on commas.

  Returns:
    numpy.ndarray: the distinct seed words in order of first appearance,
    with surrounding whitespace removed and empty entries dropped.
  """
  seeds = []
  for word_class in ('substantivos', 'verbos'):
    response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=[
          {"role": "system", "content": "Xera 600 {} aleatorios únicos separados por coma".format(word_class)}
      ]
    )
    reply = response.choices[0].message.content
    # Strip every item: "a, b, a" would otherwise yield " a" next to "a" and
    # slip past the de-duplication below.
    seeds.extend(item.strip() for item in reply.split(','))
  # A trailing comma or an empty reply produces '' entries; they are not seeds.
  seeds = [seed for seed in seeds if seed]
  # A Series (instead of DataFrame(...)[0]) also copes with an empty list,
  # returning an empty array rather than raising KeyError.
  return pd.Series(seeds, dtype=object).unique()

In [None]:
# Create the output directory before calling the API so a missing folder
# cannot make to_csv fail after the generation requests were already made.
os.makedirs('synthetic_data/seeds', exist_ok=True)

# Generate the German seed words and persist them; the CSV keeps the default
# index and the column header '0'.
seeds_de = pd.DataFrame(gen_seed_de())
print('German seeds: ', len(seeds_de))
seeds_de.to_csv('synthetic_data/seeds/seeds_de.csv')

German seeds:  1043


In [8]:
# Create the output directory before calling the API so a missing folder
# cannot make to_csv fail after the generation requests were already made.
os.makedirs('synthetic_data/seeds', exist_ok=True)

# Generate the Galician seed words and persist them; the CSV keeps the default
# index and the column header '0'.
seeds_gl = pd.DataFrame(gen_seed_gl())
print('Galician seeds: ', len(seeds_gl))
seeds_gl.to_csv('synthetic_data/seeds/seeds_gl.csv')

Galician seeds:  1094


# Sentence Generation

In [3]:
# Load seeds

# Reload the seed words from disk. The first CSV column is used as the row
# index, which leaves the seed words in the column labelled '0'.
seed_csv_options = {'index_col': 0}
seeds_de = pd.read_csv('synthetic_data/seeds/seeds_de.csv', **seed_csv_options)
seeds_gl = pd.read_csv('synthetic_data/seeds/seeds_gl.csv', **seed_csv_options)

In [4]:
# Generate sentences

# German
def sentence_de_openai(key):
  """Generate German sentences for one seed word via gpt-3.5-turbo.

  Args:
    key: seed word, sent as the user message.

  Returns:
    numpy.ndarray: the distinct sentences from the reply in order of first
    appearance, with surrounding whitespace removed and empty fragments
    dropped.

  Side effects:
    Adds the reply's completion token count to the module-level
    `de_tokens`, which must already be defined.
  """
  global de_tokens
  response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "Generieren Sie an der Eingabeaufforderung 100 separate Sätze, die durch ein Semikolon getrennt sind"},
        {"role": "user", "content": key},
        {"role": "assistant", "content": "Gärten und Terrassen;Tacos sind gut.;"}
    ]
  )
  de_tokens += int(response['usage']['completion_tokens'])
  reply = response.choices[0].message.content
  # Trim each fragment so " Satz" and "Satz" count as the same sentence, and
  # drop the empty fragments left by a trailing ';' so they neither inflate
  # the caller's length check nor end up as blank rows in the corpus.
  sentences = [fragment.strip() for fragment in reply.split(';')]
  sentences = [sentence for sentence in sentences if sentence]
  return pd.Series(sentences, dtype=object).unique()

# Galician
def sentence_gl_openai(key):
  """Generate Galician sentences for one seed word via gpt-3.5-turbo.

  Args:
    key: seed word, sent as the user message.

  Returns:
    numpy.ndarray: the distinct sentences from the reply in order of first
    appearance, with surrounding whitespace removed and empty fragments
    dropped.

  Side effects:
    Adds the reply's completion token count to the module-level
    `gl_tokens`, which must already be defined.
  """
  global gl_tokens
  response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "Xera 100 frases individuais, separadas só por un punto e coma, desde o indicador"},
        {"role": "user", "content": key},
        {"role": "assistant", "content": "Éste é o problema.;Volvín á sala de chat.;"}
    ]
  )
  gl_tokens += int(response['usage']['completion_tokens'])
  reply = response.choices[0].message.content
  # Trim each fragment so " frase" and "frase" count as the same sentence, and
  # drop the empty fragments left by a trailing ';' so they neither inflate
  # the caller's length check nor end up as blank rows in the corpus.
  sentences = [fragment.strip() for fragment in reply.split(';')]
  sentences = [sentence for sentence in sentences if sentence]
  return pd.Series(sentences, dtype=object).unique()

In [None]:
# Completion-token counter updated by sentence_de_openai.
de_tokens = 0
batches = []        # one DataFrame of sentences per accepted seed
failed_seeds = []   # seeds whose API request raised

# Make sure the output directory exists before spending API calls.
os.makedirs('synthetic_data/raw', exist_ok=True)

# Run the generation function on each seed
for word in seeds_de['0']:
    try:
        # The API request is the step that can actually fail (rate limits,
        # timeouts, invalid seed), so it is what the try guards; a failing
        # seed is skipped without losing what was collected so far.
        new_s = pd.DataFrame(sentence_de_openai(word))
    except Exception:
        failed_seeds.append(word)
        continue
    # Ignore replies that produced 10 or fewer sentences.
    if len(new_s) > 10:
        batches.append(new_s)

if failed_seeds:
    print('Seeds skipped after errors: ', len(failed_seeds))

# delete duplicates, shuffle
if batches:
    # Concatenate once instead of growing a DataFrame inside the loop.
    ret = pd.concat(batches, axis=0)
    ret = pd.DataFrame(ret[0].unique()).sample(frac=1)
else:
    # Nothing was collected: write an empty file rather than failing on ret[0].
    ret = pd.DataFrame()
ret.to_csv('synthetic_data/raw/de-en.de', header=False, index=False)

In [None]:
# Completion-token counter updated by sentence_gl_openai.
gl_tokens = 0
batches = []        # one DataFrame of sentences per accepted seed
failed_seeds = []   # seeds whose API request raised

# Make sure the output directory exists before spending API calls.
os.makedirs('synthetic_data/raw', exist_ok=True)

# Run the generation function on each seed
for word in seeds_gl['0']:
    try:
        # The API request is the step that can actually fail (rate limits,
        # timeouts, invalid seed), so it is what the try guards; a failing
        # seed is skipped without losing what was collected so far.
        new_s = pd.DataFrame(sentence_gl_openai(word))
    except Exception:
        failed_seeds.append(word)
        continue
    # Ignore replies that produced 10 or fewer sentences.
    if len(new_s) > 10:
        batches.append(new_s)

if failed_seeds:
    print('Seeds skipped after errors: ', len(failed_seeds))

# delete duplicates, shuffle
if batches:
    # Concatenate once instead of growing a DataFrame inside the loop.
    ret = pd.concat(batches, axis=0)
    ret = pd.DataFrame(ret[0].unique()).sample(frac=1)
else:
    # Nothing was collected: write an empty file rather than failing on ret[0].
    ret = pd.DataFrame()
ret.to_csv('synthetic_data/raw/gl-en.gl', header=False, index=False)

# Translation

In [3]:
# Translate from German
def translate_de(string):
  """Translate a German text to English with gpt-3.5-turbo.

  Args:
    string: German text, sent as the user message.

  Returns:
    The message content of the first choice in the model's reply.
  """
  conversation = [
      {"role": "system", "content": "Translate from German to English"},
      {"role": "user", "content": string},
  ]
  completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=conversation)
  first_choice = completion.choices[0]
  return first_choice.message.content


# Translate from Galician
def translate_gl(string):
  """Translate a Galician text to English with gpt-3.5-turbo.

  Args:
    string: Galician text, sent as the user message.

  Returns:
    The message content of the first choice in the model's reply.
  """
  conversation = [
      {"role": "system", "content": "Translate from Galician to English"},
      {"role": "user", "content": string},
  ]
  completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=conversation)
  first_choice = completion.choices[0]
  return first_choice.message.content

In [None]:
# Translate every German sentence (column 0) into English and write the
# translations, one row per source row, to the matching .en file.
df = pd.read_csv('synthetic_data/train/de-en.de', header=None)
print(len(df))
# Start from empty strings so a row whose translation fails still holds a
# string; the previous int placeholder made the newline clean-up below crash.
df.insert(1, 'eng', [''] * len(df))
eng_col = df.columns.get_loc('eng')
for i in range(len(df)):
    try:
        # Single positional assignment on the frame itself; the chained form
        # df['eng'].iloc[i] = ... may write to a temporary copy.
        df.iloc[i, eng_col] = translate_de(df[0].iloc[i])
        print(df.iloc[i])
    except Exception:
        # Leave the empty string in place and report the failing row index.
        print(i)
# remove newline chars
df['eng'] = df['eng'].str.replace('\n', ' ', regex=False)
pd.DataFrame(df['eng']).to_csv('synthetic_data/train/de-en.en', header=False, index=False)

In [None]:
# Translate every Galician sentence (column 0) into English and write the
# translations, one row per source row, to the matching .en file.
df = pd.read_csv('synthetic_data/train/gl-en.gl', header=None)
# Start from empty strings so a row whose translation fails still holds a
# string; the previous int placeholder made the newline clean-up below crash.
df.insert(1, 'eng', [''] * len(df))
print(len(df))
eng_col = df.columns.get_loc('eng')
for i in range(len(df)):
    try:
        # Single positional assignment on the frame itself; the chained form
        # df['eng'].iloc[i] = ... may write to a temporary copy.
        df.iloc[i, eng_col] = translate_gl(df[0].iloc[i])
        print(df.iloc[i])
    except Exception:
        # Leave the empty string in place and report the failing row index.
        print(i)

# remove newline chars
df['eng'] = df['eng'].str.replace('\n', ' ', regex=False)
pd.DataFrame(df['eng']).to_csv('synthetic_data/train/gl-en.en', header=False, index=False)