In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import re

# Functions

In [2]:
def split_1(text_list):
    """Expand gender shorthand in candidate translations into full sentences.

    Each string in ``text_list`` is checked, in order, against the notations
    below; the first one that matches decides how the string is split into a
    masculine/base form and a feminine form:

    * ``.../in`` (German): "Angolaner/in" -> "Angolaner", "Angolanerin"
    * ``... / Ich ...`` (German): two full sentences separated by " / "
    * ``...o/a`` (Spanish, Italian): "afgano/a" -> "afgano", "afgana"
    * ``...s/a`` within the last five characters (Spanish):
      "albanés/a" -> "albanés", "albanésa"
    * ``A / B`` (German): the word before the slash is swapped for the text
      after it
    * ``A/B`` (German, French, Spanish): the word before the slash is swapped
      for the text after the last slash
    * ``(e)`` / ``(ne)`` (French): "Canadien(ne)" -> "Canadien", "Canadienne"

    Strings matching none of these are kept unchanged. Finally, every
    resulting string that does not already end with a period gets one.

    Args:
        text_list: iterable of strings to expand.

    Returns:
        list[str]: flat list of the expanded sentences, in input order.
    """
    result = []
    for text in text_list:
        # German suffix shorthand. The word boundary keeps this from firing on
        # full-word alternatives such as "indien/indienne", which belong to
        # the generic slash branch further down.
        if re.search(r'/in\b', text):
            base = re.sub(r'/in\b', '', text)
            feminine = re.sub(r'/in\b', 'in', text)
            result.extend([base, feminine])
        # German: two complete sentences separated by a spaced slash
        elif '/ Ich' in text:
            result.extend(text.split(' / '))
        # Spanish, Italian suffix shorthand. Same boundary rule, so that
        # "afgano/afgana" is not mangled into "afganofgana".
        elif re.search(r'o/a\b', text):
            masculine = re.sub(r'o/a\b', 'o', text)
            feminine = re.sub(r'o/a\b', 'a', text)
            result.extend([masculine, feminine])
        # Spanish: consonant-final stem with "/a" at the end of the sentence
        elif 's/a' in text[-5:]:
            base = text.replace('/a', '')
            feminine = text.replace('/', '')
            result.extend([base, feminine])
        # German: "<stem> <masc> / <fem>" with a free-standing slash token
        elif ' / ' in text:
            tokens = text.split()
            slash = tokens.index('/')
            masculine = ' '.join(tokens[:slash])
            # Drop the masculine word (the token just before the slash) and
            # continue with everything after the slash. Joining one list avoids
            # a leading space when the stem is empty.
            feminine = ' '.join(tokens[:max(slash - 1, 0)] + tokens[slash + 1:])
            result.extend([masculine, feminine])
        # German, French, Spanish: "<stem> <masc>/<fem>"
        elif '/' in text:
            parts = text.split('/')
            stem = parts[0].split()
            masculine = parts[0].rstrip()
            feminine = ' '.join(stem[:-1] + [parts[-1].strip()])
            result.extend([masculine, feminine])
        # French: optional feminine ending in parentheses
        elif '(e)' in text or '(ne)' in text:
            base = text.replace('(e)', '').replace("(ne)", "")
            feminine = text.replace('(', '').replace(')', '')
            result.extend([base, feminine])
        else:
            # No recognised shorthand: keep the original text
            result.append(text)

    # Terminate every sentence with exactly one period. Testing the end of the
    # string (rather than the mere presence of '.') means an abbreviation such
    # as "St." no longer suppresses the final period, and an existing period
    # is never doubled.
    return [s if s.rstrip().endswith('.') else s + '.' for s in result]

# Register tqdm's `progress_apply` on pandas objects; feature_extraction relies on it.
tqdm.pandas()

In [3]:
def feature_extraction(df):
    """Turn raw model replies into a list of candidate translations per row.

    For every value of the ``output`` column:

    1. collect all double-quoted spans, falling back to the whole reply when
       there are none;
    2. drop spans containing "I am" (the English source sentence echoed back
       by the model);
    3. expand gender shorthand with ``split_1``.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: DataFrame with a string column ``output``.

    Returns:
        A new DataFrame without ``output`` and with an ``extracted`` column
        holding a list of strings per row.

    Note:
        ``progress_apply`` is only available after ``tqdm.pandas()`` has been
        called.
    """
    candidates = df['output'].apply(lambda x: re.findall(r'"([^"]*)"', x) or [x])
    # NOTE(review): the whole-reply fallback above is applied before this
    # filter, so a row whose only quoted span is the English sentence ends up
    # with an empty list.
    candidates = candidates.apply(lambda lst: [x for x in lst if "I am" not in x])
    extracted = candidates.progress_apply(split_1)
    # Build the result on a new frame so the caller's DataFrame does not
    # silently gain an `extracted` column.
    return df.drop(columns=['output']).assign(extracted=extracted)

# GPT-3.5-turbo

## German

In [4]:
# Load the raw GPT-3.5-turbo outputs for German and preview the first rows.
deu_3 = pd.read_csv('output/gpt/deu_3.5-turbo_exp.csv')
deu_3.head()

Unnamed: 0,eng,deu_m,deu_f,deu_n,output
0,I am Afghan.,Ich bin Afghane.,Ich bin Afghanin.,,"Sure! The translation of ""I am Afghan"" into Ge..."
1,I am Angolan.,Ich bin Angolaner.,Ich bin Angolanerin.,,"Sure! The translation of ""I am Angolan"" into G..."
2,I am Albanian.,Ich bin Albaner.,Ich bin Albanerin.,,"Sure! The translation of ""I am Albanian"" into ..."
3,I am Andorran.,Ich bin Andorraner.,Ich bin Andorranerin.,,"Sure! The translation of ""I am Andorran"" into ..."
4,I am Emirati.,,,,"Sure! The translation of ""I am Emirati"" into G..."


In [5]:
# Extract the candidate translations from the German outputs and save them for evaluation.
# NOTE(review): the result is bound to the generic name `df`, whereas the other
# language sections use `fra` / `spa` / `it`.
df = feature_extraction(deu_3)
df.to_csv('output/ready_for_evaluation/gpt-3.5-turbo/deu.csv', index=False)

  0%|          | 0/193 [00:00<?, ?it/s]

In [6]:
# Preview the processed German frame (the `extracted` column replaces `output`).
df.head()

Unnamed: 0,eng,deu_m,deu_f,deu_n,extracted
0,I am Afghan.,Ich bin Afghane.,Ich bin Afghanin.,,[Ich bin Afghane.]
1,I am Angolan.,Ich bin Angolaner.,Ich bin Angolanerin.,,[Ich bin Angolaner.]
2,I am Albanian.,Ich bin Albaner.,Ich bin Albanerin.,,[Ich bin Albaner.]
3,I am Andorran.,Ich bin Andorraner.,Ich bin Andorranerin.,,[Ich bin Andorraner.]
4,I am Emirati.,,,,[Ich bin Emirati.]


## French

In [7]:
# Load the raw GPT-3.5-turbo outputs for French and preview the first rows.
fra_3 = pd.read_csv('output/gpt/fra_3.5-turbo_exp.csv')
fra_3.head()

Unnamed: 0,eng,fra_m,fra_f,fra_n,output
0,I am Afghan.,Je suis Afghan.,Je suis Afghane.,,"Certainly! The translation of ""I am Afghan"" in..."
1,I am Angolan.,Je suis Angolais.,Je suis Angolaise.,,"Certainly! The translation of ""I am Angolan"" i..."
2,I am Albanian.,Je suis Albanais.,Je suis Albanaise.,,"Certainly! The translation of ""I am Albanian"" ..."
3,I am Andorran.,Je suis Andorran.,Je suis Andorrane.,,"Certainly! The translation of ""I am Andorran"" ..."
4,I am Emirati.,Je suis Emirien.,Je suis Emirienne.,,"Certainly! The translation of ""I am Emirati"" i..."


In [8]:
# Extract the candidate translations from the French outputs and save them for evaluation.
fra = feature_extraction(fra_3)
fra.to_csv('output/ready_for_evaluation/gpt-3.5-turbo/fra.csv', index=False)

  0%|          | 0/193 [00:00<?, ?it/s]

In [9]:
# Spot-check five rows; the fixed seed keeps the displayed sample stable
# across "Restart & Run All".
fra.sample(5, random_state=0)

Unnamed: 0,eng,fra_m,fra_f,fra_n,extracted
65,I am Gambian.,Je suis Gambien.,Je suis Gambienne.,,[Je suis Gambien.]
167,I am Chadian.,Je suis Tchadien.,Je suis Tchadienne.,,[Je suis tchadien.]
21,I am Belizean.,Je suis Bélizien.,Je suis Bélizienne.,,"[Je suis Bélizien, Je suis Bélizienne]"
187,I am Ni-Vanuatu.,Je suis Vanuatuan.,Je suis Vanuatuane.,,[Je suis Ni-Vanuatu.]
29,I am Canadian.,Je suis Canadien.,Je suis Canadienne.,,[Je suis Canadien.]


## Spanish

In [4]:
# Load the raw GPT-3.5-turbo outputs for Spanish and preview the first rows.
spa_3 = pd.read_csv('output/gpt/spa_3.5-turbo_exp.csv')
spa_3.head()

Unnamed: 0,eng,spa_m,spa_f,spa_n,output
0,I am Afghan.,Soy afgano.,Soy afgana.,,"Sure! The translation of ""I am Afghan"" into Sp..."
1,I am Angolan.,Soy angoleño.,Soy angoleña.,,"Sure! The translation of ""I am Angolan"" into S..."
2,I am Albanian.,Soy albanés.,Soy albanésa.,,"Sure! The translation of ""I am Albanian"" into ..."
3,I am Andorran.,Soy andorrano.,Soy andorrana.,,"Sure! The translation of ""I am Andorran"" into ..."
4,I am Emirati.,,,Soy emiratí.,"Sure! The translation of ""I am Emirati"" into S..."


In [5]:
# Extract the candidate translations from the Spanish outputs and save them for evaluation.
spa = feature_extraction(spa_3)
spa.to_csv('output/ready_for_evaluation/gpt-3.5-turbo/spa.csv', index=False)

  0%|          | 0/193 [00:00<?, ?it/s]

## Italian

In [6]:
# Load the raw GPT-3.5-turbo outputs for Italian and preview the first rows.
it_3 = pd.read_csv('output/gpt/it_3.5-turbo_exp.csv')
it_3.head()

Unnamed: 0,eng,it_m,it_f,it_n,output
0,I am Afghan.,Sono afghano.,Sono afghana.,,"Sure! The translation of ""I am Afghan"" into It..."
1,I am Angolan.,Sono angolano.,Sono angolana.,,"Sure! The translation of ""I am Angolan"" into I..."
2,I am Albanian.,,,Sono albanese.,"Sure! The translation of ""I am Albanian"" into ..."
3,I am Andorran.,Sono andorrano.,Sono andorrana.,,"Sure! The translation of ""I am Andorran"" into ..."
4,I am Emirati.,Sono emiratino.,Sono emiratina.,,"Sure! The translation of ""I am Emirati"" into I..."


In [8]:
# Extract the candidate translations from the Italian outputs and save them for evaluation.
it = feature_extraction(it_3)
it.to_csv('output/ready_for_evaluation/gpt-3.5-turbo/it.csv', index=False)

  0%|          | 0/193 [00:00<?, ?it/s]