# Creating an Eng-to-French dictionary from Wiktionary data

This notebook shows how the en-fr-wiktionary.dict file (downloaded from https://en.wiktionary.org/wiki/User:Matthias_Buchmeier#English-French) was wrangled into a pandas dataframe and used with the Levenshtein module to create a dictionary object (saved with the pickle module) for use by the L1SimilarityFeature class.

In [1]:
import pickle
import Levenshtein
import pandas as pd
import re

In [2]:
#https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
def save_obj(obj, name ):
    with open(r'C:/Users/rowena/Documents/MSC/Project/obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open(r'C:/Users/rowena/Documents/MSC/Project/obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [3]:
#import french_dict and whip it into shape
with open('/Users/rowena/Documents/MSC/Project/FrenchCorpus/dictionary/en-fr-enwiktionary.dict', 'r', encoding='utf-8') as myfile:
    french_dict = myfile.read().splitlines()

#trim the beginning of the file
french_dict=[item for item in french_dict[23:]]

#create a list so that every list item is a pair of tuples, first tuple being english entry, second tuple being its french trans
frenchdict_list=[]
for i in range(0, len(french_dict), 2):
    if i+2<=len(french_dict):
        frenchdict_list.append((french_dict[i], french_dict[i + 1]))

#use a dataframe to finish cleaning up the dictionary
french_frame = pd.DataFrame(frenchdict_list) #make df from eng-->fr dict
french_frame.columns=['eng', 'fr'] #rename the two columns
french_frame['eng_tag']= "" #add col to hold <v> <n> etc
french_frame['eng_det']="" #add col to hold SEE: etc
french_frame['fr_tosee']="" #add col to hold the single word we should SEE for this entry
french_frame = french_frame[['eng', 'eng_tag', 'eng_det', 'fr','fr_tosee']] #reorder the cols
french_frame = french_frame.apply(lambda x: x.str.strip() if x.dtype == "object" else x) #strip leading/trailing whitespace
        
#extract the <v> <article> etc tag and put in separate column
for index, row in french_frame.iterrows():
    tagstart=row['eng'].find('<')
    tagend=row['eng'].find('>')+1
    row['eng_det']=row['eng'][tagend:]
    row["eng_tag"] = row['eng'][tagstart:tagend]
    row["eng"] = row['eng'][:tagstart]

In [4]:
#Some entries' french translation field actually contains an instruction to 'SEE' (another entry). We deal with that here
#by copying over such translations into the correct field.

#make a dictionary to perform 'SEE' mappings (initial loop)
french_frame_engfr=french_frame[['eng', 'fr']]
french_frame_engfr = french_frame_engfr.apply(lambda x: x.str.strip() if x.dtype == "object" else x) #strip leading/trailing whitespace
mydict = french_frame_engfr.set_index('eng').to_dict()
mydict = mydict['fr']

#copy over mappings (initial loop)
for index, row in french_frame.iterrows():
    if 'SEE:' in row['eng_det']:
        seepos = row['eng_det'].find('SEE:')+6
        row['fr_tosee']=row['eng_det'][seepos:-1]
        
french_frame['fr_seen'] = french_frame['fr_tosee'].map(mydict)

for index, row in french_frame.iterrows():
    if 'SEE:' in row['eng_det']:
        row['fr'] = row['fr_seen']
    row['fr'] = re.sub('<.*?>', '', row['fr'])

In [5]:
#after copying over the definitions from the SEE references, there are still 343 rows with blank translations
blank_fr=0
for index, row in french_frame.iterrows():
    if row['fr']=='':
        blank_fr+=1
blank_fr

343

In [6]:
#make a dictionary to perform 'SEE' mappings (second loop)
french_frame_engfr=french_frame[['eng', 'fr']]
french_frame_engfr = french_frame_engfr.apply(lambda x: x.str.strip() if x.dtype == "object" else x) #strip leading/trailing whitespace
mydict = french_frame_engfr.set_index('eng').to_dict()
mydict = mydict['fr']

french_frame['fr_seen'] = french_frame['fr_tosee'].map(mydict)

#copy over mappings (second loop)
for index, row in french_frame.iterrows():
    if row['fr']=='':
        row['fr'] = row['fr_seen']
    ##deleted re row

In [7]:
#after copying over the definitions from the second round SEE references, there are still 15 rows with blank translations
blank_fr=0
for index, row in french_frame.iterrows():
    if row['fr']=='':
        blank_fr+=1
blank_fr

15

In [7]:
#make a dictionary to perform 'SEE' mappings (third loop)
french_frame_engfr=french_frame[['eng', 'fr']]
french_frame_engfr = french_frame_engfr.apply(lambda x: x.str.strip() if x.dtype == "object" else x) #strip leading/trailing whitespace
mydict = french_frame_engfr.set_index('eng').to_dict()
mydict = mydict['fr']

french_frame['fr_seen'] = french_frame['fr_tosee'].map(mydict)

#copy over mappings (third loop)
for index, row in french_frame.iterrows():
    if row['fr']=='':
        row['fr'] = row['fr_seen']
    row['fr'] = re.sub('<.*?>', '', str(row['fr'])).strip()
    ##deleted re row

In [8]:
#after copying over the definitions from the third round SEE references, there no blank definitions left
blank_fr=0
for index, row in french_frame.iterrows():
    if row['fr']=='':
        blank_fr+=1
blank_fr

0

In [9]:
#get rid of unwanted columns
french_frame = french_frame.drop(columns=['fr_tosee', 'fr_seen'])

In [10]:
#deal with some anomaly rows that use square brackets and commas oddly, noticed during testing
pattern = re.compile(r'\[[^)]*\]')
for index, row in french_frame.iterrows():
    if str(row['fr']).startswith("[1,"):
        print('old value: ', row['eng'], row['fr'])
        if str(row['eng']).startswith("answer"):
            row['fr']=re.sub(pattern, '', str(row['fr'])).strip()
        if str(row['eng']).startswith("concentrate"):
            row['fr']='concentrer, se concentrer'
        if str(row['eng']).startswith("deracinate"):
            row['fr']=re.sub(pattern, '', str(row['fr'])).strip()
        if str(row['eng']).startswith("fur"):
            row['fr']=re.sub(pattern, '', str(row['fr'])).strip()
        if str(row['eng']).startswith("linear"):
            row['fr']=re.sub(pattern, '', str(row['fr'])).strip()
        if str(row['eng']).startswith("Persian"):
            row['fr']=re.sub(pattern, '', str(row['fr'])).strip()
        if str(row['eng']).startswith("try"):
            row['fr']="essai, dégustation"
        if str(row['eng']).startswith("we"):
            row['fr']=" nous, on"
        print('new value: ', row['eng'], row['fr'])

old value:  answer  [1, 2] répondre
new value:  answer  répondre
old value:  concentrate  [1,2] concentrer, [3] se concentrer
new value:  concentrate  concentrer, se concentrer
old value:  crack  [1,3] essai , [2] dégustation
new value:  crack  [1,3] essai , [2] dégustation
old value:  deracinate  [1, 2] déraciner
new value:  deracinate  déraciner
old value:  fur  [1, 3] pelage
new value:  fur  pelage
old value:  linear  [1,2] linéaire
new value:  linear  linéaire
old value:  Persian  [1,2] Persan
new value:  Persian  Persan
old value:  return  [1, 2] répondre
new value:  return  [1, 2] répondre
old value:  try  [1,3] essai , [2] dégustation
new value:  try  essai, dégustation
old value:  we  [1,2] nous [formal], on [informal]
new value:  we   nous, on


In [11]:
#we observe the number of entries in the dictionary
len(french_frame)

72827

In [12]:
#split rows containing multiple possible translations for their entry, so that each translation is in  separate column.
#The maximum number of translations for an entry is 24
french_frame_2 = french_frame.join(french_frame['fr'].str.split(',', expand=True).add_prefix('fr_alt'))
print('total rows in dataframe is ', len(french_frame_2))
for i in range(25):
    name=str('fr_alt'+str(i))
    print(name, 'has', french_frame_2[name].isnull().sum(), ' blank rows')

total rows in dataframe is  72827
fr_alt0 has 0  blank rows
fr_alt1 has 56133  blank rows
fr_alt2 has 68317  blank rows
fr_alt3 has 71305  blank rows
fr_alt4 has 72247  blank rows
fr_alt5 has 72557  blank rows
fr_alt6 has 72706  blank rows
fr_alt7 has 72758  blank rows
fr_alt8 has 72798  blank rows
fr_alt9 has 72808  blank rows
fr_alt10 has 72816  blank rows
fr_alt11 has 72818  blank rows
fr_alt12 has 72820  blank rows
fr_alt13 has 72822  blank rows
fr_alt14 has 72824  blank rows
fr_alt15 has 72824  blank rows
fr_alt16 has 72825  blank rows
fr_alt17 has 72825  blank rows
fr_alt18 has 72825  blank rows
fr_alt19 has 72825  blank rows
fr_alt20 has 72826  blank rows
fr_alt21 has 72826  blank rows
fr_alt22 has 72826  blank rows
fr_alt23 has 72826  blank rows
fr_alt24 has 72826  blank rows


In [13]:
# IMPROVEMENT! Some translations contain supplementary information in brackets that is not part of the translation.
# Remove any text within brackets from any given definition
pattern = re.compile(r'\([^)]*\)')
pattern2 = re.compile(r'\[[^)]*\]')
for index, row in french_frame_2.iterrows():
    for i in range(25):
        name=str('fr_alt'+str(i))
        text= str(row[name])
        row[name]=re.sub(pattern, '', text).strip()
        text= str(row[name])
        row[name]=re.sub(pattern2, '', text).strip()

In [15]:
#loops through the 24 possible translations of each row, and preserves that which has the lowest 
#Levenshtein distance from its english 
french_frame_2['final']=""
for index, row in french_frame_2.iterrows():
    row['final']=(row['fr_alt0'].strip(), Levenshtein.ratio(row['eng'].strip(), str(row['fr_alt0'].strip())))
    for i in range(25):
        name=str('fr_alt'+str(i))
        if row[name]!="None":
            currentcalc=(str(row[name]).strip(), Levenshtein.ratio(str(row['eng'].strip()), str(row[name]).strip()))
            if row['final'][1]<currentcalc[1] :
                row['final']=currentcalc

In [16]:
#invert the Levenshtein score so that the least similar words have the highest score for the feature
french_frame_2['final_inverted']=''
for index, row in french_frame_2.iterrows():
    row['final_inverted']=(row['final'][0], 1-row['final'][1])

In [17]:
#turns the english entry for any MWE into tuples
for index, row in french_frame_2.iterrows():
    row['eng'] = row['eng'].strip()
    if ' ' in row['eng']:
        row['eng']=tuple(row['eng'].split(' '))

### Simple Dictionary Creation (no POS tagging)

In [18]:
%%time
#creates a dictionary var from the frame. Where there are multiple entries for the same english word,
#only the entry with the lowest(in fact highest as inverted) Levenshtein distance from its translation is kept
blank_dict=[]

for index, row in french_frame_2.iterrows():
    if type(row['eng'])!=tuple:
        row['eng'] = row['eng'].strip()
    if row['eng'] not in [item[0] for item in blank_dict]:
        blank_dict.append((row['eng'], row['final_inverted']))
    elif dict(blank_dict)[row['eng']][1]>row['final_inverted'][1]:
        blank_dict=[item for item in blank_dict if item[0] !=row['eng']]
        blank_dict.append((row['eng'], row['final_inverted']))

Wall time: 23min 51s


In [22]:
blank_dict=dict(blank_dict)
save_obj(blank_dict, 'french_dict_simple')

### Complex Dictionary Creation (with POS tagging)

In [23]:
#create a column which holds the eng + its p.o.s tag
french_frame_2['eng_and_tag']=''
for index, row in french_frame_2.iterrows():
    if type(row['eng'])!=tuple:
        row['eng'] = row['eng'].strip() 
    row['eng_and_tag'] = (row['eng'], row['eng_tag'])

In [24]:
%%time
#creates a dictionary var from the frame. Where there are multiple entries for the same english word + POS tag,
#only the entry with the lowest(in fact highest as inverted) Levenshtein distance from its translation is kept
blank_dict2=[]

for index, row in french_frame_2.iterrows():
    if row['eng_and_tag'] not in [item[0] for item in blank_dict2]:
        blank_dict2.append((row['eng_and_tag'], row['final_inverted']))
    elif dict(blank_dict2)[row['eng_and_tag']][1]>row['final_inverted'][1]:
        blank_dict2=[item for item in blank_dict2 if item[0] !=row['eng_and_tag']]
        blank_dict2.append((row['eng_and_tag'], row['final_inverted']))

Wall time: 26min 53s


In [28]:
blank_dict2=dict(blank_dict2)

In [29]:
save_obj(blank_dict2, 'french_dict_pos')