In [1]:
import sys, os, getopt, re
from functools import wraps
from glob import glob
import pickle
import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd

# Latin Data Splits and Verb Suffixing Corrections
This project was broken up into two main parts: creating the data splits, and editing the baseline code to more accurately predict the morphological inflections of Latin words.

## A. Splitting the Data
Since Latin had no splits, our group needed to manually make the train, test, and dev sets. This task alone involves many factors that must be considered to avoid artificially inflating the accuracy of our code. Additionally, we started with nearly one million lines of data, which needed to be cut down.

There are many factors to keep in mind while splitting the data. Below are the two main factors we focused on:

#### Size of the data

The size of the training data correlates with the accuracy of the code (Kodner et al. 2023). By cutting down the data, we both avoid artificially raised accuracy and reduce runtime.

#### Splitting-by-form vs. Splitting-by-lemma

Splitting-by-form takes random lines from the data and adds them to the train, test, or dev set. This can let the code "cheat" if it is trained on a lemma that also shows up, in another form, in the test or dev set. In contrast, if you split by lemma (where you take every inflection of a single lemma and put them all in one of the splits), you avoid this overlap. This gives us a better idea of the accuracy of the code in a more realistic situation where the code only sees the lemma and the MSD (Goldman et al. 2022).

In [2]:
def hamming(s,t):
    """Count the positions at which s and t hold different items.

    Only the overlapping part is compared: zip stops at the shorter input,
    so surplus trailing items of the longer one are ignored.
    """
    mismatches = 0
    for left, right in zip(s, t):
        if left != right:
            mismatches += 1
    return mismatches

In [3]:
def halign(s,t):
    """Slide s against t and return the offset pairing with the fewest mismatches.

    Both strings are padded with '_' to a common length of len(s) + len(t).
    Every padded pairing is scored with hamming(); the first pairing that
    reaches the lowest score wins. Columns where both strings hold '_' are
    dropped before the two aligned strings are returned as (newin, newout).
    """
    candidates = []

    # s moves left-to-right over a t that is pinned to the right edge
    for shift in range(len(t) + 1):
        top = '_' * shift + s + (len(t) - shift) * '_'
        bottom = len(s) * '_' + t
        candidates.append((top, bottom))

    # s is pinned to the right edge while t moves right-to-left
    for shift in range(len(s) + 1):
        top = len(t) * '_' + s
        bottom = (len(s) - shift) * '_' + t + '_' * shift
        candidates.append((top, bottom))

    # min() keeps the earliest candidate among equal scores
    best_top, best_bottom = min(candidates, key = lambda pair: hamming(pair[0], pair[1]))

    kept = [(a, b) for a, b in zip(best_top, best_bottom) if not (a == '_' and b == '_')]
    newin = ''.join(a for a, _ in kept)
    newout = ''.join(b for _, b in kept)
    return newin, newout

In [4]:
def levenshtein(s, t, inscost = 1.0, delcost = 1.0, substcost = 1.0):
    """Recursive implementation of Levenshtein, with alignments returned.

    Parameters:
        s, t      -- the two strings to align.
        inscost   -- cost of a column where s has a gap ('_') opposite a symbol of t.
        delcost   -- cost of a column where t has a gap ('_') opposite a symbol of s.
        substcost -- cost of a column pairing two different symbols.

    Returns a tuple (aligned_s, aligned_t, cost). The two aligned strings
    have equal length, with '_' marking a gap.

    The inner recursion is wrapped with memolrec at call time, so every call
    to levenshtein gets its own memo table.
    """
    @memolrec
    def lrec(spast, tpast, srem, trem, cost):
        if len(srem) == 0:
            # s is used up: each remaining symbol of t is an insertion.
            # Charge inscost per symbol so custom costs are honoured.
            return spast + len(trem) * '_', tpast + trem, '', '', cost + len(trem) * inscost
        if len(trem) == 0:
            # t is used up: each remaining symbol of s is a deletion.
            return spast + srem, tpast + len(srem) * '_', '', '', cost + len(srem) * delcost

        addcost = 0
        if srem[0] != trem[0]:
            addcost = substcost

        # Try match/substitute, insert, and delete; min() keeps the first of
        # equally cheap options, i.e. match/substitute before insert before delete.
        return min((lrec(spast + srem[0], tpast + trem[0], srem[1:], trem[1:], cost + addcost),
                   lrec(spast + '_', tpast + trem[0], srem, trem[1:], cost + inscost),
                   lrec(spast + srem[0], tpast + '_', srem[1:], trem, cost + delcost)),
                   key = lambda x: x[4])

    answer = lrec('', '', s, t, 0)
    return answer[0],answer[1],answer[4]

In [5]:
def memolrec(func):
    """Memoizer for the recursive Levenshtein step.

    Results are cached per (remaining s, remaining t) pair. What is stored is
    relative to the call that produced it: the aligned text appended after
    the already-consumed prefixes, and the cost added on top of the incoming
    cost. Each call re-attaches its own prefixes and incoming cost.
    """
    memo = {}

    @wraps(func)
    def wrap(sp, tp, sr, tr, cost):
        key = (sr, tr)
        entry = memo.get(key)
        if entry is None:
            res = func(sp, tp, sr, tr, cost)
            # Strip the caller-specific prefixes and base cost before storing
            entry = (res[0][len(sp):], res[1][len(tp):], res[4] - cost)
            memo[key] = entry
        s_tail, t_tail, extra = entry
        return sp + s_tail, tp + t_tail, '', '', cost + extra
    return wrap

In [6]:
def alignprs(lemma, form):
    """Align lemma and form, then cut both alignments into three segments.

    Returns (l_prefix, l_stem, l_suffix, f_prefix, f_stem, f_suffix).
    The prefix segments span the longest run of leading '_' found in either
    aligned string, the suffix segments span the longest run of trailing '_',
    and the stems are whatever lies between. Prefix and suffix segments may
    be empty.
    """
    # Substitutions cost slightly more (1.1) than one insertion or deletion
    alemma, aform = levenshtein(lemma, form, substcost = 1.1)[:2]

    lead_l = len(alemma) - len(alemma.lstrip('_'))
    lead_f = len(aform) - len(aform.lstrip('_'))
    lead = max(lead_l, lead_f)

    trail_l = len(alemma) - len(alemma.rstrip('_'))
    trail_f = len(aform) - len(aform.rstrip('_'))
    trail = max(trail_l, trail_f)

    # Both strings are cut at the same index, measured on the aligned lemma
    cut = len(alemma) - trail
    return (alemma[:lead], alemma[lead:cut], alemma[cut:],
            aform[:lead], aform[lead:cut], aform[cut:])


In [7]:
def prefix_suffix_rules_get(lemma, form):
    """Extract suffix-change and prefix-change rules from one lemma/form pair.

    The pair is aligned with alignprs() into prefix, stem and suffix segments.

    Suffix rules: every tail of (lemma stem + suffix + '>') paired with the
    tail of (form stem + suffix + '>') starting at the same aligned index.

    Prefix rules: '<' + lemma prefix paired with '<' + form prefix, each
    extended by the first i symbols of the *form* stem, for every i below the
    stem length. The same stem text is appended to both sides.

    Alignment gaps ('_') are removed from every rule.

    Returns (prules, srules), two sets of (input, output) string pairs.
    """
    lp,lr,ls,fp,fr,fs = alignprs(lemma, form) # Get six parts, three for in three for out

    def _strip_gaps(text):
        # Alignment padding must not appear in the stored rules
        return text.replace('_', '')

    # Suffix rules
    ins  = lr + ls + ">"
    outs = fr + fs + ">"
    srules = {(_strip_gaps(ins[i:]), _strip_gaps(outs[i:]))
              for i in range(min(len(ins), len(outs)))}

    # Prefix rules. The former guard `len(lp) >= 0 or len(fp) >= 0` was always
    # true, so these rules are produced unconditionally. Gaps are stripped once
    # per rule rather than by rebuilding the whole set on every iteration.
    inp = "<" + lp
    outp = "<" + fp
    prules = {(_strip_gaps(inp + fr[:i]), _strip_gaps(outp + fr[:i]))
              for i in range(len(fr))}

    return prules, srules

In [8]:
def apply_best_rule(lemma, msd, allprules, allsrules):
    """Inflect lemma for msd using the learned rule tables.

    allprules and allsrules map an MSD to a dict {(input, output): count}.
    The lemma is wrapped as '<lemma>'. Among suffix rules whose input occurs
    in the wrapped lemma, the one with the longest input is applied; ties are
    broken by higher count, then by longer output. Among prefix rules whose
    input occurs in the result, the one with the highest count is applied.
    The boundary markers are removed before returning.

    If msd appears in neither table, the lemma is returned unchanged.
    """
    has_prefix_rules = msd in allprules
    has_suffix_rules = msd in allsrules
    if not (has_prefix_rules or has_suffix_rules):
        return lemma # Haven't seen this inflection, so bail out

    word = "<" + lemma + ">"

    if has_suffix_rules:
        candidates = [(rule[0], rule[1], count)
                      for rule, count in allsrules[msd].items()
                      if rule[0] in word]
        if candidates:
            chosen = max(candidates, key = lambda c: (len(c[0]), c[2], len(c[1])))
            word = word.replace(chosen[0], chosen[1])

    if has_prefix_rules:
        candidates = [(rule[0], rule[1], count)
                      for rule, count in allprules[msd].items()
                      if rule[0] in word]
        if candidates:
            chosen = max(candidates, key = lambda c: c[2])
            word = word.replace(chosen[0], chosen[1])

    return word.replace('<', '').replace('>', '')

In [9]:
def numleadingsyms(s, symbol):
    """Return how many characters s.lstrip(symbol) removes from the start of s."""
    remainder = s.lstrip(symbol)
    return len(s) - len(remainder)


def numtrailingsyms(s, symbol):
    """Return how many characters s.rstrip(symbol) removes from the end of s."""
    remainder = s.rstrip(symbol)
    return len(s) - len(remainder)
    

### A1. Creating splits by Lemma using pandas.DataFrame
At first Emily was having issues remembering the column names, so we named the Lemma, Inflection, and Inflected columns Lemon, Infection, and Infected respectively.  We also added a 4th column, PartoSpeech, to make splitting the lemmas by part of speech easier.

We started by getting rid of all the rows with duplicate lemmas. Then, we sampled a number of lemmas for each part of speech (Noun, Proper Noun, Verb, Participle, Adjective) such that, once you collect the inflected forms for each lemma, there is a similar number of inflected forms for each part of speech.
We also took this as a chance to replace the original lemmas of verbs, which were the 1st principal parts of the verbs, with the present active infinitive, which provides more information regarding the conjugation of verbs.

In [10]:
# Get the input file as an array so I can skip over entries with +s
latPath = os.path.join('..', 'Latin_stuff', 'lat.txt')
# Open with an explicit encoding: the data contains macron vowels (e.g. ā),
# and the platform default encoding is not guaranteed to be UTF-8.
with open(latPath, encoding='utf8') as f:
    lines = [line.rstrip('\n') for line in f]

# Read in data as dataframe
# Note: Lemon = lemmas, Infection = inflections, and Infected = inflected
# skiprows is called with each row number; a row is dropped when its raw line contains '+'
lat = pd.read_table(latPath, sep='\t', names=['Lemon', 'Infection', 'Infected'],
                    skiprows=lambda x: '+' in lines[x], encoding='utf8')
print(lat.shape)

(815989, 3)


In [11]:
# Tag every row with its part of speech, taken from the first field of the inflection tag.
# The pattern is anchored at the start so that a later field ending in "N;" or "V;"
# cannot be mistaken for the part of speech, and the dot in V.PTCP is escaped so it
# only matches a literal '.'. Rows with any other leading tag get NaN.
lat['PartoSpeech'] = lat['Infection'].str.extract(r'^(N|PROPN|V|V\.PTCP|ADJ);', expand=False)
# get a list of the present active infinitives so that they can be used as the verb lemmas
infinitives = lat[lat['Infection'] == 'V;NFIN;ACT;PRS']

In [None]:
# Lookup from original verb lemma to its present active infinitive, built once.
# When a lemma has several infinitive rows the first one is kept, matching .iloc[0].
infinitiveByLemma = (
    infinitives.drop_duplicates(subset='Lemon')
    .set_index('Lemon')['Infected']
    .to_dict()
)

def switch_verb_lemma(lemma):
    """Return the present active infinitive recorded for lemma.

    If no infinitive row exists for lemma, the lemma itself is returned
    instead of raising IndexError. This also makes re-running the cell
    harmless for lemmas that were already replaced.
    """
    return infinitiveByLemma.get(lemma, lemma)

# Only finite-verb rows (tag starting with "V;") get the new lemma
isVerbRow = lat["Infection"].str.startswith("V;")
lat.loc[isVerbRow, "Lemon"] = lat.loc[isVerbRow, "Lemon"].map(switch_verb_lemma)

Unnamed: 0,Lemon,Infection,Infected,PartoSpeech
213333,imitārī,V;IND;ACT;PRS;1;SG,imitor,V
213334,imitārī,V;IND;ACT;PRS;2;SG,imitāris,V
213335,imitārī,V;IND;ACT;PRS;2;SG,imitāre,V
213336,imitārī,V;IND;ACT;PRS;3;SG,imitātur,V
213337,imitārī,V;IND;ACT;PRS;1;PL,imitāmur,V


In [None]:

# One row per (lemma, part of speech) pair; the splits are drawn from these so
# that a sampled lemma lands in only one of train, test, and dev
uniqueLemmas = lat.drop_duplicates(subset = ['Lemon', 'PartoSpeech'])

# Shared seed so the data gets split/sampled the same way every time
rand = 34

def _sampleByPos(pos, count):
    """Draw `count` rows of uniqueLemmas whose part of speech is `pos`, seeded with `rand`."""
    ofPos = uniqueLemmas.loc[uniqueLemmas['PartoSpeech'] == pos]
    return ofPos.sample(n=count, random_state=rand)

# Per-part-of-speech sample sizes were calculated elsewhere
partSample = _sampleByPos('V.PTCP', 112)
adjSample = _sampleByPos('ADJ', 105)
nounSample = _sampleByPos('N', 185)
verbSample = _sampleByPos('V', 41)
propSample = _sampleByPos('PROPN', 343)
print(verbSample.shape)

            Lemon           Infection Infected PartoSpeech
213333    imitārī  V;IND;ACT;PRS;1;SG   imitor           V
213387       cīre  V;IND;ACT;PRS;1;SG      ciō           V
213472  aurēscere  V;IND;ACT;PRS;1;SG  aurēscō           V
213514     senēre  V;IND;ACT;PRS;1;SG    seneō           V
213598      vēscī  V;IND;ACT;PRS;1;SG   vēscor           V
(41, 4)


In [16]:
# Split the dataframes randomly into train, test, and dev sets in a 10:1:1 ratio
def _trainTestDev(sample):
    """Split sample into (train, test, dev): 2000/12000 is held out, then halved."""
    trainPart, heldOut = train_test_split(sample, test_size=2000/12000, random_state=rand)
    testPart, devPart = train_test_split(heldOut, test_size=0.5, random_state=rand)
    return trainPart, testPart, devPart

partTrain, partTest, partDev = _trainTestDev(partSample)
adjTrain, adjTest, adjDev = _trainTestDev(adjSample)
nounTrain, nounTest, nounDev = _trainTestDev(nounSample)
verbTrain, verbTest, verbDev = _trainTestDev(verbSample)
propTrain, propTest, propDev = _trainTestDev(propSample)

print(verbTrain.shape, verbTest.shape, verbDev.shape)

(34, 4) (3, 4) (4, 4)


In [17]:
# Stack the per-part-of-speech pieces to get all the unique lemmas of each split
trainPieces = [partTrain, adjTrain, nounTrain, verbTrain, propTrain]
testPieces = [partTest, adjTest, nounTest, verbTest, propTest]
devPieces = [partDev, adjDev, nounDev, verbDev, propDev]

uniqueTrain = pd.concat(trainPieces)
uniqueTest = pd.concat(testPieces)
uniqueDev = pd.concat(devPieces)

In [18]:
# Expand a frame of sampled lemmas into every row of `lat` that carries one of those lemmas
def getlist(lemons):
    """Collect all rows of `lat` whose Lemon equals a value in lemons['Lemon'].

    Rows are gathered lemma by lemma, in the order the lemmas appear in
    `lemons`, and returned as a new DataFrame with a fresh index and the
    columns Lemon, Infection, Infected, PartoSpeech.

    NOTE(review): matching uses the Lemon string only, while the sampled
    lemmas were de-duplicated on (Lemon, PartoSpeech). A lemma string shared
    by several parts of speech therefore pulls in the rows of all of them --
    confirm this is intended.
    """
    splitslist = [
        row
        for lemon in lemons['Lemon']
        for row in lat.loc[lat['Lemon'] == lemon].to_numpy().tolist()
    ]
    return pd.DataFrame(splitslist, columns= ["Lemon", "Infection", "Infected", "PartoSpeech"])

# Build the full train, test, and dev sets
train, test, dev = getlist(uniqueTrain), getlist(uniqueTest), getlist(uniqueDev)

In [19]:
# (size label, breakdown header, frame) for each split, in reporting order
splitReports = (
    ("Train set size", "Train set split into parts of speech", train),
    ("Test set size", "Test set split into parts of speech", test),
    ("Dev set size", "Dev set split into parts of speech", dev),
)

# Print the sizes of the train, test, and dev sets to check that they look right
for sizeLabel, _, frame in splitReports:
    print(sizeLabel, frame.shape)

# Print the number of rows per part of speech in each set to check that they look right
for _, header, frame in splitReports:
    print(header)
    for pos in ('V.PTCP', 'ADJ', 'N', 'V', 'PROPN'):
        print(frame[frame['PartoSpeech'] == pos].shape)

Train set size (9893, 4)
Test set size (943, 4)
Dev set size (1138, 4)
Train set split into parts of speech
(2236, 4)
(1955, 4)
(1926, 4)
(1819, 4)
(1957, 4)
Test set split into parts of speech
(180, 4)
(184, 4)
(182, 4)
(180, 4)
(217, 4)
Dev set split into parts of speech
(208, 4)
(262, 4)
(229, 4)
(233, 4)
(206, 4)


In [20]:
# Helper that saves a split to disk, based on Emily's code
def toFile(frame, fileName, fileType):
    """Write frame's Lemon, Infection and Infected columns as a tab-separated file.

    The file is written to '../Latin_stuff/' + fileName + fileType, encoded
    as UTF-8, without a header row and without the index.
    """
    outPath = '../Latin_stuff/' + fileName + fileType
    frame.to_csv(
        path_or_buf=outPath,
        sep="\t",
        encoding="utf8",
        index=False,
        header=False,
        columns=["Lemon", "Infection", "Infected"],
    )

# Save the train, test, and dev sets
for splitFrame, extension in ((train, '.trn'), (test, '.tst'), (dev, '.dev')):
    toFile(splitFrame, 'lat', extension)

## NOW WE CAN TRAIN OUR THINGY! WOOOOO! 🎉

## B. Editing nonneural
When we first got the nonneural code, it was inefficient and very time-consuming to run (a runtime of $O(n^3)$), so we used pickle to store data; after the code runs for the first time, all subsequent runs on the same splits are much faster.

In [21]:
#Finds rules from training data
prefbias,suffbias = 0,1
# Rule tables: MSD -> {(input, output): number of training pairs producing the rule}
allprules, allsrules = {}, {}
trnPath = os.path.join('..', 'Latin_stuff', 'lat.trn')
# Use a context manager so the file handle is closed, and skip any
# whitespace-only line, which would otherwise break the three-way unpack below
with open(trnPath, "r", encoding='utf8') as trnFile:
    lines = [line.strip() for line in trnFile if line.strip()]

for entry in tqdm.tqdm(lines): # Read in lines and extract transformation rules from pairs
    lemma, msd, form = entry.split(u'\t')
    if prefbias > suffbias:
        # Reverse both words before rule extraction when the prefix bias dominates
        lemma = lemma[::-1]
        form = form[::-1]
    prules, srules = prefix_suffix_rules_get(lemma, form)

    # An MSD only gets a table entry once at least one rule was seen for it
    for rules, table in ((prules, allprules), (srules, allsrules)):
        if not rules:
            continue
        counts = table.setdefault(msd, {})
        for rule in rules:
            counts[rule] = counts.get(rule, 0) + 1



100%|██████████| 9893/9893 [00:04<00:00, 2030.45it/s]


In [22]:
#now we apply the rules we found to the dev set!
devPath = os.path.join('..', 'Latin_stuff', 'lat.dev')
# Use a context manager so the file handle is closed, and skip whitespace-only lines
with open(devPath, "r", encoding='utf8') as devFile:
    devlines = [line.strip() for line in devFile if line.strip()]

numcorrect = 0
numguesses = 0
for entry in tqdm.tqdm(devlines):
    lemma, msd, correct = entry.split(u'\t')
    if prefbias > suffbias:
        # Mirror the reversal used during rule extraction
        lemma = lemma[::-1]
    outform = apply_best_rule(lemma, msd, allprules, allsrules)
    if prefbias > suffbias:
        # Undo the reversal so the guess is compared in normal orientation
        outform = outform[::-1]
        lemma = lemma[::-1]
    if outform == correct:
        numcorrect += 1
    numguesses += 1

# Guard against an empty dev file, which would otherwise divide by zero
if numguesses:
    print(f"acc: {numcorrect/numguesses}")
else:
    print("acc: n/a (no dev examples found)")

100%|██████████| 1138/1138 [00:00<00:00, 6455.86it/s]

acc: 0.9112478031634447





# DAMN. (⊙ˍ⊙)
## C. Results
When we originally ran the splits without changing the lemmas, we got an average accuracy of 83%. 

As you can see, with all the changes, we reached an average accuracy of 91%, which I think is a pretty good score ¯\ _ (ツ) _/¯

ᓚᘏᗢ
### ヾ(￣▽￣)Bye~Bye~

(footnote from Nathan finishing this whilst eepy): (•_•) (°_°)

# Reading Citations:

Omer Goldman, David Guriel, and Reut Tsarfaty. 2022. (Un)solving Morphological Inflection: Lemma Overlap Artificially Inflates Models’ Performance. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pages 864–870, Dublin, Ireland. Association for Computational Linguistics.

Jordan Kodner, Sarah Payne, Salam Khalifa, and Zoey Liu. 2023. Morphological Inflection: A Reality Check. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 6082–6101, Toronto, Canada. Association for Computational Linguistics.