In [None]:
!git clone https://github.com/shuokabe/crf_glossing.git

In [None]:
import sklearn_crfsuite

import features as cgfeat
import majority_label as ml
import process_file as cgpf
import utils as utils

In [None]:
# Paths of the training split (uncovered) and the test split (covered).
train_file = 'data/Tsez/ddo-train-track1-uncovered'
test_file = 'data/Tsez/ddo-test-track1-covered'

# Read both files through context managers so the handles are closed right
# away, and decode explicitly as UTF-8 (the same encoding used when the
# predictions are written out) instead of relying on the platform default.
with open(train_file, 'r', encoding='utf-8') as train_handle:
    train = train_handle.read()    # training data
with open(test_file, 'r', encoding='utf-8') as test_handle:
    test = test_handle.read()      # test data

In [None]:
# Wrap the raw text of each split in an `IGT_Corpus`; `test=True` is passed
# for the covered test file, `test=False` for the training file.
# NOTE(review): the variables are named `mukri_*` although the files read
# above are the Tsez (`ddo`) splits - presumably a leftover from another
# language. The names are kept because later cells refer to them.
mukri_train_corpus = cgpf.IGT_Corpus(train, test=False)
mukri_test_corpus = cgpf.IGT_Corpus(test, test=True)

In [None]:
# Retrieve the most frequent words of the training corpus and turn the result
# into a dictionary (passed as `custom_dict` when the training data is
# converted).
# NOTE(review): the original note says the 10 most frequent words are
# returned; that cut-off is decided inside `get_most_frequent_words` - confirm
# there. `dict(...)` presumably receives (word, value) pairs - TODO confirm.
most_frequent_word = ml.get_most_frequent_words(mukri_train_corpus)
my_dictionary = dict(most_frequent_word)

In [None]:
# Convert the training corpus into the CRF input format.
## Two alternatives: keep the commented-out call when no frequent-word
## dictionary is available; otherwise use the active call, which passes
## `my_dictionary` as `custom_dict`.

#train_sents = mukri_train_corpus.convert_to_crf_format(stem=True)
train_sents = mukri_train_corpus.convert_to_crf_format(custom_dict=my_dictionary)

In [None]:
# Convert the test corpus into the CRF input format.
# NOTE(review): this call uses `stem=True` whereas the training conversion
# passes `custom_dict=my_dictionary`; confirm that both produce compatible
# inputs for the feature extraction.
test_sents = mukri_test_corpus.convert_to_crf_format(stem=True)

In [None]:
# Build the CRF inputs: one feature sequence per sentence for both splits and
# one label sequence per training sentence (no labels are extracted for the
# test split).

X_train = list(map(cgfeat.sent2features, train_sents))
y_train = list(map(cgfeat.sent2labels, train_sents))

X_test = list(map(cgfeat.sent2features, test_sents))

In [None]:
# Train the CRF model with the hyperparameters below (described in the
# original note as the default CRFsuite setup): L-BFGS training, c1 = c2 = 0.1,
# at most 100 iterations, all possible transitions enabled, verbose logging.
crf_hyperparameters = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True,
)
crf = sklearn_crfsuite.CRF(**crf_hyperparameters)

crf.fit(X_train, y_train)

In [None]:
# Predict a label sequence for every test sentence with the trained CRF
# (described in this notebook as the grammatical glosses).
y_pred = crf.predict(X_test)

In [None]:
# Predict the lexical glosses: build a majority dictionary from the training
# corpus and apply it to the CRF predictions for the test corpus.
majority_dictionary = ml.create_majority_dict(mukri_train_corpus)

# NOTE(review): `y_pred` is both the input and the output of this call, so
# re-running the cell passes the already post-processed predictions through
# `apply_majority_label` a second time.
y_pred = ml.apply_majority_label(y_pred, majority_dictionary, mukri_test_corpus)

In [None]:
# Dictionary-only baseline: predict the glosses straight from the majority
# dictionary built on the training corpus; the CRF output is not an input here.
maj_dic = ml.create_majority_dict(mukri_train_corpus)

# The result is stored under its own name. Assigning it to `y_pred` would
# discard the CRF-based predictions computed in the previous cells, so a
# top-to-bottom run would silently export this baseline instead of the CRF
# output. To export the baseline on purpose, pass `y_pred_direct` to
# `cgpf.convert_to_igt_format` in the next cell.
y_pred_direct = ml.apply_majority_label_direct(maj_dic, mukri_test_corpus)

In [None]:
# Convert the predictions into a list with one item per sentence; each item is
# concatenated into a `\g` line when the predictions are saved.
gloss_sent_list = cgpf.convert_to_igt_format(y_pred)

In [None]:
# Save the predictions to a text file, one record per sentence.
## Change file path here
output_path = 'prediction.txt'

# Every record consists of an empty `\t` line, an empty `\m` line, the
# predicted `\g` line, an empty `\l` line and a blank separator line.
record_head = '\\t\n\\m\n\\g '
record_tail = '\n\\l\n\n'

with open(output_path, 'w', encoding='utf-8') as out:
    for gloss_line in gloss_sent_list:
        out.write(record_head + gloss_line + record_tail)