In [None]:
import re
import morfessor

# Create training data for Morfessor: the words on every "\t" line of the input text are collected into a list of (count, word) pairs.

def createTrainingData(input_path):
    """Collect the words of all text lines in a file as Morfessor training pairs.

    Only lines that begin with the two characters ``\\t`` (backslash + "t")
    are used; that marker is dropped and the remainder is split on whitespace.

    Args:
        input_path: Path of the UTF-8 encoded input file.

    Returns:
        A list of ``(1.0, word)`` tuples, one per whitespace-separated token,
        in file order. Repeated words yield repeated entries.
    """
    pairs = []

    with open(input_path, "r", encoding="utf-8") as handle:
        for raw in handle:
            if not raw.startswith("\\t"):
                continue
            # NOTE(review): tokens are taken verbatim here, so attached
            # punctuation and original casing end up in the training data.
            pairs.extend((1.0, tok) for tok in raw[2:].split())

    return pairs


In [None]:
# Build the (count, word) training pairs from the "\t" lines of the corpus file.
trainingData = createTrainingData("ntu-train-track1-uncovered")  # change the input file here

# `model` is a notebook-level name: segment() reads it when segmenting words,
# so this cell has to run before segment() is called.
model = morfessor.BaselineModel()       # new, untrained Morfessor Baseline model
model.load_data(trainingData)
model.train_batch()         # train the Morfessor model on the loaded data

In [None]:
def segment(input_path, output_path, segmenter=None):
    """Write a copy of a file with a ``\\m`` segmentation line after each ``\\t`` line.

    Every input line is copied to the output unchanged. For each line that
    begins with ``\\t`` (backslash + "t"), the text after the marker is split
    on whitespace, each token is segmented, and a line of the form
    ``\\m tok1 tok2 ...`` is inserted directly after it. The original notebook
    notes that the input file is the same one used for training.

    Args:
        input_path: Path of the UTF-8 encoded input file.
        output_path: Path of the UTF-8 encoded file to write.
        segmenter: Object exposing ``viterbi_segment(word)`` whose result's
            first element is the list of morphs. Defaults to the notebook-level
            ``model``, so existing two-argument calls behave as before.

    Returns:
        None. The result is written to ``output_path``.
    """
    if segmenter is None:
        # Fall back to the model trained earlier in the notebook.
        segmenter = model

    output_lines = []

    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            output_lines.append(line.rstrip('\n'))
            if line.startswith("\\t"):
                # line[2:] drops the "\t" marker; split() ignores the padding.
                tokens = line[2:].split()
                segmented = [_segment_token(token, segmenter) for token in tokens]
                output_lines.append("\\m " + " ".join(segmented))

    # Store the output in a file.
    with open(output_path, "w", encoding="utf-8") as f:
        for line in output_lines:
            f.write(line + "\n")


def _segment_token(token, segmenter):
    """Return the lower-cased, hyphen-delimited segmentation of one token."""
    # Single characters and the ellipsis are punctuation: keep them verbatim.
    if len(token) == 1 or token == "...":
        return token

    # Remove commas, double quotes and periods attached to the word.
    clean = re.sub("[,\".]", "", token)
    if not clean:
        # The token was punctuation only (e.g. '."'). Keep it as-is instead of
        # emitting an empty token, which would shift the \m line's alignment.
        return token

    morphs = segmenter.viterbi_segment(clean)[0]
    # Lower-casing the whole string leaves apostrophes and hyphens untouched,
    # so no special case is needed for a leading apostrophe.
    joined = "-".join(morphs).lower()

    # A "-" inside the word can surface as its own morph and produce runs such
    # as "a---b"; collapse every run to one hyphen before trimming the end.
    joined = re.sub("-{2,}", "-", joined)

    # Drop trailing hyphens, but never reduce an all-hyphen token to nothing.
    return joined.rstrip("-") or joined

In [None]:
# method for generating training dataset for morfS

def replaceHyphen(input_path, output_path):
    """Replace each ``\\t`` line with the space-separated form of its ``\\m`` line.

    When a line starting with ``\\t`` is immediately followed by a line
    starting with ``\\m``, the pair is collapsed into a single ``\\t`` line
    whose text is the ``\\m`` content with every hyphen turned into a space.
    All other lines are copied through unchanged. Per the original notebook
    comment, ``input_path`` should be the file written by ``segment``.

    Args:
        input_path: Path of the UTF-8 encoded input file.
        output_path: Path of the UTF-8 encoded file to write.

    Returns:
        None. The result is written to ``output_path``.
    """
    with open(input_path, "r", encoding="utf-8") as src:
        rows = src.readlines()

    total = len(rows)
    result = []
    pos = 0
    while pos < total:
        current = rows[pos]
        following = pos + 1
        has_morph_line = (
            current.startswith("\\t")
            and following < total
            and rows[following].startswith("\\m")
        )

        if not has_morph_line:
            # Nothing to merge: pass the line through untouched.
            result.append(current)
            pos += 1
            continue

        # Hyphens become spaces across the whole \m line; the two-character
        # marker is then cut off and the remaining text re-labelled as \t.
        spaced = rows[following].strip().replace("-", " ")
        result.append("\\t " + spaced[2:].strip() + "\n")
        pos += 2  # skip the \m line that was just consumed

    with open(output_path, "w", encoding="utf-8") as dst:
        dst.writelines(result)