In [1]:
"""
Credit: 
    Min-set-cover solver: https://gist.github.com/marekyggdrasil/a8e63be8e34e000f2507bdb5e0755dda
"""

from util import read_lines, write_lines
from nltk import FreqDist
from collections import defaultdict
from dlx import DLX

In [2]:
# Parse the tab-separated word list. Only the first MAX_LINES lines are
# examined; a line is kept when it has exactly four fields (i.e. it actually
# has a roots column) and its root combination has not been seen before.
MAX_LINES = 100
lines = read_lines("word_freq_list_with_roots.txt")

all_roots = []     # roots of every kept entry, repeats included
entries = []       # kept lines as field lists; last field is a list of roots
roots_set = set()  # root combinations (as tuples) already kept
for line in lines[:MAX_LINES]:
    split = line.split('\t')
    if len(split) != 4:
        continue  # no roots column on this line
    roots = split[-1].split(', ')
    signature = tuple(roots)
    if signature in roots_set:
        continue  # only the root cover matters, so skip repeated combinations
    roots_set.add(signature)
    all_roots += roots
    split[-1] = roots
    entries.append(split)

print('Total number of lines left:', len(entries))

# Root vocabulary in the order FreqDist.most_common() reports it, plus the
# root -> column index lookup used to build the cover matrix.
dist = dict(FreqDist(all_roots).most_common())
vocab_roots = list(dist)
root2idx = {root: idx for idx, root in enumerate(vocab_roots)}

# Reverse lookup: every kept entry that mentions a given root.
root2entries = defaultdict(list)
for entry in entries:
    roots = entry[-1]
    for root in roots:
        root2entries[root].append(entry)

# Interactive inspection helper (kept disabled):
# for root in dist:
#     print(root)
#     print("")
#     print(root2entries[root])
#     input("wait")

Done reading file word_freq_list_with_roots.txt
Total number of lines left: 27


In [3]:
def genInstance(labels, rows) :
    """Bundle column labels and rows into the tuple the solver helpers unpack.

    Parameters
    ----------
    labels : sequence
        Column labels; must support ``len()`` and integer indexing.
    rows : list
        Passed through unchanged.

    Returns
    -------
    tuple
        ``(labels, rows, columns, indices_l)`` where ``columns`` is a list of
        ``(label, 0)`` pairs and ``indices_l`` maps each label to its position
        in ``labels`` (the last position wins for a repeated label).
    """
    positions = range(len(labels))
    # The constant 0 is handed to DLX as-is; presumably it marks a primary
    # column in the dlx package -- TODO confirm against the dlx docs.
    columns = [(labels[pos], 0) for pos in positions]
    indices_l = {labels[pos]: pos for pos in positions}
    return labels, rows, columns, indices_l

def solveInstance(instance) :
    """Solve the cover instance with DLX and return the chosen row positions.

    Parameters
    ----------
    instance : tuple
        ``(labels, rows, columns, indices_l)`` as produced by ``genInstance``.
        Each row is a list of column indices.

    Returns
    -------
    list of int
        0-based positions in ``rows`` of the rows that make up the first
        solution DLX produces. When several rows contain the same set of
        columns, the position of the last such row is reported.

    Raises
    ------
    ValueError
        If DLX produces no solution for the given rows.
    """
    labels, rows, columns, indices_l = instance
    solver = DLX(columns)
    # Remember where each row came from, keyed by its (sorted) column set, so a
    # solution row can be mapped back to its position in ``rows``.
    row_position = {}
    for i, row in enumerate(rows) :
        solver.appendRow(row, 'r'+str(i))
        row_position[tuple(sorted(row))] = i
    # Only the first solution is used, so stop the search as soon as one is
    # available instead of enumerating every possible cover.
    first = next(iter(solver.solve()), None)
    if first is None :
        raise ValueError('DLX found no exact cover for the given rows')
    selected = []
    for handle in first :
        # getRowList gives the row's column labels; convert them back to
        # column indices to rebuild the lookup key.
        names = solver.getRowList(handle)
        key = tuple(sorted(indices_l[name] for name in names))
        selected.append(row_position[key])
    return selected

def printColumnsPerRow(instance, selected) :
    """Print, for every selected row, the labels of the columns it covers.

    Parameters
    ----------
    instance : tuple
        ``(labels, rows, columns, indices_l)`` as produced by ``genInstance``.
    selected : iterable of int
        0-based positions into ``rows``, as returned by ``solveInstance``.
    """
    labels, rows, columns, indices_l = instance
    print('covered columns per selected row')
    for s in selected :
        # ``selected`` is 0-based, so index ``rows`` with ``s`` itself; using
        # ``s-1`` reported the previous row (and the last row for ``s == 0``).
        covered = [columns[z][0] for z in rows[s]]
        print(s, covered)

def printInstance(instance) :
    """Print the column labels and the rows of an instance tuple.

    ``instance`` is the ``(labels, rows, columns, indices_l)`` tuple built by
    ``genInstance``; only the first two members are printed, each under its
    own heading line.
    """
    labels, rows, _columns, _indices_l = instance
    for heading, payload in (('columns', labels), ('rows', rows)) :
        print(heading)
        print(payload)

In [4]:
# One cover column per distinct root.
labels = vocab_roots
# One cover row per entry: the column indices of that entry's roots.
# An entry without roots simply yields an empty row, which is fine.
rows = [[root2idx[name] for name in item[-1]] for item in entries]

In [5]:
# Small hand-written instance for a quick manual check; uncomment to use it
# in place of the labels/rows built from the word list.
# labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
# rows = [[],[0,3,6],[0,3],[3,4,6],[2,4,5],[1,2,5,6],[1,6]]
# Package labels and rows into the instance tuple, then run the solver.
# `selected` holds positions into `rows`.
instance = genInstance(labels, rows)
selected = solveInstance(instance)
# Optional debugging output:
# printInstance(instance)
# printColumnsPerRow(instance, selected)

In [6]:
def to_str(entry):
    """Serialize an entry back into a tab-separated line.

    All fields except the last are joined with tabs. The last field holds the
    roots: when it is not equal to an empty list, the roots are joined with
    ', ' and appended as one more tab-separated field; otherwise the line
    ends after the leading fields.
    """
    head = '\t'.join(entry[:-1])
    roots = entry[-1]
    if roots != []:
        return '\t'.join([head, ', '.join(roots)])
    return head

In [7]:
print('Number of selected entries for min-set-cover', len(selected))

# Pick the chosen entries and order them by their first field read as an
# integer.
selected_entries = sorted(
    (entries[i] for i in selected),
    key=lambda fields: int(fields[0]),
)
# Serialize each entry back to a tab-separated line and save the result.
str_selected_entries = [to_str(fields) for fields in selected_entries]

write_lines("selected_entries.txt", str_selected_entries)

26
Done writing to file selected_entries.txt.


In [8]:
# !cat selected_entries.txt

2	be	v	*bheue-
4	of	i	*apo-
5	a	a	*oi-no-
8	have	v	*kap-
10	it	p	*ko-
13	for	i	*per-
18	do	v	*dhe-
22	at	i	*ad-
23	but	c	*ambhi-
34	what	d	*kwo-
37	can	v	*gno-
39	get	v	*ghend-
45	make	v	*mag-
52	time	n	*da-
60	some	d	*sem-
67	see	v	*sed-
70	come	v	*gwa-
75	other	j	*al-
80	two	m	*dwo-
83	want	v	*eue-
84	way	n	*wegh-
90	day	n	*agh-, *dyeu-
93	no	a	*aiw-, *ne-
94	man	n	*man-, *men-, *ner-, *wi-ro-
98	give	v	*ghabh-
100	well	r	*wel-


In [9]:
# list(map(to_str, entries))

['2\tbe\tv\t*bheue-',
 '4\tof\ti\t*apo-',
 '5\ta\ta\t*oi-no-',
 '8\thave\tv\t*kap-',
 '10\tit\tp\t*ko-',
 '13\tfor\ti\t*per-',
 '18\tdo\tv\t*dhe-',
 '22\tat\ti\t*ad-',
 '23\tbut\tc\t*ambhi-',
 '28\tnot\tx\t*ne-',
 '34\twhat\td\t*kwo-',
 '37\tcan\tv\t*gno-',
 '39\tget\tv\t*ghend-',
 '45\tmake\tv\t*mag-',
 '52\ttime\tn\t*da-',
 '60\tsome\td\t*sem-',
 '67\tsee\tv\t*sed-',
 '70\tcome\tv\t*gwa-',
 '75\tother\tj\t*al-',
 '80\ttwo\tm\t*dwo-',
 '83\twant\tv\t*eue-',
 '84\tway\tn\t*wegh-',
 '90\tday\tn\t*agh-, *dyeu-',
 '93\tno\ta\t*aiw-, *ne-',
 '94\tman\tn\t*man-, *men-, *ner-, *wi-ro-',
 '98\tgive\tv\t*ghabh-',
 '100\twell\tr\t*wel-']