In [1]:
import codecs
import re

In [2]:
# Use this class to store all cells.
class Column(object):
    """An ordered collection of text cells, each tagged with a row index.

    Cells are stored internally as a list of ``(index, text)`` tuples.  The
    index is assigned by ``append`` and is carried unchanged through every
    transformation (``replace``, ``filter``, ``remove``, ``clean``, ``lower``,
    ``sub``), so two columns that were filled in parallel can still be matched
    cell by cell after each of them has been filtered independently.

    Iterating over a column yields only the text of each cell.  Every
    transformation returns a new ``Column`` and leaves ``self`` untouched.
    """

    # Accepted cell types: ``str``, plus ``unicode`` where that builtin exists.
    try:
        _STRING_TYPES = (str, unicode)
    except NameError:
        _STRING_TYPES = (str,)

    def __init__(self, items=None):
        """Create a column.

        Args:
            items: optional sequence of ``(index, text)`` tuples with integer
                indices.  It is copied, so later appends never modify the
                caller's list.
        """
        self._list = list(items) if items else []
        # Index handed out by the next append(): one past the largest index
        # present, so an appended cell can never collide with an existing one.
        self._next_idx = max(k for k, _ in self._list) + 1 if self._list else 0

    def __len__(self):
        """Return the number of cells currently stored."""
        return len(self._list)

    def append(self, stuff):
        """Add the string ``stuff`` as a new cell with a fresh, unused index.

        Raises:
            AssertionError: if ``stuff`` is not a string.
        """
        assert isinstance(stuff, self._STRING_TYPES), \
            'cell must be a string, got %r' % (type(stuff),)
        self._list.append((self._next_idx, stuff))
        self._next_idx += 1

    def __iter__(self):
        """Yield the text of every cell, in order (indices are not exposed)."""
        for _, v in self._list:
            yield v

    def _derive(self, items):
        """Build the result column of a transformation of ``self``.

        The new column inherits ``self``'s index counter so that indices of
        cells dropped by a filter are never reused by a later append.
        """
        derived = Column(items=items)
        derived._next_idx = max(derived._next_idx, self._next_idx)
        return derived

    def replace(self, orig, new):
        """Return a column where ``orig`` is replaced by ``new`` in every cell."""
        return self._derive([(k, v.replace(orig, new)) for k, v in self._list])

    def filter(self, func):
        """Return a column keeping only the cells whose text satisfies ``func``."""
        return self._derive([(k, v) for k, v in self._list if func(v)])

    def remove(self, c):
        """Return a column without the cells that contain the substring ``c``."""
        return self.filter(lambda l: c not in l)

    def clean(self):
        """Return a column without the empty cells."""
        return self.filter(lambda l: l != '')

    def lower(self):
        """Return a column with every cell lower-cased."""
        return self._derive([(k, v.lower()) for k, v in self._list])

    def sub(self, pattern, repl):
        """Return a column with regex ``pattern`` replaced by ``repl`` in every cell.

        Each cell is stripped of leading and trailing whitespace *before* the
        substitution is applied.
        """
        return self._derive([(k, re.sub(pattern, repl, v.strip())) for k, v in self._list])
        

In [3]:
# First, split contents into cells
def get_cells(input_path):
    """Read a one-column, CSV-like UTF-8 text file and return its cells as a ``Column``.

    The file is scanned character by character:

    * a double quote toggles "quoted" mode; the closing quote ends the cell,
      so a quoted cell may span several physical lines (the newlines inside
      it are kept in the cell text);
    * outside quotes, a newline ends the current cell;
    * an empty line produces an empty cell, except for the newline that
      directly follows a quoted cell, which only terminates that cell's row.

    Every cell, empty ones included, is appended first so that its index is
    its position in the file; the empty cells are then dropped with
    ``Column.clean()``, which keeps the indices of the survivors.

    Args:
        input_path: path of the file to read.

    Returns:
        The ``Column`` of non-empty cells.

    Side effects:
        Prints the number of cells before and after cleaning.
    """
    # ``with`` guarantees the handle is closed, even if decoding fails.
    with codecs.open(input_path, 'r', 'utf8') as fin:
        contents = fin.read()
    split_contents = Column()
    multiline = False  # True while we are inside a double-quoted cell
    still_on_the_same_cell = False  # True between a closing quote and the newline ending its row
    str_buffer = ''

    for c in contents:
        if c == '"':
            if multiline:  # Closing quote: the quoted cell is complete
                multiline = False
                still_on_the_same_cell = True
                split_contents.append(str_buffer)
                str_buffer = ''
            else:  # Opening quote
                multiline = True
        elif c == '\n':
            if multiline:  # Newline inside a quoted cell: keep it in the text
                str_buffer += '\n'
            elif str_buffer == '':
                if still_on_the_same_cell:
                    # This newline only ends the row of the quoted cell just stored
                    still_on_the_same_cell = False
                else:  # A genuinely empty row: store an empty cell to keep indices aligned
                    split_contents.append(str_buffer)
            else:  # End of an unquoted cell
                split_contents.append(str_buffer)
                str_buffer = ''
        else:
            str_buffer += c
    # Do not lose a last cell that is not terminated by a newline (or whose
    # closing quote is missing).
    if str_buffer != '':
        split_contents.append(str_buffer)
    # Single-argument print: same output with the Python 2 statement and the Python 3 function.
    print('%d cells' % len(split_contents))
    ret = split_contents.clean()
    print('%d cells after cleaning' % len(ret))
    return ret

In [4]:
# Get all characters (including special characters in this column)
def get_charset(contents):
    """Return every distinct character found in ``contents`` as one sorted string.

    ``contents`` is any iterable of strings, e.g. a ``Column``.
    """
    seen = {ch for cell in contents for ch in cell}
    return ''.join(sorted(seen))

In [5]:
# Lines containing some given characters
def get_lines(chars, contents):
    """Return the cells of ``contents`` that contain at least one character of ``chars``.

    Args:
        chars: iterable of single characters to look for (a string works).
        contents: iterable of strings, e.g. a ``Column``.

    Returns:
        A list with the matching cells, in their original order.

    Side effects:
        Prints the number of matching cells.
    """
    chars = set(chars)
    # isdisjoint() stops at the first common character and avoids building a
    # throw-away set for every cell.
    ret = [l for l in contents if not chars.isdisjoint(l)]
    # Single-argument print: same output with the Python 2 statement and the Python 3 function.
    print('%d occurrences' % len(ret))
    return ret

In [23]:
# Parse the raw Linear B transliteration file into a Column of non-empty cells.
minoan = get_cells('../data/linear_b.minoan.raw')

1013 cells
997 cells after cleaning


In [24]:
# Normalise whitespace: line breaks inside a cell become '/' separators and the
# remaining inner spaces become '-'. Column.sub() also strips each cell first.
minoan = minoan.sub('\n[ ]+', '/').replace('\n', '/').sub('[ ]+/', '/').sub(r'[ ]$', '').replace(' ', '-') # Get rid of whitespace
# Drop a few irregular entries that mix hyphens with parenthesised parts
to_remove = [u'a-ne-mo(i-e-re-ja)', u'e-ra-po(ri-me-ne)', u'(po-ti-ni-ja)i-qe-a', u'ma-te(-re)te-i-ja', u'ra-pa-to(me-no)']
minoan = minoan.filter(lambda l: l not in to_remove)
minoan = minoan.replace('(', '').replace(')', '') # Remove all parentheses (the enclosed text is kept)
minoan = minoan.remove('*') # Drop cells containing '*'
minoan = minoan.replace('[', '').replace(']', '') # Remove all brackets (the enclosed text is kept)
minoan = minoan.replace(u'\u03bf', 'o') # U+03BF is the Greek small omicron; assumed to be a stray stand-in for Latin 'o'
minoan = minoan.replace('ai', 'a3') # ai -> a3
# Drop cells containing '-lo', '-pa3' or '-sa2'. Only hyphen-preceded occurrences
# match, so a cell that *starts* with one of these syllables is kept.
minoan = minoan.filter(lambda l: '-lo' not in l and '-pa3' not in l and '-sa2' not in l)
minoan = minoan.filter(lambda l: not l.endswith('-') and not l.startswith('-')) # Drop whole cells that start or end with a hyphen (most likely affixes)
minoan = minoan.sub(r'/+', '/') # Collapse runs of '/' into a single separator

In [25]:
# Show every character left after cleaning, to spot anything unexpected.
print get_charset(minoan)

-/23adeijkmnopqrstuwz


In [26]:
# Number of Linear B cells that survived the cleaning steps.
len(minoan)

972

# Explore Greek data

In [27]:
def remove_mix(c):
    """Build a predicate for ``Column.filter`` that rejects mixed-script cells.

    The returned function keeps a cell unless it both contains ``c`` and
    starts with '(' or with a left single quotation mark.
    """
    def keep(l, c=c):
        if c not in l:
            return True
        return l[0] not in ('(', u'‘')
    return keep

In [28]:
# Parse the raw Greek file into a Column of non-empty cells.
greek = get_cells('../data/linear_b.el.raw')

1013 cells
991 cells after cleaning


In [29]:
# The order of the steps below matters: each Latin letter is first used to
# detect mixed-script cells and only afterwards mapped to its Greek look-alike.
greek = greek.lower() # Try to lowercase everything
# Drop cells that contain the Latin letter a and start with '(' or "‘" (transliterations of Linear B script)
greek = greek.filter(remove_mix('a'))
greek = greek.replace('a', u'α') # Replace the remaining Latin a's with α's
# Same check for the Latin letters d and e
greek = greek.filter(remove_mix('d'))
greek = greek.filter(remove_mix('e'))
# Remove two messy instances (matched on the exact cell text at this point of the pipeline)
to_remove = [u'σειρημοκαράφι/  σειρημοκαράο(ρε)ι  (‘seremo’=> σειρήνες;)', u'ιπ(π)ο(e-qe)']
greek = greek.filter(lambda l: l not in to_remove)
greek = greek.replace('e', u'ε') # Replace 'e' with 'ε'
greek = greek.filter(remove_mix('i')) # Drop cells that contain Latin i and start with '(' or "‘"
greek = greek.replace('k', u'κ') # Replace k with κ
greek = greek.replace('m', u'μ') # Replace m with μ
greek = greek.replace('n', u'ν') # Replace n with ν
greek = greek.replace('o', u'ο') # Replace o with ο

# Get rid of diacritics (accented letter -> plain letter) and of the single quotation marks
repl = {
    u'ΐ': u'ι',
    u'ά': u'α',
    u'έ': u'ε',
    u'ή': u'η',
    u'ί': u'ι',
    u'ϊ': u'ι',
    u'ό': u'ο',
    u'ύ': u'υ',
    u'ώ': u'ω',
    u'‘': u'',
    u'’': u''
}

# The replacements do not feed into each other, so the dict's iteration order is irrelevant
for k, v in repl.items():
    greek = greek.replace(k, v)

greek = greek.remove('*') # Drop cells containing *
greek = greek.remove('+') # Drop the cell containing +
greek = greek.replace('>', '/') # '>' becomes a separator
greek = greek.replace(' (', '/').replace('(', '').replace(')', '') # ' (' becomes a separator, other parentheses are removed (their content is kept)
greek = greek.replace(',', '/') # ',' becomes a separator
greek = greek.replace(u'δυικ.', '') # Strip the annotation 'δυικ.'
greek = greek.replace(u';', '')
greek = greek.remove('-') # Drop cells containing a hyphen
greek = greek.remove('=') # Drop cells containing '='
greek = greek.replace(u'?', '')
greek = greek.replace(' ', '') # Remove every space
# NOTE(review): all spaces were removed on the previous line, so in this chain only the
# strip done by Column.sub() and the '\n' -> '/' replacement can still have an effect.
greek = greek.sub('\n[ ]+', '/').replace('\n', '/').sub('[ ]+/', '/').sub(r'[ ]$', '').replace(' ', '-') # get rid of whitespace
greek = greek.sub(r'/+', '/') # Collapse runs of '/' into a single separator

In [30]:
# Show every character left after cleaning; any Latin letter still listed was not mapped to Greek above.
print get_charset(greek)

/fhyαβγδεζηθικλμνξοπρςστυφχψω


In [31]:
# Number of Greek cells that survived the cleaning steps.
len(greek)

949

# Join columns

In [32]:
def pp(lines):
    """Print ``lines`` one item per output line.

    Lists, tuples and dicts are walked recursively (iterating a dict yields
    its keys, so only the keys are printed); any other value is printed as is.
    """
    if isinstance(lines, (list, tuple, dict)):
        for l in lines:
            pp(l)
    else:
        # ``lines`` is never a tuple here, so the parenthesised form prints the
        # same text with the Python 2 statement and the Python 3 function.
        print(lines)

In [33]:
def join(col1, col2):
    """Inner-join two columns on their cell index.

    Args:
        col1, col2: objects exposing ``_list``, a list of ``(index, text)``
            tuples (``Column`` instances).

    Returns:
        A list of ``(index, text1, text2)`` tuples, one for every pair of
        cells sharing an index, ordered as in ``col1`` (and as in ``col2``
        among cells with the same index).
    """
    # Group col2's cells by index once, so that every col1 cell is matched
    # with a dictionary lookup instead of a scan over the whole of col2.
    by_index = {}
    for k2, v2 in col2._list:
        by_index.setdefault(k2, []).append(v2)
    return [(k1, v1, v2) for k1, v1 in col1._list for v2 in by_index.get(k1, ())]

In [34]:
# Keep only the rows that survived cleaning in BOTH columns: (index, Linear B text, Greek text).
pairs = join(minoan, greek)

# To get rid of slashes, keep only the first '/'-separated alternative of each Linear B entry, but keep every alternative of the Greek entry

In [35]:
# One (transliteration, [greek forms]) pair per joined row: only the first
# '/'-separated alternative of the Linear B side, every alternative of the Greek side.
cognates = [
    (lb.split('/')[0].strip(), [x.strip() for x in g.split("/")])
    for i, lb, g in pairs
]

In [36]:
# Number of (Linear B, Greek) pairs.
len(cognates)

935

In [37]:
# Collect every syllable used by the kept transliterations.
all_syls = set()
for lb, g in cognates:
    syls = lb.split("-")
    if 'sa2' not in syls:
        all_syls.update(syls)
        continue
    # Probe for a leftover 'sa2': show the first offending entry and stop
    # there, which leaves all_syls incomplete.
    print(lb)
    pp(g)
    break

In [38]:
def revert(l):
    """Turn a hyphen-separated transliteration (e.g. 'ko-no-so') into Linear B signs.

    Every syllable is looked up in the module-level ``syl2linb`` table, so a
    syllable missing from that table raises ``KeyError``.
    """
    signs = [syl2linb[syl] for syl in l.split('-')]
    return ''.join(signs)

In [41]:
# Linear B sign -> Latin transliteration of the syllable it stands for.
# The values are unique, which is what allows the table to be inverted into syl2linb.
linb2syl = {
    u'𐀀': 'a', u'𐀁': 'e', u'𐀂': 'i', u'𐀃': 'o', u'𐀄': 'u', u'𐀅': 'da', u'𐀆': 'de', 
    u'𐀇': 'di', u'𐀈': 'do', u'𐀉': 'du', u'𐀊': 'ja', u'𐀋': 'je', u'𐀍': 'jo', 
    u'𐀎': 'ju', u'𐀏': 'ka', u'𐀐': 'ke', u'𐀑': 'ki', u'𐀒': 'ko', u'𐀓': 'ku', 
    u'𐀔': 'ma', u'𐀕': 'me', u'𐀖': 'mi', u'𐀗': 'mo', u'𐀘': 'mu', u'𐀙': 'na', 
    u'𐀚': 'ne', u'𐀛': 'ni', u'𐀜': 'no', u'𐀝': 'nu', u'𐀞': 'pa', u'𐀟': 'pe', 
    u'𐀠': 'pi', u'𐀡': 'po', u'𐀢': 'pu', u'𐀣': 'qa', u'𐀤': 'qe', u'𐀥': 'qi', 
    u'𐀦': 'qo', u'𐀨': 'ra', u'𐀩': 're', u'𐀪': 'ri', u'𐀫': 'ro', u'𐀬': 'ru',
    u'𐀭': 'sa', u'𐀮': 'se', u'𐀯': 'si', u'𐀰': 'so', u'𐀱': 'su', u'𐀲': 'ta', 
    u'𐀳': 'te', u'𐀴': 'ti', u'𐀵': 'to', u'𐀶': 'tu', u'𐀷': 'wa', u'𐀸': 'we', 
    u'𐀹': 'wi', u'𐀺': 'wo', u'𐀼': 'za', u'𐀽': 'ze', u'𐀿': 'zo', u'𐁀': 'a2', 
    u'𐁁': 'a3', u'𐁂': 'au', u'𐁃': 'dwe', u'𐁄': 'dwo', u'𐁅': 'nwa', u'𐁆': 'pu2', 
    u'𐁇': 'pte', u'𐁈': 'ra2', u'𐁉': 'ra3', u'𐁊': 'ro2', u'𐁋': 'ta2', u'𐁌': 'twe', u'𐁍': 'two'
}

In [42]:
# Inverse table: transliterated syllable -> Linear B sign.
syl2linb = dict((syl, sign) for sign, syl in linb2syl.items())

In [43]:
# One line per pair: Linear B signs, transliteration, then every Greek form (tab-separated).
with codecs.open('../data/linear_b-greek.cognates', 'w', 'utf8') as fout:
    for lb, g in cognates:
        fields = (revert(lb), lb, '\t'.join(g))
        fout.write('%s\t%s\t%s\n' % fields)

In [44]:
# Number of distinct syllables collected from the transliterations.
len(all_syls)

70

In [45]:
# Sanity check: code point 65537 (0x10001) is the sign mapped to 'e' in linb2syl.
print unichr(65537)

𐀁


In [46]:
# Every syllable that occurs in the data must have a Linear B sign.
assert all_syls <= set(syl2linb.keys())

# merge columns

In [69]:
from collections import defaultdict

In [70]:
# merge minoan scripts
# Group by Linear B spelling: pairs that share the same sign sequence pool their Greek forms.
min2el = defaultdict(list)
for lb, g in cognates:
    m = revert(lb)
    min2el[m] += g

In [71]:
# Number of distinct Linear B spellings after merging.
len(min2el)

919

In [79]:
# Spot check of one merged entry. .get() is used instead of [] because indexing
# a defaultdict with an absent key silently creates an empty entry, which the
# export of min2el further down would then write out as a bogus row.
pp(min2el.get(u'𐀒𐀜𐀰', []))

κνωσος
κνωσονδε


In [559]:
# One line per Linear B spelling (sorted), followed by its Greek forms, tab-separated.
with codecs.open('../data/linear_b_minoan-greek.cognates', 'w', 'utf8') as fout:
    for k in sorted(min2el):
        greek_forms = '\t'.join(min2el[k])
        fout.write('%s\t%s\n' % (k, greek_forms))

In [561]:
# merge latin scripts (with hyphens)
# Group by hyphenated Latin transliteration: identical transliterations pool their Greek forms.
lat2el = defaultdict(list)
for lb, g in cognates:
    lat2el[lb] += g

In [562]:
# Number of distinct hyphenated transliterations.
len(lat2el)

919

In [571]:
# Peek at ten keys (dict order is arbitrary here, so this is not a sorted sample).
lat2el.keys()[:10]

[u'ko-no-si-ja',
 u'a-ka-si-jo-ne',
 u'a-ka-wi-ja-de',
 u'te-mi-ti-ja',
 u'e-ra-ja',
 u'pe-re',
 u'me-ta-pi-jo',
 u'pa-ra-jo',
 u'qe-to-ro-po-pi',
 u'me-de-i-jo']

In [565]:
# One line per hyphenated transliteration (sorted), followed by its Greek forms, tab-separated.
with codecs.open('../data/linear_b_latin-greek.cognates', 'w', 'utf8') as fout:
    for k in sorted(lat2el):
        greek_forms = '\t'.join(lat2el[k])
        fout.write('%s\t%s\n' % (k, greek_forms))


In [566]:
# merge latin scripts (without hyphens)
# Group by transliteration with the hyphens removed. Two different hyphenated
# forms that become identical once the hyphens are gone end up in the same entry.
latno2el = defaultdict(list)
for lb, g in cognates:
    unhyphenated = lb.replace('-', '')
    latno2el[unhyphenated] += g

In [567]:
# Number of distinct unhyphenated transliterations.
len(latno2el)

919

In [568]:
# One line per unhyphenated transliteration (sorted), followed by its Greek forms, tab-separated.
with codecs.open('../data/linear_b_latin_no_hyp-greek.cognates', 'w', 'utf8') as fout:
    for k in sorted(latno2el):
        greek_forms = '\t'.join(latno2el[k])
        fout.write('%s\t%s\n' % (k, greek_forms))


In [570]:
# Peek at ten keys (dict order is arbitrary here, so this is not a sorted sample).
latno2el.keys()[:10]

[u'kapinija',
 u'menijo',
 u'teodora',
 u'komaweta',
 u'anono',
 u'komaweteja',
 u'dikonaro',
 u'kokijo',
 u'arakatejao',
 u'esareu']

In [590]:
# min2latin
# Linear B spelling -> its unhyphenated Latin transliteration(s); a spelling that
# occurs in several pairs gets one (possibly repeated) entry per pair.
min2lat = defaultdict(list)
for lb, g in cognates:
    m = revert(lb)
    unhyphenated = lb.replace('-', '')
    min2lat[m].append(unhyphenated)

# One line per Linear B spelling (sorted), followed by its transliterations, tab-separated.
with codecs.open('../data/linear_b_minoan-latin_no_hyp.cognates', 'w', 'utf8') as fout:
    for k in sorted(min2lat):
        latin_forms = '\t'.join(min2lat[k])
        fout.write('%s\t%s\n' % (k, latin_forms))

# names only

In [48]:
import numpy as np

In [49]:
is_name = [1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1]
# Turn the 0/1 flags into a boolean mask that is indexed by the original cell index.
is_name = np.array(is_name, dtype='bool')

In [51]:
# Same splitting as for `cognates`, restricted to the rows flagged in is_name.
names_cognates = list()
for i, lb, g in pairs:
    if not is_name[i]:
        continue
    lb = lb.split('/')[0].strip()
    g = [x.strip() for x in g.split("/")]
    names_cognates.append((lb, g))

In [53]:
# Number of pairs flagged in is_name.
len(names_cognates)

456

In [55]:
# merge minoan scripts
# Group the flagged pairs by Linear B spelling, pooling the Greek forms of identical spellings.
names_min2el = defaultdict(list)
for lb, g in names_cognates:
    m = revert(lb)
    names_min2el[m] += g

In [57]:
# Number of distinct Linear B spellings among the flagged pairs.
len(names_min2el)

455

In [60]:
# One line per flagged Linear B spelling (sorted), followed by its Greek forms, tab-separated.
with codecs.open('../data/linear_b_minoan-greek.names.cognates', 'w', 'utf8') as fout:
    for k in sorted(names_min2el):
        greek_forms = '\t'.join(names_min2el[k])
        fout.write('%s\t%s\n' % (k, greek_forms))

In [63]:
# The flagged Linear B spellings alone, one per line, sorted.
with codecs.open('../data/linear_b.minoan.names', 'w', 'utf8') as fout:
    for k in sorted(names_min2el):
        fout.write('%s\n' % (k,))

In [66]:
# Every Greek form of the flagged entries, one per line, in the order of the sorted Linear B spellings.
with codecs.open('../data/linear_b.greek.names', 'w', 'utf8') as fout:
    for k in sorted(names_min2el):
        greek_forms = names_min2el[k]
        for kk in greek_forms:
            fout.write('%s\n' % (kk,))