In [197]:
import re
dict_words = {}
count = 0

# Entries of the word list that must not be treated as dictionary words.
skipped_words = ('ngc', 'ic', 'ugc', 'eso', 'mcg', 'pk')

# Build a lookup of lower-cased words, one word per line of the file.
with open('data/dictionary.txt') as f:
    for raw_line in f:
        word = raw_line.replace('\n', '').lower()
        if word not in skipped_words:
            dict_words[word] = True

In [198]:
SEP = '|' # This is the new delimiter character
headers = []
entries = []
objs = []

# Load the delimited database: the first line carries the column names and
# every following line is one record.
with open('db.dat') as f:
    lines = f.readlines()

# An empty file simply yields no headers and no records.
if lines:
    headers.extend(lines[0].split(SEP))

# Iterate over every data line. The previous index-based loop stopped one
# short and silently dropped the last record of the file.
for line in lines[1:]:
    # A blank line (e.g. trailing newline at end of file) carries no record.
    if not line.strip():
        continue

    paths = line.split(SEP)
    obj = {}
    # The final split field is deliberately left out, as before.
    # NOTE(review): presumably each line ends with SEP + newline, making the
    # last field just the line terminator — confirm against db.dat.
    for col, value in enumerate(paths[:-1]):
        obj[headers[col]] = value
    objs.append(obj)

## Constellations

In [199]:
constellations = {}
constellations_4letter = {}
constellations_3letter = {}

# Each tab-separated row supplies the full name (column 1) plus a 3-letter
# (column 3) and 4-letter (column 4) abbreviation.
# NOTE(review): if column 4 is the last column of the row it still carries the
# trailing newline — confirm against data/constellations.txt.
with open('data/constellations.txt') as f:
    lines = f.readlines()
    for row in lines:
        fields = row.split('\t')
        full_name = fields[1]
        abbr3 = fields[3].lower()
        abbr4 = fields[4].lower()

        constellations_4letter[abbr4] = full_name
        constellations_3letter[abbr3] = full_name

        # Insertion order matters: extract_constellation walks this dict in
        # order and returns the first key that matches.
        for key in (abbr4, abbr3, full_name, full_name.lower()):
            constellations[key] = full_name

def extract_constellation(txt):
    """Return the full constellation name for the first known key found in ``txt``.

    Keys of ``constellations`` are tried in insertion order using a plain
    substring test. Returns ``''`` when no key occurs in ``txt``.
    """
    return next(
        (full_name for key, full_name in constellations.items() if key in txt),
        '',
    )
        
def normalize_constellations(txt):
    """Expand constellation abbreviations in ``txt`` and drop repeated words.

    Characters other than ASCII letters, digits and whitespace are removed
    first. Each space-separated token is then replaced by the full
    constellation name when it matches a 4-letter abbreviation, a 3-letter
    abbreviation, or any other key of ``constellations`` (tried in that
    order). A token whose replacement was already emitted is skipped. The
    word "and" is passed through untouched when the text contains more than
    one dictionary word.

    Returns the rebuilt text, stripped of leading/trailing whitespace.
    """
    cleaned = re.sub(r'[^a-zA-Z\s\d]', '', txt)
    tokens = cleaned.split(' ')
    known_word_total = sum(1 for token in tokens if token.lower() in dict_words)

    seen = set()
    pieces = []
    for token in tokens:
        lowered = token.lower()

        # Special case for the common word "and"
        if lowered == "and" and known_word_total > 1:
            pieces.append(token)
            continue

        # The first table containing the token wins; otherwise keep it as-is.
        replacement = token
        for table in (constellations_4letter, constellations_3letter, constellations):
            if lowered in table:
                replacement = table[lowered]
                break

        # Do not duplicate words
        if replacement in seen:
            continue

        pieces.append(replacement)
        # Remember every individual word of a (possibly multi-word) replacement.
        seen.update(replacement.split(' '))

    return ' '.join(pieces).strip()

## Duplicate grouping

In [200]:
# Bucket every record under its duplicate id so duplicates can be merged later.
groups = {}
for obj in objs:
    groups.setdefault(obj['dup_id'], []).append(obj)

## Normalize Designations

In [201]:
# Raw catalog type codes mapped to the designation written to the output.
# Built once at definition time instead of on every call.
_DESIGNATION_LOOKUP = {
    'ASTER': 'Asterism',
    'BRTNB': 'Bright Nebula',
    'CL+NB': 'Cluster',
    'DRKNB': 'Dark Nebula',
    'GALCL': 'Galaxy Cluster',
    'GALXY': 'Galaxy',
    'GLOCL': 'Globular Cluster',
    'GX+DN': 'Nebula',
    'GX+GC': 'Globular Cluster',
    'G+C+N': 'Cluster',
    'LMCCN': 'Cluster',
    'LMCDN': 'Nebula',
    'LMCGC': 'Globular Cluster',
    'LMCOC': 'Open Cluster',
    'OPNCL': 'Open Cluster',
    'NONEX': '',
    'PLNNB': 'Nebula',
    'SMCCN': 'Cluster',
    'SMCDN': 'Nebula',
    'SMCGC': 'Globular Cluster',
    'SMCOC': 'Open Cluster',
    'SNREM': 'Supernova Remnant',
    'QUASR': 'Quasar',
    'G': 'Galaxy',
    'GGroup': 'Galaxy',
    'GPair': 'Galaxy',
    'Gtrpl': 'Galaxy',
    'OCl': 'Open Cluster',
    'GCl': 'Globular Cluster',
    'Cl+N': 'Cluster',
    'PN': 'Nebula',
    'HII': 'Nebula',
    'Neb': 'Nebula',
    'EmN': 'Nebula',
    'RfN': 'Nebula',
    # The comma used to sit inside the string literal, so 'SNR' mapped to
    # 'Supernova Remnant,' (with a trailing comma) instead of the clean label.
    'SNR': 'Supernova Remnant',
}


def normalize_designation(designation):
    """Translate a raw catalog designation into a human-readable object type.

    Args:
        designation: the raw designation string of a record.

    Returns:
        'Star' when the text contains "star" (case-insensitive), 'Asterism'
        when it contains "ass" (case-insensitive), the mapped label when the
        exact code is in the lookup table, and otherwise the input unchanged.
    """
    lowered = designation.lower()

    if 'star' in lowered:
        return 'Star'

    # NOTE(review): presumably aimed at association-style codes — confirm
    # which source values this substring is expected to catch.
    if 'ass' in lowered:
        return 'Asterism'

    return _DESIGNATION_LOOKUP.get(designation, designation)
        

## Name
This will take a comma-separated list of names and extract the best name

In [207]:
def count_upper_case(name):
    """Return how many characters of ``name`` are upper-case."""
    return sum(1 for char in name if char.isupper())

def get_name_score(name):
    """Score a candidate name; a higher score means a better display name.

    Args:
        name: one candidate name (surrounding whitespace is ignored).

    Returns:
        -100 for an empty or whitespace-only name. Otherwise a float built up
        word by word: dictionary words add 0.51, constellation keys and words
        containing "ngc" add 0.05 each, words containing '+' or '-' lose 0.05
        each, a repeated word loses 0.25, and every word costs 0.025.
    """
    # Strip first: a whitespace-only name used to slip past the emptiness
    # check and score -0.025, which could outrank real (penalized) names.
    name = name.strip()
    if len(name) == 0:
        return -100

    score = 0.0
    seen_words = set()
    for word in name.split(' '):
        lowered = word.lower()

        # Penalize the same word repeated
        if word in seen_words:
            score -= 0.25
        if lowered in dict_words:
            score += 0.51
        if lowered in constellations:
            score += 0.05
        if 'ngc' in lowered:
            score += 0.05
        if '+' in word:
            score -= 0.05
        if '-' in word:
            score -= 0.05

        # Inherently penalize more words
        score -= 0.025

        seen_words.add(word)
    return score

def get_best_name(item_names):
    """Pick and format the highest-scoring name from a comma-separated list.

    The candidate with the strictly highest ``get_name_score`` wins (ties keep
    the earlier candidate) and has apostrophes removed. If no candidate scores
    above -100 the first candidate is used as-is. In the chosen name, words
    containing "mcg" are upper-cased and dictionary words are capitalized.
    When one of the words is exactly 'M' the words are joined without spaces,
    otherwise with single spaces.
    """
    candidates = item_names.split(',')
    winner = candidates[0]
    top_score = -100.0

    for candidate in candidates:
        candidate_score = get_name_score(candidate)
        if candidate_score > top_score:
            winner, top_score = candidate.replace("\'", ''), candidate_score

    tokens = winner.split(' ')
    formatted = []
    for token in tokens:
        lowered = token.lower()
        if 'mcg' in lowered:
            formatted.append(token.upper())
        elif lowered in dict_words:
            formatted.append(token.capitalize())
        else:
            formatted.append(token)

    # A bare 'M' token means the pieces are glued together without spaces.
    separator = '' if 'M' in tokens else ' '
    return separator.join(formatted)
    
    

## Polyfill blanks

In [208]:
# Attributes that polyfill() fills in across a group of duplicate records,
# processed in this order.
polyfill_attrs = [
    'mag',
    'names',
    'designation',
    'ngc_id',
    'surface_brightness',
    'M',
    'NGC',
    'IC',
    'constellation',
    'classifiers',
    'descriptors',
    'notes',
    'remnant_type',
    'HD_ID',
    'SAO_ID',
    'FK5_ID',
    'bv_mag',
    'ub_mag',
    'b_mag',
    'v_mag',
    'j_mag',
]

# Placeholder stored when a record has no usable magnitude.
UNKNOWN_MAGNITUDE = '99.0'

def normalize_str(name):
    """Normalize constellation words in ``name`` and tidy up its spacing.

    After ``normalize_constellations`` runs, triple spaces and then double
    spaces are each replaced by a single space, and the result is stripped.
    """
    expanded = normalize_constellations(name)
    return expanded.replace('   ', ' ').replace('  ', ' ').strip()

def polyfill(items):
    """Fill blank attributes across one group of duplicate records, in place.

    For every attribute in ``polyfill_attrs`` each record of the group gets:
      * a normalized 'constellation' and 'designation';
      * 'names' replaced by the comma-joined, de-duplicated, normalized names
        of the whole group (or only the IAU Star Database record's names when
        the group contains one);
      * 'mag' reset to ``UNKNOWN_MAGNITUDE`` when it is non-numeric or >= 70;
      * magnitude-like attributes set to the smallest numeric value seen in
        the group, and any other blank attribute set to the last non-blank
        value seen in the group.
    Finally every record receives the same 'best_name'.

    Args:
        items: list of record dicts belonging to one duplicate group. Each
            record must provide 'names', 'FK5_ID', 'catalog' and
            'constellation'.

    Returns:
        None. The dicts in ``items`` are mutated.
    """
    # Nothing to fill for an empty group (items[0] below would fail).
    if not items:
        return

    names = []
    for item in items:
        # Some names are ';'-separated; treat them like ','-separated ones.
        raw_names = re.sub(r';', ',', item['names']).split(',')
        raw_names.extend(item['FK5_ID'].split(','))
        names.extend(normalize_str(raw_name) for raw_name in raw_names)

    # If this is an IAU star, just automatically make the name win
    for item in items:
        if item['catalog'] == 'IAU Star Database':
            names = [normalize_str(item['names'])]

    # De-duplicate while keeping first-seen order. A plain set() made the
    # joined name order (and therefore tie-breaking when picking the best
    # name) vary from one run to the next.
    names = list(dict.fromkeys(names))

    numeric_attrs = ['surface_brightness', 'mag', 'b_mag', 'v_mag', 'j_mag', 'bv_mag', 'ub_mag']

    for attr in polyfill_attrs:
        non_empty_value = ''
        smallest = float('infinity')
        for item in items:
            if attr not in item:
                item[attr] = ''

            # Clean up some stuff
            if attr == 'constellation':
                item[attr] = normalize_constellations(item[attr])
            if attr == 'designation':
                item[attr] = normalize_designation(item[attr])

            if attr == 'names':
                item[attr] = ','.join(names)

            # Remove things that are known to be bad
            if attr == 'designation' and item[attr] == 'Dup':
                item[attr] = ''

            # If the magnitude is clearly wrong, reset it to the unknown value
            # (Note: some databases define a blank magnitude as either 74 or 99)
            # So the sweeping threshold of "greater than 70 is bad" seems reasonable
            if attr == 'mag':
                try:
                    if float(item[attr]) >= 70.0:
                        item[attr] = UNKNOWN_MAGNITUDE
                except (TypeError, ValueError):
                    # Blank or non-numeric magnitude
                    item[attr] = UNKNOWN_MAGNITUDE

            v = str(item[attr]).strip()
            if len(v) > 0:
                non_empty_value = item[attr]
                try:
                    vnum = float(non_empty_value)
                except (TypeError, ValueError):
                    # Non-numeric values take no part in the minimum.
                    pass
                else:
                    if smallest > vnum:
                        smallest = vnum

        for item in items:
            v = str(item[attr]).strip()
            if attr in numeric_attrs and smallest != float('infinity'):
                item[attr] = smallest
            elif len(v) == 0:
                item[attr] = non_empty_value

        # NOTE(review): this runs once per attribute, i.e. also before the
        # 'constellation' attribute itself has been filled from sibling
        # records — confirm whether it was meant to sit after the attr loop.
        for item in items:
            # extract constellation from name, if missing
            if len(item['constellation'].strip()) == 0:
                item['constellation'] = extract_constellation(item['names'])

    final_name = get_best_name(items[0]['names'])
    for item in items:
        item['best_name'] = final_name

In [209]:
# Merge the records of every duplicate group in place.
for groupId, members in groups.items():
    polyfill(members)

In [210]:
# Header row. No newline is appended here: the rows below rely on the header
# list itself supplying it (its last entry contains the '\n').
csv_rows = ['best_name|' + '|'.join(headers)]

# Columns actually written per record: every header without a newline in it.
data_headers = [header for header in headers if '\n' not in header]

for groupId, members in groups.items():
    candidate = members[0]

    # Always choose IAU Star Database
    for item in members:
        if item['catalog'] == 'IAU Star Database':
            candidate = item

    # Groups without a usable name are left out of the export.
    if len(candidate['best_name']) == 0:
        continue

    cells = [candidate['best_name']]
    cells.extend(
        str(candidate[header]) if header in candidate else ''
        for header in data_headers
    )
    # Every cell, including the last one, is followed by the delimiter.
    csv_rows.append('|'.join(cells) + '|\n')

csv = ''.join(csv_rows)

with open('stardb.pipe', '+w') as f:
    f.write(csv)

In [211]:
# Collect, at most once per group, the best name of the first record whose
# name contains a dictionary word that does not include 'NGC'.
results = []

for groupId in groups:
    for candidate in groups[groupId]:
        best_name = candidate['best_name']
        has_plain_word = any(
            part.lower() in dict_words and 'NGC' not in part
            for part in best_name.split(' ')
        )
        if has_plain_word:
            results.append(best_name)
            break

print(f"Found {len(results)} results")
print("")
for found_name in results:
    print(found_name)

Found 4388 results

H Iii 207
omi Perseus Cloud
Barnards Merope Nebula
Flaming Star Nebula
Orion B
Gemini A
Sadr
H I 100
The Witch Head Nebula
Toby Jug Nebula
omi Vela Cluster
Browning
H Iii 596
Coddingtons Nebula
Tet Carina Cluster
Lam Centaurus Nebula
Rho Ophiuchus Nebula
22 Scorpius
H Iv 57
Star Queen
Small Sagittarius Star Cloud
Barnards Galaxy
Pelican Nebula
Cocoon Nebula
Bowtie Nebula
47 Tucana Cluster
Andromeda Galaxy
Sculptor Filament
Small Magellanic Cloud
Owl Cluster
Triangulum Galaxy
Little Dumbbell Nebula
H Persei Cluster
Chi Persei Cluster
MCG 2 7 54
Fornax Dwarf Cluster 3
Perseus A
Fornax A
Fornax B
Maia
Merope Nebula
California Nebula
Hinds Variable Nebula
Crab Nebula
Great Orion Nebula
The Running Man Nebula
Lower Sword
Upper Sword
Mairans Nebula
46Eps Orion
30 Dorado Cluster
3 Monoceros
Monkey Head Nebula
Rosette A
Rosette Nebula
Rosette B
Hubbles Nebula
Christmas Tree Cluster
Great Bird Cluster
Carolines Cluster
30Tau Canis Major
Eskimo Nebula
Bear Claw Nebula
19 Pupp