# Astro Databases
This notebook extracts data from various data sources. Thankfully, many of these sources are fixed-width files, which are easy to parse. The idea is to combine the extracted datasets into a custom Excel document that will be the de facto catalog for the app.

In [1]:
import math
import pandas

In [2]:
def n(input):
    """Normalise a catalogue number to its integer string form.

    Args:
        input: Any value accepted by ``int()`` (e.g. a padded numeric string).

    Returns:
        ``str(int(input))`` when the conversion succeeds, otherwise ``''``.
    """
    try:
        return str(int(input))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures (blank/non-numeric text, None, NaN, inf)
        # map to "no number"; anything else, such as KeyboardInterrupt,
        # must still propagate.
        return ''
    
def v(input):
    """Convert a value to ``float``, falling back to ``0.0``.

    Args:
        input: Any value accepted by ``float()`` (e.g. a padded numeric string).

    Returns:
        ``float(input)`` when the conversion succeeds, otherwise ``0.0``.
    """
    try:
        return float(input)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures (blank/non-numeric text, None, integers
        # too large for a float) fall back to 0.0; unrelated exceptions
        # must still propagate.
        return 0.0
    
def ttd(hours, minutes, seconds):
    """Collapse an (hours, minutes, seconds) triple into one decimal value.

    The result is expressed in the unit of ``hours``: minutes contribute a
    sixtieth each and seconds a 3600th each.
    """
    minute_part = minutes / 60
    second_part = seconds / 3600
    return hours + minute_part + second_part

def safestr(input):
    """Return ``input`` with every newline and ``|`` character removed."""
    cleaned = input
    for unwanted in ("\n", "|"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

## Database Parsing

In [3]:
class DBEntry:
    """One catalogue object with sexagesimal and decimal coordinates.

    The six coordinate components are converted with ``v`` (values that cannot
    be converted become 0.0). ``ra_decimal`` is expressed in the unit of
    ``ra_h`` and ``dec_decimal`` in the unit of ``dec_deg``.
    """

    # Class-level fallbacks; __init__ rebinds every one of them per instance.
    ra_h = 0.0
    ra_m = 0.0
    ra_s = 0.0
    dec_deg = 0.0
    dec_m = 0.0
    dec_s = 0.0
    mag = 0.0
    names = ''
    designation = ''
    catalog_id = ''
    catalog = ''
    source = ''
    meta = {}

    def __init__(self, ra_h, ra_m, ra_s, dec_deg, dec_m, dec_s, names, designation, catalog_id, source, catalog='', mag=0.0, meta=None):
        """Store the raw fields and derive the decimal coordinates and ids.

        Args:
            ra_h, ra_m, ra_s: Right-ascension components; converted with ``v``.
            dec_deg, dec_m, dec_s: Declination components; converted with ``v``.
            names: Display name(s) of the object.
            designation: Object type / designation text.
            catalog_id: Identifier of the object inside its catalogue.
            source: Where the catalogue was obtained from.
            catalog: Name of the catalogue.
            mag: Magnitude; stored exactly as passed (not converted).
            meta: Extra per-catalogue fields. The dict is stored by reference,
                not copied. When omitted, each instance gets its own new dict.
        """
        self.ra_h = v(ra_h)
        self.ra_m = v(ra_m)
        self.ra_s = v(ra_s)
        self.dec_deg = v(dec_deg)
        self.dec_m = v(dec_m)
        self.dec_s = v(dec_s)
        self.names = names
        self.designation = designation
        self.catalog_id = catalog_id
        self.catalog = catalog
        self.source = source
        # A shared ``{}`` default would be mutated by every caller that writes
        # into ``entry.meta``; create a fresh dict per instance instead.
        self.meta = {} if meta is None else meta
        self.mag = mag
        self.ra_decimal = ttd(self.ra_h, self.ra_m, self.ra_s)
        self.dec_decimal = self._dec_to_decimal(self.dec_deg, self.dec_m, self.dec_s)
        self.ngc_id = self.ngc()
        self.m_id = self.m()

    @staticmethod
    def _dec_to_decimal(degrees, minutes, seconds):
        """Combine signed degrees with unsigned minutes/seconds.

        The sign of the degrees field applies to the whole angle, so the
        fractional part is added away from zero (-23 deg 30' is -23.5, not
        -22.5). ``copysign`` also recognises ``-0.0``, which is what a "-00"
        degrees field becomes after ``float()``.
        """
        sign = math.copysign(1.0, degrees)
        return degrees + sign * (minutes / 60 + seconds / 3600)

    def as_obj(self):
        """Return the entry as a plain dict; text fields pass through ``safestr``."""
        return {
            'ra_h': self.ra_h,
            'ra_m': self.ra_m,
            'ra_s': self.ra_s,
            'dec_deg': self.dec_deg,
            'dec_m': self.dec_m,
            'dec_s': self.dec_s,
            'names': safestr(self.names),
            'designation': safestr(self.designation),
            'catalog_id': safestr(self.catalog_id),
            'catalog': safestr(self.catalog),
            'source': safestr(self.source),
            'mag': self.mag,
            'ngc_id': self.ngc_id,
            'meta': self.meta,
        }

    def ngc(self):
        """Return ``meta['NGC']`` normalised by ``n``, or ``''`` when absent."""
        if 'NGC' in self.meta:
            return n(self.meta['NGC'])
        return ''

    def m(self):
        """Return ``meta['M']`` normalised by ``n``, or ``''`` when absent."""
        if 'M' in self.meta:
            return n(self.meta['M'])
        return ''

    def rank(self, b):
        """Score how close entry ``b`` is to this entry (lower is closer).

        The base score is the Euclidean distance between the two
        (ra_decimal, dec_decimal) pairs. 0.01 is subtracted for a shared
        non-empty NGC id and again for a shared non-empty M id, so the result
        can be negative.
        """
        # Compute raw distance
        score = math.sqrt(pow(b.ra_decimal - self.ra_decimal, 2) + pow(b.dec_decimal - self.dec_decimal, 2))
        if b.ngc_id == self.ngc_id and len(self.ngc_id) > 0:
            score -= 0.01

        if b.m_id == self.m_id and len(str(self.m_id)) > 0:
            score -= 0.01

        return score
        

## SNRS (Supernova Remnant Catalog)

In [4]:
# Entries parsed from the SNRS data file are collected here.
snrs_dataset = []


def parse(line):
    """Build a DBEntry from one fixed-width line of the SNRS data file.

    Every field is taken from a fixed character range of ``line``; the
    declination seconds are always 0.0.
    """
    right_ascension = {
        'ra_h': line[13:15],
        'ra_m': line[16:19],
        'ra_s': line[19:21],
    }
    declination = {
        'dec_deg': line[23:26],
        'dec_m': line[27:29],
        'dec_s': 0.0,
    }
    remnant_meta = {
        'remnant_type': line[43:45],
    }
    return DBEntry(
        catalog='SNRS',
        source='ftp://cdsarc.u-strasbg.fr/0/cats/VII/284',
        designation='Supernova Remnant',
        catalog_id=line[0:11],
        names=safestr(line[62:88]),
        meta=remnant_meta,
        **right_ascension,
        **declination,
    )
    
# Parse every line of the SNRS data file into snrs_dataset.
with open('data/snrs/snrs.dat') as snrs_file:
    snrs_dataset.extend(parse(record) for record in snrs_file.readlines())

## Sac81 Deep Sky Objects

In [5]:
# Entries parsed from the Sac81 data file are collected here.
sac_dataset = []


def parse_ngc(ngc):
    """Strip the ``'NGC '`` prefix and all spaces from an NGC name.

    Returns ``''`` when ``ngc`` does not contain ``'NGC'`` at all.
    """
    if 'NGC' not in ngc:
        return ''
    without_prefix = str(ngc).replace('NGC ', '')
    return without_prefix.replace(' ', '')
    
def parse_m(otherName):
    """Extract a Messier number from a name that starts with ``'M'``.

    Every ``'M'`` and every space is removed and the remainder is parsed as
    an integer.

    Returns:
        The number as ``int``, or ``''`` when the name does not start with
        ``'M'`` or the remainder is not an integer.
    """
    if not otherName.startswith('M'):
        return ''

    digits = otherName.replace('M', '').replace(' ', '')
    try:
        return int(digits)
    except ValueError:
        # Names such as "Mel 25" start with 'M' but are not Messier numbers.
        return ''
    

def parse(line):
    """Build a DBEntry from one fixed-width line of the Sac81 data file.

    The first name column doubles as the catalog id and, when it contains
    ``'NGC'``, as the source of the NGC number; the second name column is
    checked for a Messier number. Seconds are always 0.0 for both axes.
    """
    primary_name = line[1:18]
    other_name = line[19:37]
    NGC = parse_ngc(primary_name) if 'NGC' in primary_name else ''

    sac_meta = {
        'constellation': line[44:47],
        'surface_brightness': line[68:72],
        'classifiers': line[102:113],
        'descriptors': line[130:185],
        'notes': safestr(line[186:272]),
        'NGC': n(NGC),
        'M': parse_m(other_name),
    }
    return DBEntry(
        catalog='Sac81',
        source='https://www.saguaroastro.org/sac-downloads/',
        catalog_id=primary_name,
        names=safestr(primary_name + ',' + other_name),
        designation=line[38:43],
        ra_h=line[48:50],
        ra_m=line[51:55],
        ra_s=0.0,
        dec_deg=line[56:60],
        dec_m=line[60:62],
        dec_s=0.0,
        mag=line[63:67],
        meta=sac_meta,
    )
    
# Parse the Sac81 data file into sac_dataset; the first line is skipped.
with open('data/sac81/data.txt') as sac_file:
    sac_rows = sac_file.readlines()[1:]
    sac_dataset.extend(parse(row) for row in sac_rows)

## NGC Database

In [6]:
# Entries parsed from the OpenNGC data file are collected here.
ngc_dataset = []


def parse(line):
    """Build a DBEntry from one ``;``-separated line of the OpenNGC file.

    Returns ``None`` for lines shorter than 10 characters and for lines
    whose third or fourth column does not split into at least three
    ``:``-separated parts.
    """
    if len(line) < 10:
        return None

    paths = line.split(';')
    ras = paths[2].split(':')
    decs = paths[3].split(':')

    if len(ras) < 3 or len(decs) < 3:
        return None

    hours, ra_minutes, ra_seconds = ras[:3]
    degrees, dec_minutes, dec_seconds = decs[:3]
    ngc_meta = {
        'surface_brightness': paths[12],
        'b_mag': paths[8],
        'v_mag': paths[9],
        'j_mag': paths[10],
        'M': paths[18],
        'NGC': n(paths[19]),
        'IC': paths[20],
    }
    return DBEntry(
        catalog='OpenNGC',
        source='https://github.com/mattiaverga/OpenNGC',
        catalog_id=paths[0],
        designation=paths[1],
        names=paths[23],
        ra_h=hours,
        ra_m=ra_minutes,
        ra_s=ra_seconds,
        dec_deg=degrees,
        dec_m=dec_minutes,
        dec_s=dec_seconds,
        mag=paths[9],
        meta=ngc_meta,
    )
    

# Parse the OpenNGC data file into ngc_dataset; the first line is skipped
# and lines that parse() rejects (returns None for) are left out.
with open('data/ngc/data.csv') as f:
    lines = f.readlines()
    for line in lines[1:]:
        candidate = parse(line)
        # Identity test: None is a singleton, and `!= None` would defer to
        # any __eq__/__ne__ the entry class might define.
        if candidate is not None:
            ngc_dataset.append(candidate)
            

## Yale Bright Star

In [7]:
# Entries parsed from the Yale Bright Star data file are collected here.
yale_dataset = []


def compute_sign(sign):
    """Return ``'-'`` when ``sign`` is exactly ``'-'``, otherwise ``''``."""
    return '-' if sign == '-' else ''

def parse(line):
    """Build a DBEntry from one fixed-width line of the Yale Bright Star file.

    The degrees text is prefixed with ``'-'`` only when the sign column holds
    ``'-'``. The same character range feeds both ``mag`` and
    ``meta['v_mag']``.
    """
    visual_mag = line[102:107]
    signed_degrees = compute_sign(line[83:84]) + line[84:86]
    star_meta = {
        'HD_ID': line[25:31],
        'SAO_ID': line[31:37],
        'FK5_ID': line[37:41],
        'DM Ident': line[14:25],
        'v_mag': visual_mag,
        'bv_mag': line[109:114],
        'ub_mag': line[115:120],
    }
    return DBEntry(
        catalog='Yale Bright Star',
        source='ftp://cdsarc.u-strasbg.fr/0/cats/V/50',
        designation='Bright Star',
        catalog_id=line[0:4],
        names=safestr(line[4:14]),
        ra_h=line[75:77],
        ra_m=line[77:79],
        ra_s=line[79:83],
        dec_deg=signed_degrees,
        dec_m=line[86:88],
        dec_s=line[88:90],
        mag=visual_mag,
        meta=star_meta,
    )

# Parse every line of the Yale Bright Star data file into yale_dataset.
with open('data/yale_bright/bsc5.dat') as yale_file:
    yale_dataset.extend(parse(record) for record in yale_file.readlines())

## IAU named stars database

In [8]:
# Entries parsed from the IAU named-star data file are collected here.
iau_dataset = []


def extract_number(line):
    """Parse a float from ``line``, ignoring anything before the first space.

    The text is stripped first; if a space remains, only the part starting
    at the first space is converted. Raises ``ValueError`` when that part is
    not a valid float.
    """
    text = line.strip()
    first_space = text.find(' ')
    if first_space != -1:
        text = text[first_space:]
    return float(text)
        

def parse_iau(line):
    """Build a DBEntry from one fixed-width line of the IAU named-star file.

    The number read from columns 101-112 is divided by 15 and split into
    whole hours, whole minutes and whole seconds (each floored). The
    declination column is passed through as the degrees value with minutes
    and seconds fixed at 0.0.
    """
    total_hours = extract_number(line[101:112]) / 15.0
    whole_hours = math.floor(total_hours)
    total_minutes = (total_hours - whole_hours) * 60
    whole_minutes = math.floor(total_minutes)
    total_seconds = (total_minutes - whole_minutes) * 60
    whole_seconds = math.floor(total_seconds)

    visual_mag = line[81:88]
    star_meta = {
        'constellation': line[61:66],
        'v_mag': visual_mag,
    }
    return DBEntry(
        catalog='IAU Star Database',
        source='http://www.pas.rochester.edu/~emamajek/WGSN/IAU-CSN.txt',
        designation='Named Star',
        catalog_id=line[36:49],
        names=safestr(line[0:18]),
        ra_h=whole_hours,
        ra_m=whole_minutes,
        ra_s=whole_seconds,
        dec_deg=line[112:123],
        dec_m=0.0,
        dec_s=0.0,
        mag=visual_mag,
        meta=star_meta,
    )

# Parse the IAU data file into iau_dataset, skipping lines that start with '#'.
with open('data/iau/db.dat', encoding='utf-8') as iau_file:
    iau_dataset.extend(
        parse_iau(record)
        for record in iau_file.readlines()
        if not record.startswith('#')
    )

## CSV Output Computations

In [9]:
# Concatenate all datasets; this order decides the row order of the output.
combined = [
    *ngc_dataset,
    *sac_dataset,
    *snrs_dataset,
    *yale_dataset,
    *iau_dataset,
]
total_objects = len(combined)
print(str(total_objects) + ' celestial objects')

34143 celestial objects


## Duplicate Ranking
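The following code is inefficient, but it will finish eventually. Objects are grouped into buckets by the whole-number part of their decimal right ascension. Each object that has no `dup_id` yet is given a new one and is then ranked, using a simple scoring heuristic, against the other objects in its bucket. Any object whose score is at most 0.0003 is treated as a duplicate and receives the same `dup_id`.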

**WARNING** This takes a short while

In [10]:
# Next duplicate-group id to hand out.
dup_id = 0

# Group the entries by the floor of their decimal right ascension so that
# each entry is only compared with entries from its own bucket.
ra_buckets = {}
for candidate in combined:
    ra_buckets.setdefault(math.floor(candidate.ra_decimal), []).append(candidate)

for candidate in combined:
    # Entries already claimed by an earlier group keep that group's id.
    if 'dup_id' not in candidate.meta:
        candidate.meta['dup_id'] = dup_id
        for neighbour in ra_buckets[math.floor(candidate.ra_decimal)]:
            # The candidate itself is skipped here too: it was tagged above.
            if 'dup_id' not in neighbour.meta and candidate.rank(neighbour) <= 0.0003:
                neighbour.meta['dup_id'] = dup_id
        dup_id += 1


## Final CSV generation

In [11]:
SEP = '|'
CSV_HEADER = 'SEP=' + SEP + '\n'

# determine headers
# Dicts are used as ordered sets: column names keep first-seen order.
headers = {}
meta_headers = {}

for entry in combined:
    for k in entry.as_obj():
        if k != 'meta':
            headers[k] = k
    for k in entry.meta:
        meta_headers[k] = k

header_list = list(headers)
meta_list = list(meta_headers)


def _csv_row(values):
    """Render one output row.

    Each value is stringified and passed through ``safestr`` (which removes
    newlines and '|', the separator) so raw catalogue text cannot break the
    row layout. Every cell, including the last, is followed by SEP and the
    row ends with CRLF.
    """
    return ''.join(safestr(str(value)) + SEP for value in values) + '\r\n'


# build header row, then one row per entry; rows are joined once at the end
# instead of growing a single string with +=.
csv_rows = [_csv_row(header_list + meta_list)]

# build csv
for entry in combined:
    obj = entry.as_obj()
    values = [obj[header] for header in header_list]
    # Meta keys an entry does not have become empty cells.
    values += [entry.meta.get(header, '') for header in meta_list]
    csv_rows.append(_csv_row(values))

CSV_DATA = ''.join(csv_rows)

# Save output
# newline='' writes the explicit '\r\n' row endings untouched on every
# platform (text mode would otherwise turn them into '\r\r\n' on Windows);
# utf-8 keeps non-ASCII names writable regardless of the locale encoding.
with open('database.csv', 'w', encoding='utf-8', newline='') as f:
    f.write(CSV_HEADER + CSV_DATA)

with open('db.dat', 'w', encoding='utf-8', newline='') as f:
    f.write(CSV_DATA)