In [1]:
%matplotlib inline

import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import astropy as ast
import pandas as pd

from astropy.coordinates import SkyCoord
from astropy.coordinates import ICRS, Galactic, FK4, FK5
from astropy.coordinates import Angle, Latitude, Longitude
import astropy.units as u

In [2]:
# progress meter for big loops
# note: progress is a percentage, so callers should pass values from 0 to 100

def progress_meter(progress):
    """Rewrite the current output line as a "loading... NN.N%" status.

    progress -- percentage complete; it is formatted with one decimal place.

    The leading carriage return moves the cursor back to the start of the
    line, so successive calls overwrite each other instead of scrolling.
    """
    message = "\rloading... %.1f%%" % progress
    out = sys.stdout
    out.write(message)
    # flush so the update is pushed out immediately rather than left buffered
    out.flush()

In [3]:
# directory holding the catalog and db csv files.
# set the ONCDB_ROOT environment variable to point somewhere else without
# editing this cell; the original location remains the default.
# os.path.join(..., '') guarantees a trailing separator, which the
# `root + filename` concatenations in the other cells rely on.
root = os.path.join(os.environ.get('ONCDB_ROOT', '/Users/alin/Documents/'), '')

### check contents of oncdb

In [4]:
# shows which catalogs are in oncdb and how many sources they have

# filename of db to be used
onc_curr_name = 'test_cat.csv'

# ====

# low_memory=False makes pandas infer each column's dtype from the whole file
onc_curr = pd.read_csv(root + onc_curr_name, low_memory=False)

# number of rows per source catalog
# (call form of print: same output on Python 2, and valid on Python 3)
print(onc_curr['catname'].value_counts())

ACS       3399
NICMOS    2116
WFPC2     1488
Name: catname, dtype: int64


### convert catalogs into the right format

In [None]:
# catalogs only *need* to contain catID, _RAJ2000 (deg), _DEJ2000 (deg)
# repeat observations of a source (i.e. same catID) are compressed to a single entry
# by the groupby, using the mean position of the repeats

# catalogs to be used, as a list of tuples -- ('catname', 'filename (csv)', 'catID column name')

cat_info = [('ACS', 'acs_full.csv', 'ONCacs'),
            ('WFPC2', 'wfpc2_full.csv', 'ONCpc2'),
            ('NICMOS', 'nicmos_full.csv', 'ONCnic3')]

# ====

# reduced frame for each catalog, keyed by its position in cat_info
cat_dict = dict()

for i, (catname, filename, id_col) in enumerate(cat_info):

    cat = pd.read_csv(root + filename)

    # use mean or median?
    # one row per catalog ID; the ID becomes the index of the grouped frame
    cat_basic = cat[[id_col, '_RAJ2000', '_DEJ2000']].groupby(id_col).mean()

    # expose the ID under a common column name and tag each row with its catalog
    cat_basic.insert(0, 'catID', cat_basic.index)
    cat_basic.insert(0, 'catname', catname)

    cat_dict[i] = cat_basic

# stack the catalogs in cat_info order; ignore_index gives a fresh 0..n-1 row index
new_cat = pd.concat([cat_dict[i] for i in range(len(cat_info))], ignore_index=True)

# oncID is not assigned here; the "group sources" cell inserts that column

# single formatted string: same output on Python 2, and valid on Python 3
print("new_cat has %d sources" % len(new_cat))

### add new cat to db

In [None]:
# filename of existing db
onc_ex_name = 'test_build.csv'

# filename for updated db
onc_up_name = 'test_add.csv'

# ====

# low_memory=False (as in the "check contents" cell) makes pandas infer each
# column's dtype from the whole file
onc_ex = pd.read_csv(root + onc_ex_name, low_memory=False)

# single formatted strings: same output on Python 2, and valid on Python 3
print('new_cat has %d entries' % len(new_cat))
print('oncdb has %d entries' % len(onc_ex))

In [None]:
c_new = SkyCoord(new_cat['_RAJ2000'], new_cat['_DEJ2000'], unit='degree')
c_onc = SkyCoord(onc_ex['_RAJ2000'], onc_ex['_DEJ2000'], unit='degree')

n_new = len(c_new)

# fill preallocated arrays and build each DataFrame once at the end, rather
# than growing a DataFrame one column per iteration
# layout: column k holds the separations (arcsec) from new_cat source k
cross_sep = np.empty((len(c_onc), n_new))   # rows: existing oncdb sources
self_sep = np.empty((n_new, n_new))         # rows: new_cat sources

for k in range(n_new):

    # sep between new_cat and existing oncdb sources
    cross_sep[:, k] = c_onc.separation(c_new[k]).arcsecond

    # internal sep between new_cat sources
    self_sep[:, k] = c_new.separation(c_new[k]).arcsecond

    progress_meter(k*100./n_new)

# default integer row/column labels (0..n-1), which the join cell shifts
cross_dist = pd.DataFrame(cross_sep)
self_dist = pd.DataFrame(self_sep)

In [None]:
# append the new catalog to the existing db:
#   - existing rows gain one distance column per new source (cross_dist)
#   - new rows get distances to the existing sources (cross_dist transposed)
#     followed by distances among themselves (self_dist)
# new sources are numbered after the existing ones, so every label that refers
# to a new source is shifted up by the number of existing rows

n_existing = len(onc_ex)

def _shift(label):
    return label + n_existing

nc_join = new_cat.rename(index=_shift)

cd_join = cross_dist.rename(columns=_shift)
sd_join = self_dist.rename(index=_shift, columns=_shift)

temp1 = onc_ex.join(cd_join)

new_rows = cd_join.transpose().join(sd_join)
temp2 = nc_join.join(new_rows)

# column labels are a mix of strings and integers at this point;
# make them all strings so the two halves line up when stacked
for frame in (temp1, temp2):
    frame.columns = frame.columns.astype(str)

# print (temp1.columns == temp2.columns).sum()

onc_up = pd.concat([temp1, temp2], ignore_index=True)

onc_up

In [None]:
# write the combined table (existing db rows followed by the new catalog rows)
# to the updated-db file; index=False keeps the pandas row index out of the csv
onc_up.to_csv(root + onc_up_name, index=False)

### completely build db

In [None]:
# filename for new db
# NOTE(review): this is the same filename that the "check contents" and
# "group sources" cells read, so saving a rebuild replaces the db they load
onc_build_name = 'test_cat.csv'

In [None]:
# generates df of all pairwise distances

# beware that this step can take a while -- 35 min was measured with 7000 sources
# when the distance frame was grown one column at a time
# (can probably be parallelized, but I don't know how)
# consequently, advise against completely rebuilding db unless *really* necessary

c_new = SkyCoord(new_cat['_RAJ2000'], new_cat['_DEJ2000'], unit='degree')

n_src = len(c_new)

# fill a preallocated array and build the DataFrame once at the end, rather
# than growing a DataFrame one column per iteration
# column k holds the separation (arcsec) of every source from source k
build_sep = np.empty((n_src, n_src))

for k in range(n_src):

    build_sep[:, k] = c_new.separation(c_new[k]).arcsecond

    progress_meter(k*100./n_src)

# default integer column labels 0..n-1, one per source
build_dist = pd.DataFrame(build_sep)

# catalog columns first, then the distance columns, side by side
onc_build = pd.concat([new_cat, build_dist], axis=1)

onc_build

In [None]:
# save the freshly built db (catalog columns followed by one distance column
# per source); index=False keeps the pandas row index out of the csv
onc_build.to_csv(root + onc_build_name, index=False)

### group sources

In [10]:
# the db must be completely built (all catalogs added) before grouping sources

# filename of db to be used
onc_gs_name = 'test_cat.csv'

# filename for db with sources cross-matched and grouped
onc_out_name = 'onc_out.csv'

# distance for xmatch (arcsec)
dist_crit = 1.

# whether to restrict to unique, non-ambiguous matches
unique_only = False

# ====

# low_memory=False (as in the "check contents" cell) makes pandas infer each
# column's dtype from the whole file
onc_gs = pd.read_csv(root + onc_gs_name, low_memory=False)

# bookkeeping columns filled in by the grouping loop:
# oncflag starts empty, oncID starts unassigned (NaN)
onc_gs.insert(0, 'oncflag', '')
onc_gs.insert(0, 'oncID', np.nan)

num_cats = onc_gs['catname'].nunique()

# single formatted string: same output on Python 2, and valid on Python 3
print('oncdb has %d sources in %d catalogs' % (len(onc_gs), num_cats))

oncdb has 7003 sources in 3 catalogs


In [11]:
# groups sources that lie within dist_crit of each other, chaining matches
# together (if a matches b and b matches c, all three end up in one group),
# then gives every member of a group the same oncID
#
# relies on row label k of onc_gs corresponding to distance column str(k)

# new source numbering starts at highest ACS number + 1
new_source = max(onc_gs.loc[onc_gs['catname'] == 'ACS', 'catID'].values) + 1


def _close_matches(label):
    """Return the set of row labels closer than dist_crit to source `label`."""
    near = onc_gs[str(label)] < dist_crit
    # read the labels straight off the boolean mask instead of building a
    # filtered copy of the whole (very wide) frame just to get its index
    return set(near.index[near.values].tolist())


# rows that already belong to a group
exclude = set()

for k in range(len(onc_gs)):

    if k in exclude:
        continue

    # find where dist < dist_crit
    group = _close_matches(k)

    # keep adding matches of matches until no new sources turn up;
    # only sources added on the previous pass need to be expanded again
    frontier = set(group)

    while frontier:
        reached = set()
        for x in frontier:
            reached.update(_close_matches(x))

        frontier = reached - group
        group.update(frontier)

    exclude.update(group)

    # .loc needs a list-like indexer (recent pandas versions reject a set)
    mindex = sorted(group)

    num_group = len(mindex)

    match = onc_gs.loc[mindex, ['catname', 'catID']]

    # check for multiple sources in same catalog (any duplicates will flag as True)
    ambiguous = bool(match.duplicated(subset='catname', keep=False).any())

    if ambiguous:
        # 'd<N>': group of N sources with more than one from the same catalog
        onc_gs.loc[mindex, 'oncflag'] += 'd' + str(num_group)

    # if only looking for uniques, ambiguous groups are flagged but get no oncID
    if not (ambiguous and unique_only):

        is_acs = match['catname'] == 'ACS'

        # use ACS number if it exists -- if multiple, use lowest
        if is_acs.any():
            onc_gs.loc[mindex, 'oncID'] = match.loc[is_acs, 'catID'].min()

        # otherwise give it a new number
        else:
            onc_gs.loc[mindex, 'oncID'] = new_source
            new_source += 1

    # reported for every processed group, including skipped ambiguous ones
    progress_meter(k*100./len(onc_gs))

if unique_only:
    # single formatted string: same output on Python 2, and valid on Python 3
    print('\n%d / %d sources non-ambiguously grouped'
          % (onc_gs['oncID'].count(), len(onc_gs)))

loading... 100.0%

In [16]:
# save the grouped db, which now carries the oncID and oncflag columns;
# index=False keeps the pandas row index out of the csv
onc_gs.to_csv(root + onc_out_name, index=False)

In [22]:
# sources that carry a flag, i.e. whose oncflag is not the empty string
# (the grouping loop writes 'd<N>' for members of an N-source group that
# contains more than one source from the same catalog)
blah = onc_gs.loc[onc_gs['oncflag'] != '', ['oncID','oncflag','catname','catID']]

# first row of each flagged group, keyed by oncID -- kept in a variable so it
# can be inspected; as a bare mid-cell expression it was computed and discarded
flagged_groups = blah.groupby('oncID').first()

# note: these counts are per source (row), not per group
# (call form of print: same output on Python 2, and valid on Python 3)
print(blah['oncflag'].value_counts())

d4    200
d2    154
d3    111
d5     55
d6     18
d8      8
Name: oncflag, dtype: int64
