In [3]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import astropy as ast
import pandas as pd
import sys

from astropy.coordinates import SkyCoord
from astropy.coordinates import ICRS, Galactic, FK4, FK5
from astropy.coordinates import Angle, Latitude, Longitude
import astropy.units as u

In [4]:
# progress meter for big loops
# note: progress is printed as a percentage, so it must be a number from 0 to 100

def progress_meter(progress):
    """Redraw a one-line 'loading... NN.N%' readout on stdout.

    progress -- percentage complete; it is formatted with one decimal place.

    The text starts with a carriage return and has no trailing newline, so
    successive calls overwrite each other on the same output line. stdout is
    flushed so the readout appears immediately.
    """
    message = "\rloading... %.1f%%" % progress
    out = sys.stdout
    out.write(message)
    out.flush()

In [5]:
# base directory for every csv this notebook reads or writes; filenames are
# appended to it by plain string concatenation, so the trailing slash is required
root = '/Users/alin/Box Sync/alin_17/'

### check contents of oncdb

In [6]:
# shows which catalogs are in oncdb and how many sources they have

# filename of db to be used (looked up under `root`)
onc_curr_name = 'test_build.csv'

# ====

onc_curr = pd.read_csv(root + onc_curr_name)

# one row per catalog name, with the number of db entries carrying that name
# (parenthesised single-argument print behaves the same on Python 2 and 3)
print(onc_curr['catname'].value_counts())

ACS      3399
WFPC2    1488
Name: catname, dtype: int64


### bash catalogs into right format

In [4]:
# catalogs only *need* to contain catID, _RAJ2000 (deg), _DEJ2000 (deg)
# repeat observations of a source (i.e. same catID) are compressed to a single
# entry by the groupby, which averages the positions of the repeats

# catalogs to be used, as a list of tuples -- ('catname', 'filename (csv)', 'catID column name')

cat_info = [('ACS', 'acs_full.csv', 'ONCacs'),
            ('WFPC2', 'wfpc2_full.csv', 'ONCpc2'),
            ('NICMOS', 'nicmos_full.csv', 'ONCnic3')]

# ====

# reduced per-catalog frames, keyed by the catalog's position in cat_info
cat_dict = dict()

for i, (catname, filename, id_col) in enumerate(cat_info):

    cat = pd.read_csv(root + filename)

    # use mean or median? (currently the mean position per catalog ID)
    cat_basic = cat[[id_col, '_RAJ2000', '_DEJ2000']].groupby(id_col).mean()

    # the groupby leaves the catalog's own ID as the index; copy it into a
    # 'catID' column and tag every row with the catalog name
    cat_basic.insert(0, 'catID', cat_basic.index)
    cat_basic.insert(0, 'catname', catname)

    cat_dict[i] = cat_basic

# stack the catalogs in cat_info order and renumber the rows 0..N-1
new_cat = pd.concat([cat_dict[i] for i in range(len(cat_info))], ignore_index=True)

# new_cat.insert(0,'oncID',np.nan)

# single pre-formatted argument so the print works on Python 2 and 3 alike
print("new_cat has %d sources" % len(new_cat))

new_cat has 7003 sources


### add new cat to db

In [26]:
# filename of existing db
onc_ex_name = 'test_build.csv'

# filename for updated db
onc_up_name = 'test_add.csv'

# ====

onc_ex = pd.read_csv(root + onc_ex_name)

# sanity check of the two inputs about to be merged; new_cat must already be
# defined by the catalog-formatting cell
# (single pre-formatted argument so the prints work on Python 2 and 3 alike)
print('new_cat has %d entries' % len(new_cat))
print('oncdb has %d entries' % len(onc_ex))

new_cat has 2116 entries
oncdb has 4887 entries


In [27]:
# Angular separations (arcsec) needed to append new_cat to the existing db:
#   cross_dist: row r, column k -> existing oncdb source r vs. new_cat source k
#   self_dist:  row r, column k -> new_cat source r vs. new_cat source k
c_new = SkyCoord(new_cat['_RAJ2000'], new_cat['_DEJ2000'], unit='degree')
c_onc = SkyCoord(onc_ex['_RAJ2000'], onc_ex['_DEJ2000'], unit='degree')

n_new = len(c_new)

# Fill preallocated arrays and wrap them in DataFrames once at the end,
# instead of enlarging an empty DataFrame one column at a time.
cross_sep = np.empty((len(c_onc), n_new))
self_sep = np.empty((n_new, n_new))

for k in range(n_new):

    # sep between new_cat source k and every existing oncdb source
    cross_sep[:, k] = c_onc.separation(c_new[k]).arcsecond

    # internal sep between new_cat source k and every new_cat source
    self_sep[:, k] = c_new.separation(c_new[k]).arcsecond

    # k + 1 so the meter finishes on exactly 100%
    progress_meter((k + 1) * 100. / n_new)

# default integer row/column labels (0..n-1) match what the per-column
# assignment used to produce, so downstream label arithmetic is unaffected
cross_dist = pd.DataFrame(cross_sep)
self_dist = pd.DataFrame(self_sep)

loading... 100.0%

In [29]:
# join existing dist df with sep_cross & sep_self dfs
# this looks (and probably is) stupid, but it works for now...
#
# Goal: one frame whose first len(onc_ex) rows are the existing db entries and
# whose remaining rows are the new_cat entries, each row carrying separations
# to every source (existing and new). Everything is lined up by label, so the
# new sources' labels are shifted up by len(onc_ex) to follow the existing ones.

# new_cat rows relabelled to start at len(onc_ex)
nc_join = new_cat.rename(index = lambda x: (x + len(onc_ex)), inplace=False)

# columns of both distance frames (one per new source) get the same offset
cd_join = cross_dist.rename(columns = lambda x: (x + len(onc_ex)), inplace=False)
sd_join = self_dist.rename(columns = lambda x: (x + len(onc_ex)), inplace=False)

# rows of self_dist are new sources too, so shift them as well (in place)
sd_join.rename(index = lambda x: (x + len(onc_ex)), inplace=True)

# existing rows: append the columns of separations to each new source
temp1 = onc_ex.join(cd_join)
# new rows: transposing cd_join puts the new sources on the rows (separations
# to existing sources), sd_join adds separations to the other new sources
temp2 = nc_join.join(cd_join.transpose().join(sd_join))

# make all column labels strings so the two frames share the same labels
# NOTE(review): presumably needed because onc_ex's distance columns come back
# from the csv as strings while the freshly computed ones are ints -- confirm
temp1.columns = temp1.columns.astype(str)
temp2.columns = temp2.columns.astype(str)

# print (temp1.columns == temp2.columns).sum()

# stack existing + new rows and renumber them 0..N-1
onc_up = pd.concat([temp1,temp2], ignore_index=True)

# bare expression: displays the merged frame as the cell output
onc_up

Unnamed: 0,catname,catID,_RAJ2000,_DEJ2000,0,1,2,3,4,5,...,6993,6994,6995,6996,6997,6998,6999,7000,7001,7002
0,ACS,1,83.541204,-5.373961,0.000000,34.394662,33.589484,107.434492,30.058728,140.638723,...,1548.110167,1564.876363,1567.185209,1573.046081,1578.051282,1579.182943,1582.550806,1584.180128,1589.888821,1596.038659
1,ACS,2,83.542708,-5.364525,34.394662,0.000000,63.484570,73.048407,27.702766,106.277861,...,1543.753960,1561.639150,1563.345905,1569.426675,1574.706649,1575.597966,1579.400487,1580.834103,1586.272160,1592.455190
2,ACS,3,83.546296,-5.381794,33.589484,63.484570,0.000000,133.978679,44.171191,166.397619,...,1529.570749,1545.419770,1548.215135,1553.895199,1558.676585,1560.002031,1563.018372,1564.804457,1570.729918,1576.850777
3,ACS,4,83.546446,-5.344578,107.434492,73.048407,133.978679,0.000000,90.744534,33.296242,...,1535.044643,1555.287555,1555.707879,1562.247931,1568.105348,1568.482902,1573.205947,1574.220229,1579.072889,1585.317142
4,ACS,5,83.548421,-5.369708,30.058728,27.702766,44.171191,90.744534,0.000000,122.646883,...,1522.626254,1539.932717,1541.948369,1547.915866,1553.055545,1554.069819,1557.651150,1559.184067,1564.760925,1570.927248
5,ACS,6,83.548842,-5.335642,140.638723,106.277861,166.397619,33.296242,122.646883,0.000000,...,1529.686259,1550.987938,1550.822211,1557.566687,1563.682849,1563.826216,1568.965044,1569.787680,1574.370415,1580.638194
6,ACS,7,83.550067,-5.411064,137.296071,169.603788,106.235235,239.701200,148.998435,271.554697,...,1519.591397,1531.863843,1536.531849,1541.498642,1545.399944,1547.471723,1549.116133,1551.506341,1558.253683,1564.245187
7,ACS,8,83.550104,-5.405411,117.627688,149.557538,86.109670,219.390907,128.672266,251.209126,...,1518.206970,1531.175751,1535.482711,1540.589338,1544.662597,1546.590479,1548.501515,1550.775352,1557.365823,1563.384499
8,ACS,9,83.550262,-5.370828,34.368934,35.326140,41.958803,95.484691,7.732875,126.771813,...,1515.915668,1533.097823,1535.179319,1541.122478,1546.232189,1547.272630,1550.806890,1552.360842,1557.967176,1564.129835
9,ACS,10,83.550942,-5.419003,165.864766,198.328722,134.983381,268.414146,177.691868,300.193973,...,1518.683475,1529.968231,1535.144670,1539.912438,1543.569814,1545.844126,1547.111745,1549.665445,1556.632280,1562.583684


In [30]:
# write the updated db to root + onc_up_name; the row index is not saved
onc_up.to_csv(root + onc_up_name, index=False)

### completely build db

In [5]:
# filename for new db (appended to `root` when the built db is saved)
onc_build_name = 'test_mean.csv'

In [6]:
# generates df of all pairwise distances (arcsec) between new_cat sources

# beware that this step takes a while -- the original column-by-column
# DataFrame version took e.g. 35 min with 7000 sources
# (can probably be parallelized, but I don't know how)
# consequently, advise against completely rebuilding db unless *really* necessary

c_new = SkyCoord(new_cat['_RAJ2000'], new_cat['_DEJ2000'], unit='degree')

n_src = len(c_new)

# Fill one preallocated array and build the DataFrame once at the end,
# instead of enlarging an empty DataFrame one column at a time.
sep_matrix = np.empty((n_src, n_src))

for k in range(n_src):

    # column k: separation of source k from every source (including itself)
    sep_matrix[:, k] = c_new.separation(c_new[k]).arcsecond

    # k + 1 so the meter finishes on exactly 100%
    progress_meter((k + 1) * 100. / n_src)

# default integer labels 0..n-1 on both axes, as the per-column assignment produced
build_dist = pd.DataFrame(sep_matrix)

# source info columns followed by one distance column per source
onc_build = pd.concat([new_cat, build_dist], axis=1)

# bare expression: displays the built frame as the cell output
onc_build

loading... 100.0%

Unnamed: 0,catname,catID,_RAJ2000,_DEJ2000,0,1,2,3,4,5,...,6993,6994,6995,6996,6997,6998,6999,7000,7001,7002
0,ACS,1,83.541204,-5.373961,0.000000,34.394662,33.589484,107.434492,30.058728,140.638723,...,1548.110167,1564.876363,1567.185209,1573.046081,1578.051282,1579.182943,1582.550806,1584.180128,1589.888821,1596.038659
1,ACS,2,83.542708,-5.364525,34.394662,0.000000,63.484570,73.048407,27.702766,106.277861,...,1543.753960,1561.639150,1563.345905,1569.426675,1574.706649,1575.597966,1579.400487,1580.834103,1586.272160,1592.455190
2,ACS,3,83.546296,-5.381794,33.589484,63.484570,0.000000,133.978679,44.171191,166.397619,...,1529.570749,1545.419770,1548.215135,1553.895199,1558.676585,1560.002031,1563.018372,1564.804457,1570.729918,1576.850777
3,ACS,4,83.546446,-5.344578,107.434492,73.048407,133.978679,0.000000,90.744534,33.296242,...,1535.044643,1555.287555,1555.707879,1562.247931,1568.105348,1568.482902,1573.205947,1574.220229,1579.072889,1585.317142
4,ACS,5,83.548421,-5.369708,30.058728,27.702766,44.171191,90.744534,0.000000,122.646883,...,1522.626254,1539.932717,1541.948369,1547.915866,1553.055545,1554.069819,1557.651150,1559.184067,1564.760925,1570.927248
5,ACS,6,83.548842,-5.335642,140.638723,106.277861,166.397619,33.296242,122.646883,0.000000,...,1529.686259,1550.987938,1550.822211,1557.566687,1563.682849,1563.826216,1568.965044,1569.787680,1574.370415,1580.638194
6,ACS,7,83.550067,-5.411064,137.296071,169.603788,106.235235,239.701200,148.998435,271.554697,...,1519.591397,1531.863843,1536.531849,1541.498642,1545.399944,1547.471723,1549.116133,1551.506341,1558.253683,1564.245187
7,ACS,8,83.550104,-5.405411,117.627688,149.557538,86.109670,219.390907,128.672266,251.209126,...,1518.206970,1531.175751,1535.482711,1540.589338,1544.662597,1546.590479,1548.501515,1550.775352,1557.365823,1563.384499
8,ACS,9,83.550262,-5.370828,34.368934,35.326140,41.958803,95.484691,7.732875,126.771813,...,1515.915668,1533.097823,1535.179319,1541.122478,1546.232189,1547.272630,1550.806890,1552.360842,1557.967176,1564.129835
9,ACS,10,83.550942,-5.419003,165.864766,198.328722,134.983381,268.414146,177.691868,300.193973,...,1518.683475,1529.968231,1535.144670,1539.912438,1543.569814,1545.844126,1547.111745,1549.665445,1556.632280,1562.583684


In [7]:
# write the freshly built db to root + onc_build_name; the row index is not saved
onc_build.to_csv(root + onc_build_name, index=False)

### group sources

In [7]:
# the db must be completely built (all catalogs added) before grouping sources

# filename of db to be used
onc_gs_name = 'test_gen.csv'

# filename for db with sources cross-matched and grouped
onc_out_name = 'onc_out.csv'

# distance for xmatch (arcsec)
dist_crit = 1.

# whether to restrict to unique, non-ambiguous matches
unique_only = True

# ====

onc_gs = pd.read_csv(root + onc_gs_name)

# bookkeeping columns filled in by the grouping step:
#   oncflag -- starts empty, collects flag characters
#   oncID   -- starts as NaN, i.e. "not yet assigned to a group"
onc_gs.insert(0, 'oncflag', '')

onc_gs.insert(0, 'oncID', np.nan)

# number of distinct catalog names present in the db
num_cats = onc_gs['catname'].nunique()

# single pre-formatted argument so the print works on Python 2 and 3 alike
print('oncdb has %d sources in %d catalogs' % (len(onc_gs), num_cats))

oncdb has 7003 sources in 3 catalogs


In [8]:
# Assign a shared oncID to every set of sources lying within dist_crit of a
# given source. Ambiguous sets (more than one source from the same catalog)
# are flagged with 'd' in oncflag and left without an oncID when unique_only
# is set.

# new source numbering starts at highest ACS number + 1
new_source = max(onc_gs.loc[onc_gs['catname'] == 'ACS', 'catID'].values) + 1

exclude = []

# NOTE: the loop is currently restricted to the single source 1085 (debugging
# subset); use range(len(onc_gs)) to process the whole db.
for k in range(1085, 1086):

    # rows whose separation from source k (column str(k)) is below dist_crit;
    # this includes source k itself
    m = onc_gs.loc[onc_gs[str(k)] < dist_crit]

    mindex = m[str(k)].index.tolist()

    # placeholder for the transitive-matching step below
    mindex_updated = []

    # TODO (still in progress): keep adding the matches of each matched source
    # until no new values are added, so that groups are closed under
    # "within dist_crit of any member". Not implemented yet; only direct
    # matches of source k are grouped.

    match = onc_gs.loc[mindex, ['catname', 'catID']]

    print(match)

    # check for multiple sources in same catalog (any duplicates will flag as True)
    if match.duplicated(subset='catname', keep=False).any() and unique_only:

        onc_gs.loc[mindex, 'oncflag'] += 'd'

        # skip assigning a number and continue on to the next source
        continue

    # use ACS number if it exists -- if multiple, use lowest
    if 'ACS' in match['catname'].values:
        onc_gs.loc[mindex, 'oncID'] = min(match.loc[match['catname'] == 'ACS', 'catID'].values)

    # otherwise give it a new number
    else:
        onc_gs.loc[mindex, 'oncID'] = new_source
        new_source += 1

    progress_meter(k*100./len(onc_gs))

if unique_only:
    # count() ignores NaN, so this is the number of rows that received an oncID
    # (single pre-formatted argument so the print works on Python 2 and 3 alike)
    print('\n%d / %d sources non-ambiguously grouped' % (onc_gs['oncID'].count(), len(onc_gs)))

# print onc_gs['oncflag'].value_counts()

# onc_gs

     catname  catID
1085     ACS   2470
1086     ACS   2472
4133   WFPC2    818
4134   WFPC2    819

0 / 7003 sources non-ambiguously grouped


In [40]:
# write the grouped db (including the oncID / oncflag columns) to
# root + onc_out_name; the row index is not saved
onc_gs.to_csv(root + onc_out_name, index=False)