In [1]:
#Miles' packages
from astropy.table import Table
import astropy.coordinates as coord
import glob
from astropy import units as u

#data processing
import pandas as pd
import numpy as np
import math as math
from tqdm import tqdm
import collections

#visualizations
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.use('Agg')
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import plotly.graph_objects as go

#models
from sklearn.cluster import DBSCAN
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

#data manipulation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

#for my sanity
import warnings
warnings.filterwarnings('ignore')

In [2]:
from astroquery.gaia import Gaia

Created TAP+ (v1.2.1) - Connection:
	Host: gea.esac.esa.int
	Use HTTPS: True
	Port: 443
	SSL Port: 443
Created TAP+ (v1.2.1) - Connection:
	Host: geadata.esac.esa.int
	Use HTTPS: True
	Port: 443
	SSL Port: 443


In [3]:
stream_data_base = 'gaia_mock_streams/'
stream_files = glob.glob(stream_data_base + 'stream*.fits.gz')

We now have all the stream files:

In [4]:
print(len(stream_files))
stream_files[:5]

322


['gaia_mock_streams/stream-178.fits.gz',
 'gaia_mock_streams/stream-9528.fits.gz',
 'gaia_mock_streams/stream-9549.fits.gz',
 'gaia_mock_streams/stream-1954.fits.gz',
 'gaia_mock_streams/stream-2408.fits.gz']

#### Let us generate a CSV with new ra_min, ra_max, dec_min, and dec_max

In [28]:
stream_name = []
ra_min = []
ra_max = []
dec_min = []
dec_max = []

for stream_n in tqdm(stream_files[20:100]):
    table = Table.read(stream_n, format='fits')
    stream = table.to_pandas()
    
    stream_name.append(stream_n)
    ra_min.append(min(stream.ra))
    ra_max.append(max(stream.ra))
    dec_min.append(min(stream.dec))
    dec_max.append(max(stream.dec))

100%|██████████| 80/80 [00:02<00:00, 27.93it/s]


In [38]:
cut_untouched = pd.DataFrame()
cut_untouched['stream_file']= stream_name
cut_untouched['ra_min']= ra_min
cut_untouched['ra_max']= ra_max
cut_untouched['dec_min']= dec_min
cut_untouched['dec_max']= dec_max

cut_untouched.to_csv("cuts/cuts3.csv", index = False)

In [39]:
#relevant cut for each stream
#cuts1 = pd.read_csv("cuts/cuts.csv")
#cuts2 = pd.read_csv("cuts/cuts2.csv")
#cuts = cuts1.append(cuts2, ignore_index = True, sort = True)

#untouched cuts (validation / performance reporting)
cuts = pd.read_csv("cuts/cuts3.csv")

In [40]:
cuts.head()

Unnamed: 0,stream_file,ra_min,ra_max,dec_min,dec_max
0,gaia_mock_streams/stream-6739.fits.gz,5.006626,353.706544,-72.726224,62.99993
1,gaia_mock_streams/stream-4624.fits.gz,3.415784,342.776381,-81.250596,88.556886
2,gaia_mock_streams/stream-5698.fits.gz,7.388297,359.337578,-83.504925,82.960142
3,gaia_mock_streams/stream-2985.fits.gz,152.502007,315.609415,-75.067902,81.916498
4,gaia_mock_streams/stream-847.fits.gz,0.225259,359.99168,-41.113838,78.557753


#### let us write a function to extract the Gaia noise points based on some inputs

In [41]:
def obtain_noise(min_ra, max_ra, min_dec, max_dec, max_rel_err, n_points):
    
    qry = f" \n\
    select top {n_points} source_id, \n\
    dr2.ra, \n\
    dr2.dec, \n\
    parallax, \n\
    parallax_error, \n\
    pmra, \n\
    pmdec, \n\
    phot_g_mean_mag,\n\
    phot_bp_mean_mag, \n\
    phot_rp_mean_mag, \n\
    bp_rp, \n\
    bp_g, \n\
    g_rp\n\
    from gaiadr2.gaia_source as dr2 \n\
    where dr2.ra > {min_ra} and dr2.ra < {max_ra} and dr2.dec > {min_dec} and dr2.dec < {max_dec} \n\
    and parallax is not null \n\
    and parallax_error is not null \n\
    and abs(dr2.parallax/dr2.parallax_error) < {max_rel_err} \n\
    and pmra is not null \n\
    and pmdec is not null \n\
    and phot_g_mean_mag is not null \n\
    and phot_bp_mean_mag is not null \n\
    and phot_rp_mean_mag is not null \n\
    and bp_rp is not null \n\
    and bp_g is not null \n\
    and g_rp is not null \n\
    order by random_index"

    data_noise = Gaia.launch_job_async(qry).get_results().to_pandas()
    
    return data_noise

#### For each mock stream, we wish to obtain a fixed test set that represents the realistic ratio of non-stream stars to stream stars that we expect after applying an isochrone filter.
#### - This ratio was previously determined to be ~400

In [48]:
def obtain_test_set(list_of_stellar_streams, multiple):

    for i in tqdm(list_of_stellar_streams):
        table = Table.read(i, format='fits')
        stream = table.to_pandas()
        
        idx = cuts.index[cuts.stream_file==str(i)][0]

        ra_min = cuts.loc[idx].ra_min
        ra_max = cuts.loc[idx].ra_max
        dec_min = cuts.loc[idx].dec_min
        dec_max = cuts.loc[idx].dec_max
        
        #restrict stream to relevant portion
        stream = stream.query('ra > ' + str(ra_min) + ' & ra < ' + str(ra_max) + ' & dec > ' + str(dec_min) + ' & dec < ' + str(dec_max))

        #obtain noise points such that:
        #- the ratio of stream to noise points in the test set is 1:multiple
        #- this is required because KNN introduces bias when the ratios are imbalanced
        n_points = len(stream) * multiple
        #use max_rel_err of 0.5
        max_rel_err =  0.5

        #we now select our noise points that we will incorporate into our training and test set
        noise_points = obtain_noise(ra_min, ra_max, dec_min, dec_max, max_rel_err, n_points)

        #label our data as "not part of the stream"
        noise_points['stream_mask'] = False
        
        #send to csv
        name = i[18:-8]+'_mul_400_total_noise.csv'
        noise_points.to_csv(name)

    return noise_points

In [47]:
viable_streams = []

for i in cuts.stream_file:
    table = Table.read(i, format='fits')
    stream = table.to_pandas()
    
    idx = cuts.index[cuts.stream_file==str(i)][0]

    ra_min = cuts.loc[idx].ra_min
    ra_max = cuts.loc[idx].ra_max
    dec_min = cuts.loc[idx].dec_min
    dec_max = cuts.loc[idx].dec_max

    #restrict stream to relevant portion
    stream2 = stream.query('ra > ' + str(ra_min) + ' & ra < ' + str(ra_max) + ' & dec > ' + str(dec_min) + ' & dec < ' + str(dec_max))
    
    if len(stream2) < 500:
        viable_streams.append(i)
        
print('we are dealing with ' + str(len(viable_streams)) + ' streams with n_stars < 500')

we are dealing with 20 streams with n_stars < 500


##### Note that stream number 2985 shows up in the first set of 10 stars so we're only really dealing with 19 stars

In [49]:
####ONLY NEED TO RUN THIS ONCE
test2 = obtain_test_set(viable_streams,400)

  0%|          | 0/20 [00:00<?, ?it/s]

INFO: Query finished. [astroquery.utils.tap.core]


  5%|▌         | 1/20 [01:06<21:06, 66.63s/it]

INFO: Query finished. [astroquery.utils.tap.core]


 10%|█         | 2/20 [01:44<17:22, 57.93s/it]

INFO: Query finished. [astroquery.utils.tap.core]


 15%|█▌        | 3/20 [02:25<15:00, 52.95s/it]

INFO: Query finished. [astroquery.utils.tap.core]


 20%|██        | 4/20 [03:36<15:32, 58.29s/it]

INFO: Query finished. [astroquery.utils.tap.core]


 25%|██▌       | 5/20 [04:27<14:00, 56.02s/it]

INFO: Query finished. [astroquery.utils.tap.core]


 30%|███       | 6/20 [04:51<10:50, 46.45s/it]

INFO: Query finished. [astroquery.utils.tap.core]


 35%|███▌      | 7/20 [07:13<16:17, 75.17s/it]

INFO: Query finished. [astroquery.utils.tap.core]


 40%|████      | 8/20 [07:28<11:24, 57.03s/it]

INFO: Query finished. [astroquery.utils.tap.core]


 45%|████▌     | 9/20 [08:03<09:14, 50.42s/it]

INFO: Query finished. [astroquery.utils.tap.core]


 50%|█████     | 10/20 [09:12<09:19, 56.00s/it]

INFO: Query finished. [astroquery.utils.tap.core]


 55%|█████▌    | 11/20 [10:30<09:24, 62.68s/it]

INFO: Query finished. [astroquery.utils.tap.core]


 65%|██████▌   | 13/20 [11:46<05:29, 47.10s/it]

INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]


 70%|███████   | 14/20 [12:22<04:21, 43.60s/it]

INFO: Query finished. [astroquery.utils.tap.core]


 75%|███████▌  | 15/20 [12:39<02:58, 35.67s/it]

INFO: Query finished. [astroquery.utils.tap.core]


 85%|████████▌ | 17/20 [19:50<05:39, 113.03s/it]

INFO: Query finished. [astroquery.utils.tap.core]


 90%|█████████ | 18/20 [20:06<02:47, 83.94s/it] 

INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]


 95%|█████████▌| 19/20 [20:21<01:03, 63.14s/it]

INFO: Query finished. [astroquery.utils.tap.core]


100%|██████████| 20/20 [20:28<00:00, 61.44s/it]
