In [1]:
%matplotlib inline
%env PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/home/ubuntu/im-photoz/Montage_v3.3/bin:/montage/bin

env: PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/home/ubuntu/im-photoz/Montage_v3.3/bin:/montage/bin


In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import shutil
import requests
import json
import bz2
import re
import subprocess
import math
from time import sleep

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import montage_wrapper as mw
from astropy.io import fits
from astropy import wcs

import warnings
warnings.filterwarnings("ignore")

In [3]:
# SQL to get data
def sample_one_sq_deg(ra, dec, n=500, dr="dr12", timeout=60):
    
    url = "http://skyserver.sdss.org/{}/en/tools/search/x_sql.aspx".format(dr)
    payload = {
        "format": "json",
        "cmd": """
            SELECT TOP {0} spec.specObjID, phot.objID,
                spec.ra, spec.dec,
                spec.class,
                spec.z, spec.zErr,
                phot.rerun, phot.run, phot.camcol, phot.field,
                phot.dered_u, phot.dered_g, phot.dered_r, phot.dered_i, phot.dered_z,
                phot.psfMag_u, phot.psfMag_g, phot.psfMag_r, phot.psfMag_i, phot.psfMag_z,
                phot.extinction_u, phot.extinction_g, phot.extinction_r, phot.extinction_i, phot.extinction_z
            FROM SpecObjAll AS spec
            JOIN PhotoObjAll AS phot
            ON spec.specObjID = phot.specObjID
            WHERE
                phot.clean = 1
                AND spec.zWarning = 0
                AND spec.ra >= {1}
                AND spec.ra < {2}
                AND spec.dec >= {3}
                AND spec.dec < {4}
                AND phot.dered_r > 0 and phot.dered_r < 40
                AND phot.expRad_r < 30
                AND phot.deVRad_r < 30
                AND spec.zErr < 0.1
                AND spec.z < 2
            ORDER BY NEWID()
        """.format(n, ra, ra + 1, dec, dec + 1).strip()
    }
    
    try:
        resp = requests.post(url, params=payload, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(e)
        return None

    data = resp.json()[0]['Rows']
    
    df = pd.DataFrame(data)
    
    df[["specObjID", "objID"]] = df[["specObjID", "objID"]].astype("object")

    return df

In [4]:
df = sample_one_sq_deg(170, 0)

df.to_csv("objects.csv", index=False)

print(df.head(10))
print('\n')
print(df['z'].head(10))

   camcol   class       dec   dered_g   dered_i   dered_r   dered_u   dered_z  \
0       4  GALAXY  0.172603  15.20676  14.27196  14.60115  16.65101  14.01617   
1       6  GALAXY  0.864346  21.46536  19.16658  20.20228  21.42334  18.62079   
2       6  GALAXY  0.954979  18.07721  16.71774  17.12099  19.94620  16.37535   
3       4  GALAXY  0.007604  21.14548  18.60237  19.41115  23.34092  18.26327   
4       4    STAR  0.000393  19.40408  18.96151  19.10806  20.36119  18.92983   
5       5  GALAXY  0.639260  18.24868  17.27957  17.65503  19.38414  17.07379   
6       5  GALAXY  0.760034  18.04411  16.70178  17.10160  19.99410  16.36309   
7       6  GALAXY  0.884404  16.42347  15.25037  15.63030  18.06894  14.93445   
8       5  GALAXY  0.703936  22.45325  19.76458  20.77152  25.20980  19.22251   
9       5  GALAXY  0.768809  22.21378  20.13114  20.94429  23.16538  19.89108   

   extinction_g  extinction_i  extinction_r  extinction_u  extinction_z  \
0      0.140125      0.077063    

In [5]:
# Download images
def fetch_fits(df, dirname="temp"):

    bands = [c for c in 'ugriz']

    if not os.path.exists(dirname):
        os.makedirs(dirname)

    for i, r in df.iterrows():

        url = "http://data.sdss3.org/sas/dr12/boss/photoObj/frames/{0}/{1}/{2}/".format(
            r["rerun"], r["run"], r["camcol"], r["field"])

        print("Downloading rerun: {}, run: {}, camcol: {}, field:{}".format(
            r["rerun"], r["run"], r["camcol"], r["field"]))
    
        for band in bands:

            filename = "frame-{4}-{1:06d}-{2}-{3:04d}.fits".format(
                r["rerun"], r["run"], r["camcol"], r["field"], band)
            filepath = os.path.join(dirname, filename)
            
            if os.path.exists(filepath):
                continue

            for _ in range(10):
                try:
                    resp = requests.get(url + filename + ".bz2")
                except:
                    sleep(1)
                    continue
                
                if resp.status_code == 200:
                    with open(filepath, "wb") as f:
                        img = bz2.decompress(resp.content)
                        f.write(img)
                    break
                else:
                    sleep(1)
                    continue

            if not os.path.exists(filepath):
                raise Exception

In [6]:
fetch_fits(df)

Downloading rerun: 301, run: 6793, camcol: 4, field:64
Downloading rerun: 301, run: 6793, camcol: 6, field:63
Downloading rerun: 301, run: 6793, camcol: 6, field:66
Downloading rerun: 301, run: 6793, camcol: 4, field:65
Downloading rerun: 301, run: 6793, camcol: 4, field:66
Downloading rerun: 301, run: 756, camcol: 5, field:363
Downloading rerun: 301, run: 756, camcol: 5, field:364
Downloading rerun: 301, run: 6793, camcol: 6, field:67
Downloading rerun: 301, run: 756, camcol: 5, field:364
Downloading rerun: 301, run: 756, camcol: 5, field:363
Downloading rerun: 301, run: 6793, camcol: 4, field:65
Downloading rerun: 301, run: 756, camcol: 4, field:367
Downloading rerun: 301, run: 756, camcol: 4, field:364
Downloading rerun: 301, run: 6793, camcol: 4, field:67
Downloading rerun: 301, run: 756, camcol: 5, field:367
Downloading rerun: 301, run: 6793, camcol: 5, field:68
Downloading rerun: 301, run: 6793, camcol: 4, field:69
Downloading rerun: 301, run: 6793, camcol: 5, field:65
Downloadin

In [7]:
def get_ref_list(df):

    ref_images = []
    
    for row in df.iterrows():
        r = row[1]
        filename = "frame-r-{1:06d}-{2}-{3:04d}.fits".format(r["rerun"], r["run"], r["camcol"], r["field"])
        ref_images.append(filename)

    return ref_images

ref_images = get_ref_list(df)

## Montage

In [8]:
# Montage: align images
def align_images(images, frame_dir="temp", registered_dir="temp"):

    if not os.path.exists(registered_dir):
        os.makedirs(registered_dir)
    
    for image in images:

        registered_path = [
            os.path.join(registered_dir, image.replace("frame-r-", "registered-{}-").format(b))
            for b in "ugriz"
            ]
        
        if all([os.path.exists(r) for r in registered_path]):
            print("Skipping {}...".format(image))
            continue
        else:
            print("Processing {}...".format(image))
    
        frame_path = [
            os.path.join(frame_dir, image.replace("frame-r-", "frame-{}-").format(b))
            for b in "ugriz"
            ]

        header = os.path.join(
            registered_dir,
            image.replace("frame", "header").replace(".fits", ".hdr")
            )

        mw.commands.mGetHdr(os.path.join(frame_dir, image), header)
        mw.reproject(
            frame_path, registered_path,
            header=header, exact_size=True, silent_cleanup=True, common=True
            )

    return None

In [9]:
align_images(ref_images)

Skipping frame-r-006793-4-0064.fits...
Skipping frame-r-006793-6-0063.fits...
Skipping frame-r-006793-6-0066.fits...
Skipping frame-r-006793-4-0065.fits...
Skipping frame-r-006793-4-0066.fits...
Skipping frame-r-000756-5-0363.fits...
Skipping frame-r-000756-5-0364.fits...
Skipping frame-r-006793-6-0067.fits...
Skipping frame-r-000756-5-0364.fits...
Skipping frame-r-000756-5-0363.fits...
Skipping frame-r-006793-4-0065.fits...
Skipping frame-r-000756-4-0367.fits...
Skipping frame-r-000756-4-0364.fits...
Skipping frame-r-006793-4-0067.fits...
Skipping frame-r-000756-5-0367.fits...
Skipping frame-r-006793-5-0068.fits...
Skipping frame-r-006793-4-0069.fits...
Skipping frame-r-006793-5-0065.fits...
Skipping frame-r-000756-4-0363.fits...
Skipping frame-r-000752-4-0181.fits...
Skipping frame-r-000756-4-0364.fits...
Skipping frame-r-006793-5-0068.fits...
Skipping frame-r-006793-6-0066.fits...
Skipping frame-r-006793-6-0066.fits...
Skipping frame-r-006793-6-0068.fits...
Skipping frame-r-002141-4

In [10]:
# Convert ra and dec values to pixel positions
def convert_catalog_to_pixels(df, dirname="temp"):

    if not os.path.exists(dirname):
        os.makedirs(dirname)

    pixels = []
    fits_list = []

    for i, r in df.iterrows():

        fits_file = "registered-r-{1:06d}-{2}-{3:04d}.fits".format(
            r["rerun"], r["run"], r["camcol"], r["field"])
        fits_path = os.path.join(dirname, fits_file)
            
        hdulist = fits.open(fits_path)

        w = wcs.WCS(hdulist[0].header, relax=False)
        
        px, py = w.all_world2pix(r["ra"], r["dec"], 1)

        fits_list.append(fits_file)
        pixels.append((i, px, py, r["class"]))

    for i, fits_file in enumerate(fits_list):
        ix, px, py, c = pixels[i]
        pixel_list = fits_file.replace(".fits", ".list")
        pixel_path = os.path.join(dirname, pixel_list)
        with open(pixel_path, "a") as fout:
            fout.write("{} {} {} {}\n".format(ix, px, py, c))
            
    df_pixels = pd.DataFrame(pixels, columns=['id', 'object_x_coord', 'object_y_coord', 'class']).drop('id', axis=1)

    return df_pixels

In [11]:
df_pixels = convert_catalog_to_pixels(df)
print(df_pixels.head(10))

df['object_x_coord'] = df_pixels['object_x_coord']
df['object_y_coord'] = df_pixels['object_y_coord']
print(df.head(10))
# print(df.columns)

  object_x_coord object_y_coord   class
0  1637.84336843  1068.85220149  GALAXY
1  294.872863624  1408.21539868  GALAXY
2   1118.6070358  93.5356738403  GALAXY
3   137.83215705  290.609046099  GALAXY
4  72.5019690916  647.842146132    STAR
5  160.229402246  988.975365263  GALAXY
6  1258.01572929  862.399276906  GALAXY
7  477.346392694  1133.80451308  GALAXY
8  748.131142603  157.338773323  GALAXY
9  1337.72553646  941.518805298  GALAXY

[10 rows x 3 columns]
   camcol   class       dec   dered_g   dered_i   dered_r   dered_u   dered_z  \
0       4  GALAXY  0.172603  15.20676  14.27196  14.60115  16.65101  14.01617   
1       6  GALAXY  0.864346  21.46536  19.16658  20.20228  21.42334  18.62079   
2       6  GALAXY  0.954979  18.07721  16.71774  17.12099  19.94620  16.37535   
3       4  GALAXY  0.007604  21.14548  18.60237  19.41115  23.34092  18.26327   
4       4    STAR  0.000393  19.40408  18.96151  19.10806  20.36119  18.92983   
5       5  GALAXY  0.639260  18.24868  17.27957  17

## Sextractor

In [12]:
%%writefile default.param
XMIN_IMAGE               Minimum x-coordinate among detected pixels                [pixel]
YMIN_IMAGE               Minimum y-coordinate among detected pixels                [pixel]
XMAX_IMAGE               Maximum x-coordinate among detected pixels                [pixel]
YMAX_IMAGE               Maximum y-coordinate among detected pixels                [pixel]
VECTOR_ASSOC(1)          #ASSOCiated parameter vector

Overwriting default.param


In [13]:
%%writefile default.sex
#-------------------------------- Catalog ------------------------------------
 
CATALOG_NAME     test.cat       # name of the output catalog
CATALOG_TYPE     ASCII_HEAD     # NONE,ASCII,ASCII_HEAD, ASCII_SKYCAT,
                                # ASCII_VOTABLE, FITS_1.0 or FITS_LDAC
PARAMETERS_NAME  default.param  # name of the file containing catalog contents
 
#------------------------------- Extraction ----------------------------------
 
DETECT_TYPE      CCD            # CCD (linear) or PHOTO (with gamma correction)
DETECT_MINAREA   3              # min. # of pixels above threshold
DETECT_THRESH    1.5            # <sigmas> or <threshold>,<ZP> in mag.arcsec-2
ANALYSIS_THRESH  1.5            # <sigmas> or <threshold>,<ZP> in mag.arcsec-2
 
FILTER           Y              # apply filter for detection (Y or N)?
FILTER_NAME      default.conv   # name of the file containing the filter
 
DEBLEND_NTHRESH  32             # Number of deblending sub-thresholds
DEBLEND_MINCONT  0.005          # Minimum contrast parameter for deblending
 
CLEAN            Y              # Clean spurious detections? (Y or N)?
CLEAN_PARAM      1.0            # Cleaning efficiency
 
MASK_TYPE        CORRECT        # type of detection MASKing: can be one of
                                # NONE, BLANK or CORRECT

#------------------------------ Photometry -----------------------------------
 
PHOT_APERTURES   5              # MAG_APER aperture diameter(s) in pixels
PHOT_AUTOPARAMS  2.5, 3.5       # MAG_AUTO parameters: <Kron_fact>,<min_radius>
PHOT_PETROPARAMS 2.0, 3.5       # MAG_PETRO parameters: <Petrosian_fact>,
                                # <min_radius>

SATUR_LEVEL      50000.0        # level (in ADUs) at which arises saturation
SATUR_KEY        SATURATE       # keyword for saturation level (in ADUs)
 
MAG_ZEROPOINT    0.0            # magnitude zero-point
MAG_GAMMA        4.0            # gamma of emulsion (for photographic scans)
GAIN             0.0            # detector gain in e-/ADU
GAIN_KEY         GAIN           # keyword for detector gain in e-/ADU
PIXEL_SCALE      1.0            # size of pixel in arcsec (0=use FITS WCS info)
 
#------------------------- Star/Galaxy Separation ----------------------------
 
SEEING_FWHM      1.2            # stellar FWHM in arcsec
STARNNW_NAME     default.nnw    # Neural-Network_Weight table filename
 
#------------------------------ Background -----------------------------------
 
BACK_SIZE        64             # Background mesh: <size> or <width>,<height>
BACK_FILTERSIZE  3              # Background filter: <size> or <width>,<height>
 
BACKPHOTO_TYPE   GLOBAL         # can be GLOBAL or LOCAL
 
#------------------------------ Check Image ----------------------------------
 
CHECKIMAGE_TYPE  SEGMENTATION   # can be NONE, BACKGROUND, BACKGROUND_RMS,
                                # MINIBACKGROUND, MINIBACK_RMS, -BACKGROUND,
                                # FILTERED, OBJECTS, -OBJECTS, SEGMENTATION,
                                # or APERTURES
CHECKIMAGE_NAME  check.fits     # Filename for the check-image
 
#--------------------- Memory (change with caution!) -------------------------
 
MEMORY_OBJSTACK  3000           # number of objects in stack
MEMORY_PIXSTACK  300000         # number of pixels in stack
MEMORY_BUFSIZE   1024           # number of lines in buffer
 
#----------------------------- Miscellaneous ---------------------------------
 
VERBOSE_TYPE     NORMAL         # can be QUIET, NORMAL or FULL
HEADER_SUFFIX    .head          # Filename extension for additional headers
WRITE_XML        N              # Write XML file (Y/N)?
XML_NAME         sex.xml        # Filename for XML output

#----------------------------- ASSOC parameters ---------------------------------

ASSOC_NAME       sky.list       # name of the ASCII file to ASSOCiate, the expected pixel 
                                # coordinates list given as [id, xpos, ypos]
ASSOC_DATA       1              # columns of the data to replicate (0=all), replicate id
                                # of the object in the SExtractor output file
ASSOC_PARAMS     2,3            # columns of xpos,ypos[,mag] in the expected pixel
                                # coordinates list
ASSOC_RADIUS     2.0            # cross-matching radius (pixels)
ASSOC_TYPE       NEAREST        # ASSOCiation method: FIRST, NEAREST, MEAN,
                                # MAG_MEAN, SUM, MAG_SUM, MIN or MAX
ASSOCSELEC_TYPE  MATCHED        # ASSOC selection type: ALL, MATCHED or -MATCHED

Overwriting default.sex


## My modification

In [14]:
def run_sex(df, filename, dirname="temp"): # Run sextractor on one file
    
    fpath = os.path.join(dirname, filename)
        
    list_file = filename.replace(".fits", ".list")
    list_path = os.path.join(dirname, list_file)

    config_file = filename.replace(".fits", ".sex")

    with open("default.sex", "r") as default:
        with open(config_file, "w") as temp:
            for line in default:
                line = re.sub(
                    r"^ASSOC_NAME\s+sky.list",
                    "ASSOC_NAME       {}".format(list_file),
                    line
                )
                temp.write(line)
    
    shutil.copy(list_path, os.getcwd())
    
    subprocess.call(["sex", "-c", config_file, fpath])

    os.remove(config_file)
    os.remove(os.path.join(os.getcwd(), list_file))
    
    return None

In [15]:
registered_all = [f.replace("frame-", "registered-") for f in ref_images] # 331 objects
registered_all_unique = np.unique(registered_all) # 38 unique objects

In [16]:
# Construct the new DataFrame with pixels of the stellar objects.
cat = pd.DataFrame()
pixels = pd.DataFrame()

copy = registered_all
# registered_all = registered_all[:10]

remaining = len(registered_all)
print ('Total number of images to pass = {num}'.format(num=remaining))
for f in registered_all:
    run_sex(df, f)
    
    # Get cat
    try:
        assoc = pd.read_csv(
            "test.cat",
            skiprows=5,
            sep="\s+",
            names=["xmin", "ymin", "xmax", "ymax", "match"]
        )
        assoc["file"] = f
        cat = cat.append(assoc)
    except:
        pass
    
    # Get pixels, use r band as a reference
    mask = fits.open('check.fits')  
    m = mask[0].data
    image = fits.open('temp/'+f)
    img = image[0].data
    
    # Get flux from all bands (ugriz)
    flux = {}
    flux_err = {}
    bands = ['u', 'g', 'r', 'i', 'z']
    for b in bands:
        path = f.replace("-r-", "-{}-".format(b))
        image = fits.open('temp/'+path)
        flux[b] = image[0].data
    f_u, f_g, f_r, f_i, f_z = flux.values()
    
    # Leave pixels that belong to objects only
    img[m == 0] = 99 # Background
    for objnum in np.unique(m):
        if objnum != 0:

            px, py = np.where(m == objnum)
                    
            u = f_u[m == objnum]
            g = f_g[m == objnum]
            r = f_r[m == objnum]
            i = f_i[m == objnum]
            z = f_z[m == objnum]

            mat = assoc.loc[objnum-1]['match']
            match = np.full(len(px), mat)
            pix = pd.DataFrame({'match': match, 'pixel_x_coord': px, 'pixel_y_coord': py,
                                'flux_u': u, 'flux_g': g, 'flux_r':r, 'flux_i':i, 'flux_z':z,
                                'flux_err_u': u, 'flux_err_g': g, 'flux_err_r':r, 'flux_err_i':i, 'flux_err_z':z})
            pixels = pixels.append(pix)
            
    remaining -= 1
    print ('Remaining number of images = {num}'.format(num=remaining))
    
print ('The job is finished.')

Total number of images to pass = 331
Remaining number of images = 330
Remaining number of images = 329
Remaining number of images = 328
Remaining number of images = 327
Remaining number of images = 326
Remaining number of images = 325
Remaining number of images = 324
Remaining number of images = 323
Remaining number of images = 322
Remaining number of images = 321
Remaining number of images = 320
Remaining number of images = 319
Remaining number of images = 318
Remaining number of images = 317
Remaining number of images = 316
Remaining number of images = 315
Remaining number of images = 314
Remaining number of images = 313
Remaining number of images = 312
Remaining number of images = 311
Remaining number of images = 310
Remaining number of images = 309
Remaining number of images = 308
Remaining number of images = 307
Remaining number of images = 306
Remaining number of images = 305
Remaining number of images = 304
Remaining number of images = 303
Remaining number of images = 302
Remain

In [21]:
if len(cat) > 0:
    cat["class"] = df.ix[cat["match"], "class"].values
    cat["objID"] = df.ix[cat["match"], "objID"].values
if len(pixels) > 0:
    pixels["objID"] = df.ix[pixels["match"], "objID"].values
    pixels["class"] = df.ix[pixels["match"], "class"].values 
    pixels['object_x_coord'] = df.ix[pixels["match"], 'object_x_coord'].values.astype(float)
    pixels['object_y_coord'] = df.ix[pixels["match"], 'object_y_coord'].values.astype(float)
    pixels["z"] = df.ix[pixels["match"], "z"].values
    pixels["zErr"] = df.ix[pixels["match"], "zErr"].values
    pixels["extinction_u"] = df.ix[pixels["match"],"extinction_u"].values
    pixels["extinction_g"] = df.ix[pixels["match"],"extinction_g"].values
    pixels["extinction_r"] = df.ix[pixels["match"],"extinction_r"].values
    pixels["extinction_i"] = df.ix[pixels["match"],"extinction_i"].values
    pixels["extinction_z"] = df.ix[pixels["match"],"extinction_z"].values
    if not "pixelID" in pixels.columns:
        pixels.insert(loc=0, value = np.arange(0, pixels['pixel_x_coord'].count()), column = "pixelID")

In [22]:
# Calculate distance
if len(pixels) > 0:
    x_pixels = np.array(pixels.pixel_x_coord)
    x_obj_pixels = np.array(pixels.object_x_coord)
    y_pixels = np.array(pixels.pixel_y_coord)
    y_obj_pixels = np.array(pixels.object_y_coord)
    pixels["distance_to_obj"] = ((x_pixels-x_obj_pixels)**2+(y_pixels-y_obj_pixels)**2)**0.5

In [23]:
print(pixels.head(10))

   pixelID  flux_err_g  flux_err_i  flux_err_r  flux_err_u  flux_err_z  \
0        0    0.051270    0.208025   -0.007340    0.101352    0.008824   
1        1    0.023193    0.192137    0.021358    0.117814    0.029060   
2        2    0.056946    0.129262    0.064258    0.111672    0.022452   
3        3    0.056946    0.079426    0.078783    0.089046    0.003547   
4        4    0.079346    0.203520    0.009866    0.048217    0.041955   
5        5    0.056946    0.236881   -0.012584    0.051214    0.051514   
6        6    0.062500    0.178337   -0.001029    0.111323    0.037046   
7        7    0.068115   -0.023973    0.112239    0.112541    0.015519   
8        8    0.068115    0.126306    0.058169    0.047494   -0.004199   
9        9    0.096313   -0.016586    0.017627    0.088577    0.020474   

     flux_g    flux_i    flux_r    flux_u    flux_z  match  pixel_x_coord  \
0  0.051270  0.208025 -0.007340  0.101352  0.008824    216            184   
1  0.023193  0.192137  0.021358

In [24]:
print(pixels.tail(10))

     pixelID  flux_err_g  flux_err_i  flux_err_r  flux_err_u  flux_err_z  \
631  1824814    0.049316   -0.030100   -0.000253    0.082915    0.012117   
632  1824815    0.049316    0.000758    0.034427    0.096775    0.022212   
633  1824816    0.063477    0.042393    0.020258    0.038932    0.020912   
634  1824817    0.058777    0.104269   -0.043277    0.085621    0.015460   
635  1824818    0.054016    0.062693   -0.024155    0.017647    0.023681   
636  1824819    0.049316    0.008622   -0.011305    0.037382    0.016931   
637  1824820    0.011459    0.207574   -0.005031    0.067494    0.023877   
638  1824821    0.058716    0.134437    0.039771    0.031831    0.019758   
639  1824822    0.068237    0.112534    0.026001    0.109368    0.022002   
640  1824823    0.082397    0.043079   -0.008515    0.085891    0.015009   

       flux_g    flux_i    flux_r    flux_u    flux_z  match  pixel_x_coord  \
631  0.049316 -0.030100 -0.000253  0.082915  0.012117     19           1063   
632  

In [25]:
print(pixels.distance_to_obj.describe())

count    1824824.000000
mean         818.877879
std          385.511484
min            8.132199
25%          577.749901
50%          805.334014
75%         1042.879143
max         2117.380295
Name: distance_to_obj, dtype: float64


In [26]:
def nanomaggie_to_luptitude(array, band):
    '''
    Converts nanomaggies (flux) to luptitudes (magnitude).

    http://www.sdss.org/dr12/algorithms/magnitudes/#asinh
    http://arxiv.org/abs/astro-ph/9903081
    '''
    b = {
        'u': 1.4e-10,
        'g': 0.9e-10,
        'r': 1.2e-10,
        'i': 1.8e-10,
        'z': 7.4e-10
    }
    nanomaggie = array * 1.0e-9 # fluxes are in nanomaggies

    luptitude = -2.5 / np.log(10) * (np.arcsinh((nanomaggie / (2 * b[band]))) + np.log(b[band]))
    
    return luptitude

In [27]:
pixels["lup_u"] = nanomaggie_to_luptitude(np.array(pixels.flux_u), 'u')
pixels["lup_g"] = nanomaggie_to_luptitude(np.array(pixels.flux_g), 'g')
pixels["lup_r"] = nanomaggie_to_luptitude(np.array(pixels.flux_r), 'r')
pixels["lup_i"] = nanomaggie_to_luptitude(np.array(pixels.flux_i), 'i')
pixels["lup_z"] = nanomaggie_to_luptitude(np.array(pixels.flux_z), 'z')

##  kNN

In [28]:
from sklearn import neighbors
from sklearn.cross_validation import train_test_split
from sklearn.utils import check_random_state

In [29]:
# Copy selected columns
pixels_copy = pixels[['objID', 'pixelID', 'z', 'zErr', 'lup_u', 'lup_g', 'lup_r', 'lup_i', 'lup_z']]
df_copy = df[['objID', 'z', 'zErr', 'class', 'object_x_coord', 'object_y_coord']]

In [30]:
# Split training and testing sets by each OBJECT not each pixel
def split(df=df_copy, pixels=pixels_copy, test_column=['z', 'zErr'], test_size=0.2, random_state=check_random_state(0), \
          near_center=False, pixel_distance_to_center=10):
    
    # Need revising later
    '''if near_center==True:
        rows_to_keep = pd.DataFrame()
        for i, r in pixels.iterrows():
            distance = ( ( float(r['pixel_x_coord']) - float(r['object_x_coord']) )**2
                        + ( float(r['pixel_y_coord']) - float(r['object_y_coord']) )**2 )**0.5
            if distance > pixel_distance_to_center:
                rows_to_keep = rows_to_keep.append(r)
        pixels_copy = rows_to_keep  '''   
    
    df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
                                                        df.drop(test_column, axis=1), 
                                                        df[test_column], 
                                                        test_size=test_size, 
                                                        random_state=random_state
                                                        )
    X_train = pd.DataFrame()
    X_test = pd.DataFrame()
    y_train = pd.DataFrame()
    y_test = pd.DataFrame()
    
    pc_test_col = pixels[[test_column[0], test_column[1], 'objID']]
    pc_no_test_col = pixels.drop(test_column, axis=1)
    
    for objID in df_X_train['objID']:
          X_train = X_train.append(pc_no_test_col[pc_no_test_col['objID']==objID])
          y_train = y_train.append(pc_test_col[pc_test_col['objID']==objID])
    for objID in df_X_test['objID']:
          X_test = X_test.append(pc_no_test_col[pc_no_test_col['objID']==objID])
          y_test = y_test.append(pc_test_col[pc_test_col['objID']==objID])    
        
    return X_train, X_test, y_train, y_test

In [31]:
X_train, X_test, y_train, y_test = split()

In [32]:
def normalize(df):
    if 'objID' in df.columns:
        df = df.drop('objID', axis=1)
    if 'pixelID' in df.columns:
        df = df.drop('pixelID', axis=1)        
    result = (df-df.min())/(df.max()-df.min())
    return result

In [33]:
X_train_normal, X_test_normal = map(normalize, [X_train, X_test])
y_train = y_train.drop('objID', axis=1)
y_test = y_test.drop('objID', axis=1)

In [34]:
print('X_train_normal')
print(X_train_normal.head(5))
print(len(X_train_normal))

print('X_test_normal')
print(X_test_normal.head(5))
print(len(X_test_normal))

print('y_train')
print(y_train.head(5))
print(len(y_train))

print('y_test')
print(y_test.head(5))
print(len(y_test))

X_train_normal
      lup_g     lup_i     lup_r     lup_u     lup_z
0  0.886099  0.854201  0.865673  0.925984  0.989057
1  0.917401  0.830035  0.863177  0.949768  0.990067
2  0.950848  0.794785  0.835422  0.925240  0.992169
3  0.932077  0.786521  0.876276  0.936203  0.990861
4  0.917401  0.806459  0.833456  0.910820  0.989168

[5 rows x 5 columns]
1445566
X_test_normal
      lup_g     lup_i     lup_r     lup_u     lup_z
0  0.963933  0.843354  0.846898  0.911298  0.991287
1  0.915602  0.743919  0.797403  0.896255  0.988779
2  0.924184  0.750543  0.793825  0.888679  0.989936
3  0.981909  0.796592  0.822178  0.864374  0.986020
4  0.875040  0.738422  0.826695  0.870578  0.989850

[5 rows x 5 columns]
379258
y_train
          z      zErr
0  0.151443  0.000023
1  0.151443  0.000023
2  0.151443  0.000023
3  0.151443  0.000023
4  0.151443  0.000023

[5 rows x 2 columns]
1445566
y_test
          z      zErr
0  0.521945  0.000184
1  0.521945  0.000184
2  0.521945  0.000184
3  0.521945  0.000184
4

In [35]:
def train_knn(X, y, n_neighbors=4):
    y = y_train.astype('float')
    knc = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors)
    model = knc.fit(X, y)
    y_predict = pd.DataFrame(knc.predict(X_test_normal), columns=['z', 'zErr'])
    return model, y_predict

In [38]:
def compute_accuracy(X_train, X_test, y_train, y_test, start=1, end=51):

    k = np.arange(start, end)
    scores = []
    for i in k:
        model, y_predict = train_knn(X_train, y_train, i)
        score = model.score(X_test, y_test)
        scores.append(score)
        print ("n_neighbors = {}, score = {}".format(i, score))
    return scores

scores = compute_accuracy(X_train_normal, X_test_normal, y_train, y_test)

n_neighbors = 1, score = -1.47282594389
n_neighbors = 2, score = -1.47282594389
n_neighbors = 3, score = -1.47282594389
n_neighbors = 4, score = -1.46437896748
n_neighbors = 5, score = -1.43642350464
n_neighbors = 6, score = -1.42018865544
n_neighbors = 7, score = -1.41440172897
n_neighbors = 8, score = -1.41265992001
n_neighbors = 9, score = -1.41227749197
n_neighbors = 10, score = -1.32738417471
n_neighbors = 11, score = -1.2590061885
n_neighbors = 12, score = -1.20084815858
n_neighbors = 13, score = -1.15607578094
n_neighbors = 14, score = -1.07181157457
n_neighbors = 15, score = -1.00892773699
n_neighbors = 16, score = -0.95917224353
n_neighbors = 17, score = -0.922323812283
n_neighbors = 18, score = -0.888932056119
n_neighbors = 19, score = -0.857901778022
n_neighbors = 20, score = -0.830736480437
n_neighbors = 21, score = -0.808235612478
n_neighbors = 22, score = -0.789669699798
n_neighbors = 23, score = -0.765385697386
n_neighbors = 24, score = -0.743764129106
n_neighbors = 25, 

In [39]:
scores2 = compute_accuracy(X_train_normal, X_test_normal, y_train, y_test, start=51, end=101)

n_neighbors = 51, score = -0.453265832159
n_neighbors = 52, score = -0.448548966408
n_neighbors = 53, score = -0.443927722824
n_neighbors = 54, score = -0.439712269111
n_neighbors = 55, score = -0.435646609836
n_neighbors = 56, score = -0.431557170463
n_neighbors = 57, score = -0.427356482687
n_neighbors = 58, score = -0.423298103367
n_neighbors = 59, score = -0.419452709558
n_neighbors = 60, score = -0.415652096882
n_neighbors = 61, score = -0.411888682217
n_neighbors = 62, score = -0.408040252721


KeyboardInterrupt: 

In [None]:
fig,ax = plt.subplots()
ax.plot(np.arange(1,51), scores)
ax.plot(np.arange(51,101), scores2)
ax.set(xlabel='n_neighbors', ylabel='score', title="Finding Best k")

In [40]:
model, y_predict = train_knn(X_train_normal, y_train, 50)
print(y_predict)

           z      zErr
0   0.330527  0.000089
1   0.269237  0.000088
2   0.359227  0.000082
3   0.141716  0.000023
4   0.234113  0.000086
5   0.472293  0.000158
6   0.350388  0.000089
7   0.222904  0.000064
8   0.155735  0.000052
9   0.158838  0.000048
10  0.225534  0.000043
11  0.202905  0.000049
12  0.167674  0.000053
13  0.006058  0.000027
14  0.100199  0.000023
15  0.215034  0.000099
16  0.146823  0.000023
17  0.094818  0.000033
18  0.308618  0.000104
19  0.184835  0.000042
20  0.166305  0.000021
21  0.429912  0.000135
22  0.251275  0.000059
23  0.147776  0.000012
24  0.331595  0.000139
25  0.197360  0.000041
26  0.407427  0.000085
27  0.299672  0.000063
28  0.411632  0.000169
29  0.581712  0.000207
30  0.339813  0.000255
31  0.220891  0.000056
32  0.387830  0.000080
33  0.100447  0.000017
34  0.283183  0.000120
35  0.157690  0.000035
36  0.183312  0.000077
37  0.367822  0.000134
38  0.513755  0.000151
39  0.237114  0.000054
40  0.369590  0.000113
41  0.335857  0.000099
42  0.16415

In [41]:
print(y_test)

           z      zErr
0   0.521945  0.000184
1   0.521945  0.000184
2   0.521945  0.000184
3   0.521945  0.000184
4   0.521945  0.000184
5   0.521945  0.000184
6   0.521945  0.000184
7   0.521945  0.000184
8   0.521945  0.000184
9   0.521945  0.000184
10  0.521945  0.000184
11  0.521945  0.000184
12  0.521945  0.000184
13  0.521945  0.000184
14  0.521945  0.000184
15  0.521945  0.000184
16  0.521945  0.000184
17  0.521945  0.000184
18  0.521945  0.000184
19  0.521945  0.000184
20  0.521945  0.000184
21  0.521945  0.000184
22  0.521945  0.000184
23  0.521945  0.000184
24  0.521945  0.000184
25  0.521945  0.000184
26  0.521945  0.000184
27  0.521945  0.000184
28  0.521945  0.000184
29  0.521945  0.000184
30  0.521945  0.000184
31  0.521945  0.000184
32  0.521945  0.000184
33  0.521945  0.000184
34  0.521945  0.000184
35  0.521945  0.000184
36  0.521945  0.000184
37  0.521945  0.000184
38  0.521945  0.000184
39  0.521945  0.000184
40  0.521945  0.000184
41  0.521945  0.000184
42  0.52194

In [58]:
df_c = df[['z', 'zErr', 'dered_u', 'dered_g', 'dered_r', 'dered_i', 'dered_z']]
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
                                                        df_c.drop(['z', 'zErr'], axis=1), 
                                                        df_c[['z', 'zErr']], 
                                                        test_size=0.2, 
                                                        random_state=check_random_state(0)
                                                        )

df_X_train_normal, df_X_test_normal= map(normalize, [df_X_train, df_X_test])
# df_model, df_y_predict = train_knn(df_X_train_normal, df_y_train, 50)

In [59]:
# print(df_y_predict)

In [60]:
# print(df_y_test)

In [2]:
y_test = y_test.set_index(np.arange(0, len(y_test)))
print(y_test)

NameError: name 'y_test' is not defined

In [1]:
fig1,ax1 = plt.subplots()
ax1.scatter(y_test, y_predict)


NameError: name 'plt' is not defined

In [None]:
# Clean up and make a pipeline
# Flux errors in the FITS file