In [2]:
from __future__ import print_function
from __future__ import division

import os
import math

import numpy as np
from astropy.io import fits
from astropy.table import Table
from scipy.stats import binned_statistic, scoreatpercentile
import pickle
from scipy.optimize import minimize
import time
from voronoi_2d_binning import voronoi_2d_binning
from sklearn.neighbors import NearestNeighbors
import pickle
import imp # reload modules if necessary

In [3]:
import binning
import bin_debiasing
import fit_debiasing
import make_dictionaries
import params

In [4]:
os.mkdir('output_files/') if os.path.isdir('output_files/') is False else None

source_directory = params.source_directory
full_sample = params.full_sample

#save_directory = params.numpy_save_directory

min_log_fv = -1.5
max_log_fv = 0.01 # if >0, there is no upper limit to fitting fv.

In [5]:
#full_data = Table.read(source_directory + full_sample)
full_data = Table.read(source_directory + 'full_sample_debiased_base.fits')
print('Loaded galaxy data...')
questions = make_dictionaries.questions
print('Loaded questions...')
function_dictionary = make_dictionaries.function_dictionary
print('Loaded functions...')

Loaded galaxy data...
Loaded questions...
Loaded functions...


In [6]:
def reduce_sample(full_data,questions,question,p_cut=0.5,N_cut=5,normalised_values=True):
    
    # Get the reference sample from the previous data:
    
    previous_q = questions[question]['pre_questions']
    previous_a = questions[question]['pre_answers']
    
    if normalised_values == True:
        suffix = '_debiased_rh'
    else:
        suffix = '_debiased_rh_normalised'
    
    if previous_q != None:
        
        p_col = np.ones(len(full_data))
        
        for m in range(len(previous_q)):
            p_col = p_col*(full_data[previous_q[m] + '_' + previous_a[m] + suffix])
        N_col = (full_data[previous_q[-1] + '_' + previous_a[-1] + '_count'])
        
        select = (p_col > p_cut) & (N_col >= N_cut)
        data_reduced = full_data[select]
        print('{}/{} galaxies with p>{} and N>={}.'.format(len(data_reduced),
                                                          len(full_data),p_cut,N_cut))
    
    else:
        data_reduced = full_data.copy()
        print('Primary question, so all {} galaxies used.'.format(len(data_reduced)))
    
    return data_reduced

In [7]:
def get_bins(question,answer):
    '''Get bins from if they have already been created from a 
    previous running of the debiasing'''
    
    bins = Table.read('output_files/'+ question + '/' + answer + '/bins.fits')
    all_bins = Table.read('output_files/'+ question + '/' + answer + '/all_bins.fits')
    vbins_table = Table.read('output_files/'+ question + '/' + answer + '/vbin_parameters.fits')
    
    vbins = bins['vbin']
    zbins = bins['zbin']
    zbins_coarse = bins['coarse_zbin']
    vbins_all = all_bins['vbin']
    zbins_all = all_bins['zbin']
    zbins_coarse_all = all_bins['coarse_zbin']
    
    return vbins,zbins,zbins_coarse,vbins_all,zbins_all,zbins_coarse_all,vbins_table

In [8]:
def get_01_range(dataset):
    '''Returns proportion of 0s and 1s to be 'excluded' from the histograms'''
    cf_low = np.sum(dataset == 0)/len(dataset)
    N_1 = np.sum(dataset == 1)/len(dataset)
    cf_high = 1-N_1
    
    return cf_low,cf_high


def set_01_values(dataset,cf_low,cf_high):
    '''Set the top and bottom ends to 0 and 1, to avoid 'false' rms values from 'undebiasable' values'''
    
    cf = np.linspace(0,1,len(dataset))
    d_sorted = np.sort(dataset)
    
    indices = np.searchsorted(cf,[cf_low,cf_high])
    indices = indices.clip(0,len(cf)-1)
    
    d_sorted[0:indices[0]] = 0
    d_sorted[indices[1]:] = 1
    
    return d_sorted


def histogram_fractions(data,hist_bins):
    h,bin_edges = np.histogram(data,bins=hist_bins)
    f = h/np.sum(h)
    return f


def get_rms(dataset,z_assignments,reference,hist_bins):
    
    ref_low,ref_high = get_01_range(reference)
    
    x = len(hist_bins) - 1
    y = len(np.unique(z_assignments))
    rms_array = np.zeros((x,y))

    for n,z in enumerate(np.unique(z_assignments)):
    
        ref = reference.copy()
        vl_deb = dataset[z_assignments == z]

        deb_low,deb_high = get_01_range(vl_deb)
        cf_low = np.max([ref_low,deb_low])
        cf_high = np.min([ref_high,deb_high])
    
        vl_deb_01 = set_01_values(vl_deb,cf_low,cf_high)
        ref_01 = set_01_values(ref,cf_low,cf_high)
    
        f_deb = histogram_fractions(vl_deb_01,hist_bins)
        f_ref = histogram_fractions(ref_01,hist_bins)
        
        rms_array[:,n] = np.absolute(f_deb - f_ref)
    
    rms_value = np.mean(rms_array)  
    
    return rms_value

In [9]:
def choose_best_function(raw_data,debiased,question,answer):
    
    volume_ok = raw_data['in_volume_limit'] == 1
    
    vl  = raw_data[volume_ok][question + '_' + answer + '_weighted_fraction']
    vl_bin = debiased['bin_method'][volume_ok]
    vl_fit = debiased['fit_method'][volume_ok]
    
    redshifts = full_data['REDSHIFT_1'][volume_ok]
    z_range = [np.min(redshifts),np.max(redshifts)]
    z_vl_bins = np.linspace(z_range[0],z_range[1],11) # have 11 bins for now
    z_vl_bins[0],z_vl_bins[-1] = [0,1] # ensure all data gets binned
    z_assignments = np.digitize(redshifts,z_vl_bins)
    
    hist_bins = np.linspace(0,1,11)
    hist_bins[0],hist_bins[-1] = [-1,2] # ensure all data gets binned

    reference = vl[z_assignments == 1] # raw low-z for comparison
    
    rms_bin = get_rms(vl_bin,z_assignments,reference,hist_bins)
    rms_fit = get_rms(vl_fit,z_assignments,reference,hist_bins)
    
    print('rms(bin) = {0:.3f}'.format(rms_bin))
    print('rms(fit) = {0:.3f}'.format(rms_fit))
    if rms_bin < rms_fit:
        print('---> bin method selected')
        debiased_values = debiased['bin_method']
    else:
        print('---> fit method selected')
        debiased_values = debiased['fit_method']
        
    return debiased_values

In [10]:
def bin_and_debias(full_data,question,questions,answer,bins_exist=False,n_per_bin=100,coarse=False):
    '''Set to 'coarse' to make the fitting only apply to the 'coarse binning'of 4 redshift bins per 
    voronoi bin rather than the fully binned data'''
    
    (os.mkdir('output_files/'+ question) if
     os.path.isdir('output_files/'+ question) is False else None)
    (os.mkdir('output_files/'+ question + '/' + answer) if
     os.path.isdir('output_files/'+ question + '/' + answer) is False else None)
    
    data = reduce_sample(full_data,questions,question)
    
    if bins_exist == True:
        vbins,zbins,zbins_coarse,vbins_all,zbins_all,zbins_coarse_all,vbins_table = get_bins(question,answer)
        print('Bins obtained from previous iteration...')
        
    else:
        vbins,zbins,zbins_coarse,vbins_all,zbins_all,zbins_coarse_all,vbins_table = binning.bin_data(data,
                                                                                                     full_data,
                                                                                                     question,
                                                                                                     answer,
                                                                                                     plot=False,
                                                                                                     signal=n_per_bin)
        
    # Save the binning data  
    bin_table = Table([vbins,zbins,zbins_coarse],names=('vbin','zbin','coarse_zbin'))
    all_bin_table = Table([vbins_all,zbins_all,zbins_coarse_all],names=('vbin','zbin','coarse_zbin'))
    bin_table.write('output_files/'+ question + '/' + answer + '/bins.fits',overwrite=True)
    all_bin_table.write('output_files/'+ question + '/' + answer + '/all_bins.fits',overwrite=True)
    vbins_table.write('output_files/'+ question + '/' + answer + '/vbin_parameters.fits',overwrite=True)

    
    debiased_bin = bin_debiasing.debias(data,full_data,vbins,zbins,vbins_all,zbins_all,question,answer)
    debiased_fit,dout,fit_setup,zbins,fit_vbin_results = fit_debiasing.debias_by_fit(data,full_data,vbins,zbins,
                                                                                     zbins_coarse,question,answer,
                                                                                     function_dictionary,min_log_fv,
                                                                                     coarse=coarse)
    
    volume_ok = data['in_volume_limit'] == 1    
    vl_data = full_data[volume_ok]
    vl_fit = debiased_fit[volume_ok]
    vl_bin = debiased_bin[volume_ok]

    debiased_table = Table([debiased_bin,debiased_fit],names=('bin_method','fit_method'))
    debiased_table.write('output_files/'+ question + '/' + answer + '/debiased.fits',overwrite=True)
    dout.write('output_files/'+ question + '/' + answer + '/fit_results.fits',overwrite=True)
    pickle.dump(fit_setup,open('output_files/'+ question + '/' + answer + '/fit_setup.p', "wb" ))
    
    return debiased_table

In [11]:
imp.reload(fit_debiasing)
'''
question_order = ['t01_smooth_or_features',
                  't02_edgeon',
                  't04_spiral',
                  't11_arms_number']
'''


question_order = ['t04_spiral']

for question in question_order:
    answers = questions[question]['answers']
    #answers = ['a36_more_than_4']
    for answer in answers:
        
        #bins_exist = os.path.isfile('output_files/'+ question + '/' + answer + '/bins.fits')
        bins_exist = False
        
        print('----------------------------------')
        print('Question to be debiased:',question)
        print('Answer to be debiased:',answer)
        
        debiased = bin_and_debias(full_data,question,questions,answer,
                                  bins_exist=bins_exist,n_per_bin=100,coarse=True) # set to coarse to test method.
        
        deb_vals = choose_best_function(full_data,debiased,question,answer)
        full_data[question + '_' + answer + '_debiased_rh'] = deb_vals
        
        print('----------------------------------')

    debiased_values = np.array([full_data[question + '_' + a + '_debiased_rh'] for a in answers])
    debiased_norm = debiased_values/np.sum(debiased_values,axis=0)
    debiased_norm[np.isnan(debiased_norm)] = 0
    for m in range(len(debiased_norm)):
        full_data[question + '_' + answers[m] + '_debiased_rh_normalised'] = debiased_norm[m]

----------------------------------
Question to be debiased: t04_spiral
Answer to be debiased: a08_spiral
95560/219212 galaxies with p>0.5 and N>=5.
Bin-accretion...
655  initial bins.
Reassign bad bins...
20  good bins.
Modified Lloyd algorithm...
18  iterations.
Unbinned pixels:  0  /  13115
Fractional S/N scatter (%): 10.7703134276
20 voronoi bins
42.2 redshift bins per voronoi bin
All bins fitted! 26.098473072052002s in total
chisq(logistic) = 0.00043103927162539744
All bins fitted! 19.00503897666931s in total
chisq(exp. power) = 4.8797415615015555e-05
All bins fitted! 18.308588981628418s in total

  term = constant*np.log10(var)
  ret = ret.dtype.type(ret / rcount)
  k[k < kmin] = kmin



rms(bin) = 0.025
rms(fit) = 0.025
---> fit method selected
----------------------------------
----------------------------------
Question to be debiased: t04_spiral
Answer to be debiased: a09_no_spiral
95560/219212 galaxies with p>0.5 and N>=5.
Bin-accretion...
565  initial bins.
Reassign bad bins...
21  good bins.
Modified Lloyd algorithm...
19  iterations.
Unbinned pixels:  0  /  12079
Fractional S/N scatter (%): 13.1365413299
21 voronoi bins
40.476190476190474 redshift bins per voronoi bin
All bins fitted! 22.333016872406006s in total
chisq(logistic) = 0.0003983463188977766
All bins fitted! 22.603487014770508s in total
chisq(exp. power) = 0.0001328481800800105
All bins fitted! 23.823745012283325s in total

  k[k > kmax] = kmax
  kb[kb < kmin] = kmin
  kb[kb > kmax] = kmax
  ok = k > 0
  c[c < cmin] = cmin



rms(bin) = 0.023

  c[c > cmax] = cmax
  cb[cb < cmin] = cmin
  cb[cb > cmax] = cmax



rms(fit) = 0.027
---> bin method selected
----------------------------------


In [12]:
debiased_columns = []

for m in range(len(full_data.colnames)):
    c = full_data.colnames[m]
    if 'debiased_rh' in c:
        debiased_columns.append(c)
        
debiased_values = full_data[debiased_columns]
debiased_values.write(source_directory + 'debiased_values.fits',overwrite=True)

In [None]:
full_data.write(source_directory + 'full_sample_debiased.fits',overwrite=True)