In [1]:
from __future__ import print_function
from __future__ import division

import make_dictionaries
import os
import math
import params

import numpy as np
from astropy.io import fits
from astropy.table import Table
from scipy.stats import binned_statistic, scoreatpercentile
import pickle
from scipy.optimize import minimize
import time
from voronoi_2d_binning import voronoi_2d_binning
from sklearn.neighbors import NearestNeighbors
import pickle
import imp

In [2]:
import binning
import bin_debiasing
import fit_debiasing

In [3]:
%matplotlib inline
import matplotlib as mpl
from matplotlib import pyplot as plt
# better-looking plots
plt.rcParams['font.family'] = 'serif'
plt.rcParams['figure.figsize'] = (10.0, 8)
plt.rcParams['font.size'] = 18
mpl.ticker.AutoLocator.default_params['nbins'] = 5
mpl.ticker.AutoLocator.default_params['prune'] = 'both'

In [4]:
os.mkdir('output_files/') if os.path.isdir('output_files/') is False else None

In [5]:
source_directory = params.source_directory
save_directory = params.numpy_save_directory
full_sample = params.full_sample
volume_limited_sample = params.volume_limited_sample
question = params.question
bins_to_plot = params.bins_to_plot
#print('Question to be debiased: {}'.format(question))

min_log_fv = -1.5
max_log_fv = 0.01

In [7]:
full_data = Table.read(source_directory + full_sample)
print('Loaded galaxy data...')
questions = make_dictionaries.questions
print('Loaded questions...')
function_dictionary = make_dictionaries.function_dictionary
print('Loaded functions...')

Loaded galaxy data...
Loaded questions...
Loaded functions...


In [25]:
def debias_by_fit(data,full_data,vbins,zbins,zbins_coarse,function_dictionary,min_log_fv):
    
    fit_setup = fit_debiasing.get_best_function(data,vbins,zbins_coarse,function_dictionary,
                                            question,answer,min_log_fv)
    
    #fit_vbin_results = fit_debiasing.fit_vbin_function(data, vbins, zbins_coarse, fit_setup,
                                                   #question,answer,min_log_fv)
    
    fit_vbin_results = fit_debiasing.fit_vbin_function(data, vbins, zbins, fit_setup,
                                                   question,answer,min_log_fv)
    
    k_func,c_func = fit_debiasing.get_kc_functions(fit_vbin_results)
    
    kparams, cparams,dout, kmin, kmax, cmin, cmax = fit_debiasing.fit_mrz(fit_vbin_results, k_func,
                                                                      c_func,clip=2,plot=False)
    
    debiased_fit = fit_debiasing.debias(full_data,0.03, k_func,c_func, kparams, cparams,
                                    question,answer,kmin,kmax,cmin,cmax,fit_setup)

    # Debias ALL of the data 
    
    return debiased_fit,dout,fit_setup

In [33]:
# Calculate RMS of the volume limited sample: -----

def histogram_fractions(data,hist_bins):
    h,bin_edges = np.histogram(data,bins=hist_bins)
    f = h/np.sum(h)
    
    # Remove proportion of 0s and proportion of 1s:
    
    cf = np.linspace(0,1,len(data))
    plt.plot(np.sort(data),cf)
    indices = np.searchsorted(data,[10**(-5),1-(10)**(-5)])
    cf_range = cf[indices.clip(0, len(cf)-1)]
    
    return f,cf_range


def choose_best_function(vl_data,vl_fit,vl_bin,debiased_fit,debiased_bin):

    hist_bins = np.linspace(0,1,11)
    hist_bins[-1] = 2
    hist_bins[0] = -1

    # First divide the data into 10 redshift bins:
    z_range = [np.min(vl_data['REDSHIFT_1']),np.max(vl_data['REDSHIFT_1'])]
    z_bin_edges = np.linspace(z_range[0],z_range[1],11)
    z_bin_edges[0] = 0 # Ensure all data is binned
    z_bin_edges[-1] = 1 # Ensure all data is binned
    z_bin_assign = np.digitize(vl_data['REDSHIFT_1'],bins=z_bin_edges) # 10 bins

    low_z_reference = vl_data[z_bin_assign == 1][question + '_' + answer + '_weighted_fraction']

    rms_bin_array = np.zeros((10,10))
    rms_fit_array = np.zeros((10,10))

    f_reference = histogram_fractions(low_z_reference,hist_bins)

    for i,z_i in enumerate(np.unique(z_bin_assign)):
    
        high_z_select = z_bin_assign == z_i
        vl_fit_h = vl_fit[high_z_select]
        vl_bin_h = vl_bin[high_z_select]
    
        f_fit = histogram_fractions(vl_fit_h,hist_bins)
        f_bin = histogram_fractions(vl_bin_h,hist_bins)
    
        rms_fit_array[i] = np.absolute(f_fit - f_reference)
        rms_bin_array[i] = np.absolute(f_bin - f_reference)
        
    rms_bin_array[:,0] = 0 # ignore the lowest bin?
    rms_fit_array[:,0] = 0 # ignore the lowest bin?
    
    bin_residual = np.sum(rms_bin_array)/10
    fit_residual = np.sum(rms_fit_array)/10

    print('RMS residual from fitting method = {0:.3f}'.format(fit_residual))
    print('RMS residual from binning method = {0:.3f}'.format(bin_residual))

    if fit_residual > bin_residual:
        print('---> Binning method selected')
        debiased = debiased_bin.copy()
    else:
        print('---> Fitting method selected')
        debiased = debiased_fit.copy()
        
    return debiased

SyntaxError: invalid syntax (<ipython-input-33-050aad778514>, line 9)

In [27]:
def reduce_sample(full_data,questions,question,p_cut=0.5,N_cut=5):
    
    # Get the reference sample from the previous data:
    
    previous_q = questions[question]['pre_questions']
    previous_a = questions[question]['pre_answers']
    
    if previous_q != None:
        
        p_col = np.ones(len(full_data))
        
        for m in range(len(previous_q)):
            p_col = p_col*(full_data[previous_q[m] + '_' + previous_a[m] + '_debiased_rh'])
        N_col = (full_data[previous_q[-1] + '_' + previous_a[-1] + '_count'])
        
        select = (p_col > p_cut) & (N_col >= N_cut)
        data_reduced = full_data[select]
        print('{}/{} galaxies with p>{} and N>={}.'.format(len(data_reduced),
                                                          len(full_data),p_cut,N_cut))
    
    else:
        data_reduced = full_data.copy()
        print('Primary question, so all {} galaxies used.'.format(len(data_reduced)))
    
    return data_reduced

In [28]:
def get_bins(question,answer):
    
    bins = Table.read('output_files/'+ question + '/' + answer + '/bins.fits')
    all_bins = Table.read('output_files/'+ question + '/' + answer + '/all_bins.fits')
    vbins_table = Table.read('output_files/'+ question + '/' + answer + '/vbin_parameters.fits')
    
    vbins = bins['vbin']
    zbins = bins['zbin']
    zbins_coarse = bins['coarse_zbin']
    vbins_all = all_bins['vbin']
    zbins_all = all_bins['zbin']
    zbins_coarse_all = all_bins['coarse_zbin']
    
    return vbins,zbins,zbins_coarse,vbins_all,zbins_all,zbins_coarse_all,vbins_table

In [29]:
def bin_and_debias(full_data,question,questions,answer,bins_exist=False,n_per_bin=100):
    
    data = reduce_sample(full_data,questions,question)
    
    if bins_exist == True:
        vbins,zbins,zbins_coarse,vbins_all,zbins_all,zbins_coarse_all,vbins_table = get_bins(question,answer)
        print('Bins obtained from previous iteration...')
        
    else:
        vbins,zbins,zbins_coarse,vbins_all,zbins_all,zbins_coarse_all,vbins_table = binning.bin_data(data,full_data,question,answer,plot=False,signal=n_per_bin)
    
    debiased_bin = bin_debiasing.debias(data,full_data,vbins,zbins,vbins_all,zbins_all,question,answer)
    debiased_fit,fit_vbin_results,fit_setup = debias_by_fit(data,full_data,vbins,zbins,zbins_coarse,function_dictionary,min_log_fv)
    
    volume_ok = data['in_volume_limit'] == 1
    vl_data = full_data[volume_ok]
    vl_fit = debiased_fit[volume_ok]
    vl_bin = debiased_bin[volume_ok]

    debiased = choose_best_function(vl_data,vl_fit,vl_bin,debiased_fit,debiased_bin)
    full_data[question + '_' + answer + '_debiased_rh'] = debiased
    
    (os.mkdir('output_files/'+ question) if
     os.path.isdir('output_files/'+ question) is False else None)
    (os.mkdir('output_files/'+ question + '/' + answer) if
     os.path.isdir('output_files/'+ question + '/' + answer) is False else None)

    bin_table = Table([vbins,zbins,zbins_coarse],names=('vbin','zbin','coarse_zbin'))
    all_bin_table = Table([vbins_all,zbins_all,zbins_coarse_all],names=('vbin','zbin','coarse_zbin'))
    debiased_table = Table([debiased_bin,debiased_fit],names=('bin_method','fit_method'))

    bin_table.write('output_files/'+ question + '/' + answer + '/bins.fits',overwrite=True)
    all_bin_table.write('output_files/'+ question + '/' + answer + '/all_bins.fits',overwrite=True)
    debiased_table.write('output_files/'+ question + '/' + answer + '/debiased.fits',overwrite=True)
    fit_vbin_results.write('output_files/'+ question + '/' + answer + '/fit_results.fits',overwrite=True)
    vbins_table.write('output_files/'+ question + '/' + answer + '/vbin_parameters.fits',overwrite=True)
    
    pickle.dump(fit_setup,open('output_files/'+ question + '/' + answer + '/fit_setup.p', "wb" ))
    
    return debiased

In [30]:
question_order = [#'t01_smooth_or_features'
                  #,'t02_edgeon'
                  #,'t04_spiral'
                  't11_arms_number']

imp.reload(fit_debiasing)

for question in question_order:
    
    for answer in questions[question]['answers']:
        
        #bins_exist = os.path.isfile('output_files/'+ question + '/' + answer + '/bins.fits')
        bins_exist = False
        
        print('----------------------------------')
        print('Question to be debiased:',question)
        print('Answer to be debiased:',answer)
        debiased = bin_and_debias(full_data,question,questions,answer,bins_exist=bins_exist,n_per_bin=40)
        print('----------------------------------')

----------------------------------
Question to be debiased: t11_arms_number
Answer to be debiased: a31_1
54961/228201 galaxies with p>0.5 and N>=5.
Bin-accretion...
238  initial bins.
Reassign bad bins...
21  good bins.
Modified Lloyd algorithm...
19  iterations.
Unbinned pixels:  0  /  4206
Fractional S/N scatter (%): 10.4354320717
21 voronoi bins
30.857142857142858 redshift bins per voronoi bin
All bins fitted! 8.815732717514038s in total
chisq(logistic) = 0.0017547946934098593
All bins fitted! 14.637715816497803s in total
chisq(exp. power) = 1.19810033394046e-05
All bins fitted! 73.23438382148743s in total
RMS residual from fitting method = 0.143
RMS residual from binning method = 0.163
---> Fitting method selected
----------------------------------
----------------------------------
Question to be debiased: t11_arms_number
Answer to be debiased: a32_2
54961/228201 galaxies with p>0.5 and N>=5.
Bin-accretion...
384  initial bins.
Reassign bad bins...
23  good bins.
Modified Lloyd al

In [31]:
full_data.write(source_directory + 'full_sample_debiased.fits',overwrite=True)

In [32]:
debiased_columns = ['t01_smooth_or_features_a01_smooth_debiased_rh',
                    't01_smooth_or_features_a02_features_or_disk_debiased_rh',
                    't01_smooth_or_features_a03_star_or_artifact_debiased_rh',
                    't02_edgeon_a04_yes_debiased_rh',
                    't02_edgeon_a05_no_debiased_rh',
                    't04_spiral_a08_spiral_debiased_rh',
                    't04_spiral_a09_no_spiral_debiased_rh',
                    't11_arms_number_a31_1_debiased_rh',
                    't11_arms_number_a32_2_debiased_rh',
                    't11_arms_number_a33_3_debiased_rh',
                    't11_arms_number_a34_4_debiased_rh',
                    't11_arms_number_a36_more_than_4_debiased_rh',
                    't11_arms_number_a37_cant_tell_debiased_rh']

debiased_values = full_data[debiased_columns]
debiased_values.write(source_directory + 'debiased_values.fits',overwrite=True)

In [None]:
imp.reload(fit_debiasing)
imp.reload(make_dictionaries)
questions = make_dictionaries.questions
print('Loaded questions...')
function_dictionary = make_dictionaries.function_dictionary
print('Loaded functions...')

In [None]:
def make_fit_setup(function_dictionary,key):
    fit_setup = {}
    fit_setup['func'] = function_dictionary['func'][key]
    fit_setup['bounds'] = function_dictionary['bounds'][key]
    fit_setup['p0'] = function_dictionary['p0'][key]
    fit_setup['inverse'] = function_dictionary['i_func'][key]
    return fit_setup


def chisq_fun(p, f, x, y):
    return ((f(x, *p) - y)**2).sum()


data = reduce_sample(full_data,questions,question)

fv_all = np.sort(data[question + '_' + answer + '_weighted_fraction'])
fv_nonzero = fv_all != 0
cf = np.linspace(0,1,len(fv_all))
x,y = [np.log10(fv_all[fv_nonzero]),cf[fv_nonzero]]
    
x_fit = np.log10(np.linspace(10**(min_log_fv), 1, 100))
indices = np.searchsorted(x,x_fit)
y_fit = y[indices]
    
chisq_tot = np.zeros(len(function_dictionary['func'].keys()))
k_tot = np.zeros(len(function_dictionary['func'].keys()))
c_tot = np.zeros(len(function_dictionary['func'].keys()))
    
for n,key in enumerate(function_dictionary['func'].keys()):
        # Overall data fitting:
    fit_setup = make_fit_setup(function_dictionary,key)
    
    #print(fit_setup)
    #func = fit_setup['func']
    #p0 = fit_setup['p0']
    #bounds = fit_setup['bounds']
    
    #print(p0)
        
    #res =  minimize(chisq_fun, p0,
                    #args=(func,x_fit,y_fit),
                    #bounds=bounds,method='SLSQP')
        
    #function_dictionary['p0'][key] = res.x

In [None]:
func = function_dictionary['func'][1]
bounds = function_dictionary['bounds'][1]
p0 = function_dictionary['p0'][1]

data = reduce_sample(full_data,questions,question)
bins = Table.read('output_files/'+ question + '/' + answer + '/bins.fits')
vbins = bins['vbin']
zbins = bins['coarse_zbin']

even_sampling = True

In [None]:
v = 10
z = 3

fv = question + '_' + answer + '_weighted_fraction'

vselect = vbins == v
data_v = data[vselect]
zbins_v = zbins[vselect]

z_bins_unique = np.unique(zbins_v)

data_z = data_v[zbins_v == z]
n = len(data_z)

min_fv = 10**(-1.5)
            
D = data_z[[fv]]
D.sort(fv)
D['cumfrac'] = np.linspace(0, 1, n)
D = D[D[fv] > min_fv]
D['log10fv'] = np.log10(D[fv])

#print(len(D[(D['log10fv'] > min_log_fv)]))
#plt.plot(D['log10fv'],D['cumfrac'],'b--',linewidth=2)

if even_sampling:
    D_fit_log10fv = np.log10(np.linspace(10**(min_log_fv), 1, 100))
    D = D[(D['log10fv'] > min_log_fv)] #& (D['log10fv'] < max_log_fv)]
    indices = np.searchsorted(D['log10fv'], D_fit_log10fv)
    D_fit = D[indices.clip(0, len(D)-1)]
else:
    D_fit = D[D['log10fv'] > min_log_fv]

res = minimize(chisq_fun, p0,
                args=(func,
                      D_fit['log10fv'].astype(np.float64),
                      D_fit['cumfrac'].astype(np.float64)),
                      bounds=bounds, method='SLSQP')
            
p = res.x

xg = np.linspace(-1.2,0,1000)

plt.plot(D_fit['log10fv'],D_fit['cumfrac'],'k-',linewidth=2)
plt.plot(xg,func(xg,*p),'r--')