In [12]:
import numpy as np
import matplotlib.pyplot as plt
import sjoert.stellar
import pandas as pd
import os
from tqdm import tqdm
from cleaning_functions import q_cuts,field_check,filter_split,flux_unc_val,clean_data
from astropy import coordinates as coord
import json
from numpyencoder import NumpyEncoder

# Root directory that is scanned for per-transient sub-folders (entries with "ZTF" in their name).
# NOTE(review): machine-specific absolute Windows path - adjust before running on another machine.
sjoertpath = r'C:\Users\timvd\Documents\Uni 2023-2024\First Research Project\Data\Sjoert_Flares'

In [13]:
# # Code to remove all clean files and clean logs from the sjoert_flares directory.
# for folder in os.listdir(sjoertpath)[::-1]:
#     if 'ZTF' in folder:
#         folderpath = os.path.join(sjoertpath,folder)
#         for file in os.listdir(folderpath):
#             if 'clean' in file:
#                 os.remove(os.path.join(folderpath,file))

1. Geen field check - de field_id is namelijk belangrijk voor de log. Cleaning gaat wel nog steeds per field, per field sla je alles op.
2. ir data houden we er ook gewoon in. 
3. Iteratief over fields en daarna over filters met de nodige checks of er wel data is om mee te werken. DIT STAAT NU VERKEERD OM. We werken onder de assumptie dat voor elke field er data is in zowel r als g, anders zet je er gewoon een try except omheen.
4. De clean data die we opslaan is dat in de primary field, voor de log hebben we info van elke field nodig 
5. clean de data in de primary field per filter en sla op (met de nodige check of er wel ztf_g en ztf_r data is). 
6. ~Sla in de log file ook op: 'r_and_g':True of False, deze geeft aan of er viable data is van zowel r en g in de primary field~
7. We willen niet meer een per filter file hebben dus dat scheelt weer een heleboel code, we willen alleen "clean_data_full" hebben nu 
8. Laten we data gesorteerd houden op time 
9. chi2 kan in de cleaning_log voor elke field maar chi2_after kan alleen voor de primary, dus slaan we per filter op los van de field_id keys

### VRAGEN EN TODO'S VOOR CLEANING
- Vraag: Checken we de primary field *in een filter* of checken we puur de primary field? Als the latter, doen we dan de primary field van de uncleaned data of de data na qcuts?
- To-do: Sla de clean data op. Het enige wat dan nog mist is de bijgewerkte uncertainty. Deze gaat op volgorde van filter staan. Zorg er heel goed voor dat de uncertainty bij het juiste datapunt staat!!! Het zal betekenen dat we toch ordenen op filter ipv tijd maar dat is allemaal niet erg. Waarschijnlijk hoe we dit moeten doen is een clean_data maken per filter en deze stacken op een clean_data_full dataframe met pd.concat. Begin met clean_data_full als een lege dataframe en stack vervolgens steeds [full,clean_data_filter] in de primary field en sla dat buiten de filter loop op.
- To-do: Sla de logdict op naar een json bestand (met dumps als het goed is) nadat de loops voltooid zijn.


In [14]:
def clean_data(datapath,ZTF_ID,savepath=None,verbose=False):
    """Clean a ZTF forced-photometry batch file and log what was removed.

    The raw table is read from ``datapath`` and passed through ``q_cuts`` to obtain a
    mask of good measurements. For every filter present in the file, the primary field
    is the field holding the most good measurements in that filter (on a tie the first
    field in sorted order is used). Only good measurements of the primary field end up
    in the cleaned light curve; their flux uncertainties are replaced by the output of
    ``flux_unc_val`` (uncertainty validation following the ZFPS user guide,
    https://web.ipac.caltech.edu/staff/fmasci/ztf/zfps_userguide.pdf).

    Cleaned data columns: ``time`` (jd), ``flux`` (forcediffimflux), ``flux_unc``
    (validated uncertainty), ``zeropoint`` (zpdiff) and ``filter``. Rows are stacked
    filter by filter, so they are ordered by filter rather than by time.

    Cleaning log layout::

        log[filter]["no_viable_data"]      1 if no measurement of this filter passed the cuts
        log[filter]["median_chi2_after"]   median chi-square of the good primary-field data
        log[filter][field_id] = {
            "primary_field":        1 if this is the primary field of the filter,
            "median_zeropoint":     median zpdiff of the good measurements,
            "std_zeropoint":        standard deviation of zpdiff of the good measurements,
            "removed_in_cleaning":  number of measurements of this filter/field rejected by the cuts,
            "median_chi2":          median chi-square of all (uncleaned) measurements,
            "no_viable_data":       1 if no measurement of this filter/field passed the cuts,
        }

    When ``savepath`` is given, the results are written as "(ZTF_ID)_clean_data.json"
    and "(ZTF_ID)_clean_log.json" inside that folder.

    Dependencies: numpy, pandas, json, os, the NumpyEncoder JSON encoder
    (https://pypi.org/project/numpyencoder/) and the project helpers ``q_cuts`` and
    ``flux_unc_val``.

    Args:
        datapath (str): Path to the raw data, of the form path/to/data/batchf_reqxxxxxxxxxxxxxxxxx_lc.txt.
        ZTF_ID (str): ZTF identifier of the transient.
        savepath (str, optional): Folder in which clean data and cleaning log are saved.
            Defaults to None, in which case the results are printed if verbose is True and
            discarded otherwise.
        verbose (bool, optional): Only used when no savepath is given: print the cleaned data
            and the log. The "no viable data" notices are always printed. Defaults to False.

    Returns:
        None. Returns early (writing nothing) when no measurement passes the quality cuts.
    """
    #Read in the raw data from the data path as a Pandas DataFrame.
    columns = ['sindex', 'field', 'ccdid', 'qid', 'filter', 'pid', 'infobitssci', 'sciinpseeing', 'scibckgnd', 'scisigpix', 'zpmaginpsci', 'zpmaginpsciunc', 'zpmaginpscirms', 'clrcoeff', 'clrcoeffunc', 'ncalmatches', 'exptime', 'adpctdif1', 'adpctdif2', 'diffmaglim', 'zpdiff', 'programid', 'jd', 'rfid', 'forcediffimflux', 'forcediffimfluxunc', 'forcediffimsnr', 'forcediffimchisq', 'forcediffimfluxap', 'forcediffimfluxuncap', 'forcediffimsnrap', 'aperturecorr', 'dnearestrefsrc', 'nearestrefmag', 'nearestrefmagunc', 'nearestrefchi', 'nearestrefsharp', 'refjdstart', 'refjdend', 'procstatus']
    dtypes = [(name,float) for name in columns]
    dtypes[columns.index('filter')] = ('filter','U8') #the filter column is the only non-numeric one
    #atleast_1d: genfromtxt returns a 0-d array for a single-row file, which DataFrame cannot ingest.
    data = pd.DataFrame(np.atleast_1d(np.genfromtxt(datapath,skip_header=53,dtype=dtypes)))

    iok = q_cuts(data) #very first quality check, mask array of good data points.
    if iok.sum() == 0: #nothing survives the cuts, so there is nothing to clean or log
        print(f"{ZTF_ID}: no viable data found in the batch request. Proceeding to next file.")
        return

    data_ok = data[iok]
    logdict = {} #dictionary that will form the log.json file
    clean_frames = [] #one cleaned frame per filter, stacked once after the loops

    filters = np.unique(data['filter'])
    fields = np.unique(data_ok['field']) #only fields that hold at least one good measurement
    fieldmasks = [data['field'] == fid for fid in fields]

    for filt in filters:
        filtermask = data['filter'] == filt
        iok_filter = iok & filtermask #good according to q_cuts AND in this filter. Same length as data.
        logdict[filt] = {"no_viable_data":0} #flag for loading: 1 means this filter is useless

        if iok_filter.sum() == 0:
            print(f"{ZTF_ID}: no viable data found in {filt}. Proceeding to next filter.")
            logdict[filt]["no_viable_data"] = 1
            continue

        #The primary field is chosen per filter: the field with the most good measurements.
        data_ok_filter = data[iok_filter]
        filter_field_counts = [np.sum(data_ok_filter['field'] == fid) for fid in fields] #may contain 0's
        #argmax returns the first maximum, so exactly one field is primary even when counts tie.
        primary_index = int(np.argmax(filter_field_counts))

        for j,fid in enumerate(fields):
            in_filter_field = filtermask & fieldmasks[j] #all (uncleaned) rows of this filter/field
            iok_filter_field = iok_filter & fieldmasks[j] #the good rows among them

            if iok_filter_field.sum() == 0:
                print(f"{ZTF_ID}: no viable data found in field {fid} of filter {filt}. Proceeding to next field for this filter.")
                logdict[filt][fid] = {"no_viable_data":1} #this filter/field combination is useless
                continue

            data_ok_filter_field = data[iok_filter_field]
            data_filter_field = data[in_filter_field]
            zeropoint = data_ok_filter_field['zpdiff'].values
            is_primary = (j == primary_index)

            logdict[filt][fid] = {"primary_field":int(is_primary),
                                  "median_zeropoint":np.median(zeropoint),'std_zeropoint':np.std(zeropoint),
                                  #rows of THIS filter/field that failed the cuts (not rows elsewhere in the file)
                                  "removed_in_cleaning":int(in_filter_field.sum()) - int(iok_filter_field.sum()),
                                  "median_chi2":np.median(data_filter_field['forcediffimchisq']),
                                  "no_viable_data":0}

            if is_primary:
                #correct the errors of the clean data in this filter (only on the primary field)
                new_unc = np.array(flux_unc_val(data_ok_filter_field))
                #the median chi squared after is that of the good data in the primary field of the respective filter
                logdict[filt]["median_chi2_after"] = np.median(data_ok_filter_field['forcediffimchisq'])
                clean_frames.append(pd.DataFrame({'time':data_ok_filter_field['jd'],'flux':data_ok_filter_field['forcediffimflux'],
                                                  'flux_unc':new_unc,'zeropoint':data_ok_filter_field['zpdiff'],
                                                  'filter':data_ok_filter_field['filter']}))

    #Stack the per-filter frames in one go instead of growing a DataFrame inside the loop.
    clean_data_full = pd.concat(clean_frames,ignore_index=True) if clean_frames else pd.DataFrame()

    if savepath is not None:
        clean_data_full.to_json(os.path.join(savepath,str(ZTF_ID)+'_clean_data.json'))
        with open(os.path.join(savepath,str(ZTF_ID)+'_clean_log.json'),'w') as outfile:
            json.dump(logdict,outfile,indent=4,ensure_ascii=False,separators=(',',':'),cls=NumpyEncoder)
    else:
        print('No savepath provided. Dumping results, shown if verbose set to True.')
        if verbose:
            print(clean_data_full.to_markdown())
            print()
            print(logdict)
                    


In [15]:
# Project layout: the parent of the current working directory holds 'Data' and 'Code'.
PATH = os.path.normpath(os.getcwd() + os.sep + os.pardir)
DATAPATH = os.path.join(PATH,'Data')
CODEPATH = os.path.join(PATH,'Code')

# Column layout of the raw batch file; every column is numeric except 'filter'.
columns = ['sindex', 'field', 'ccdid', 'qid', 'filter', 'pid', 'infobitssci', 'sciinpseeing', 'scibckgnd', 'scisigpix', 'zpmaginpsci', 'zpmaginpsciunc', 'zpmaginpscirms', 'clrcoeff', 'clrcoeffunc', 'ncalmatches', 'exptime', 'adpctdif1', 'adpctdif2', 'diffmaglim', 'zpdiff', 'programid', 'jd', 'rfid', 'forcediffimflux', 'forcediffimfluxunc', 'forcediffimsnr', 'forcediffimchisq', 'forcediffimfluxap', 'forcediffimfluxuncap', 'forcediffimsnrap', 'aperturecorr', 'dnearestrefsrc', 'nearestrefmag', 'nearestrefmagunc', 'nearestrefchi', 'nearestrefsharp', 'refjdstart', 'refjdend', 'procstatus']
dtypes = [(name,float) for name in columns]
dtypes[columns.index('filter')] = ('filter','U8')

# Transient used for the tests below; the folder name doubles as the ZTF identifier.
test_path = os.path.join(sjoertpath,'ZTF18aajupnt')
ztf_id = os.path.basename(test_path)
print(test_path,ztf_id)

# Pick the raw batch file explicitly: once clean_data has run, the folder also holds
# '*_clean_*.json' files, so "first directory entry" is not guaranteed to be the raw data.
test_batch_files = sorted(f for f in os.listdir(test_path) if 'batch' in f)
if not test_batch_files:
    raise FileNotFoundError(f"No batch request file found in {test_path}")
test_lc = pd.DataFrame(np.genfromtxt(os.path.join(test_path,test_batch_files[0]),skip_header=53,dtype=dtypes))
test_lc

C:\Users\timvd\Documents\Uni 2023-2024\First Research Project\Data\Sjoert_Flares\ZTF18aajupnt ZTF18aajupnt


Unnamed: 0,sindex,field,ccdid,qid,filter,pid,infobitssci,sciinpseeing,scibckgnd,scisigpix,...,forcediffimsnrap,aperturecorr,dnearestrefsrc,nearestrefmag,nearestrefmagunc,nearestrefchi,nearestrefsharp,refjdstart,refjdend,procstatus
0,0.0,760.0,3.0,4.0,ZTF_r,5.471801e+11,0.0,2.0042,155.945,5.77730,...,11.847634,1.035751,0.417786,16.248,0.072,5.694,0.459,2.458161e+06,2.458218e+06,0.0
1,1.0,760.0,3.0,4.0,ZTF_g,5.472616e+11,0.0,1.9562,161.211,6.83362,...,18.009733,1.046029,0.370684,17.141,0.053,4.449,0.327,2.458167e+06,2.458206e+06,0.0
2,2.0,760.0,3.0,4.0,ZTF_g,5.472807e+11,0.0,1.9662,160.764,7.57993,...,12.479790,1.045941,0.370684,17.141,0.053,4.449,0.327,2.458167e+06,2.458206e+06,0.0
3,3.0,760.0,3.0,4.0,ZTF_g,5.473041e+11,0.0,1.8473,158.989,8.20205,...,11.572918,1.043689,0.370684,17.141,0.053,4.449,0.327,2.458167e+06,2.458206e+06,0.0
4,4.0,760.0,3.0,4.0,ZTF_r,5.473478e+11,0.0,1.5876,149.514,8.43294,...,11.822746,1.039321,0.417786,16.248,0.072,5.694,0.459,2.458161e+06,2.458218e+06,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2028,2028.0,760.0,3.0,4.0,ZTF_g,2.440165e+12,0.0,2.2933,173.912,6.54077,...,-0.822176,1.032331,0.370684,17.141,0.053,4.449,0.327,2.458167e+06,2.458206e+06,0.0
2029,2029.0,760.0,3.0,4.0,ZTF_r,2.440201e+12,0.0,1.6948,165.425,8.29566,...,0.279338,1.034331,0.417786,16.248,0.072,5.694,0.459,2.458161e+06,2.458218e+06,0.0
2030,2030.0,760.0,3.0,4.0,ZTF_g,2.466187e+12,0.0,3.8638,131.229,14.11160,...,-1.608354,1.107827,0.370684,17.141,0.053,4.449,0.327,2.458167e+06,2.458206e+06,0.0
2031,2031.0,760.0,3.0,4.0,ZTF_r,2.468141e+12,0.0,3.0778,161.122,9.85148,...,0.152064,1.092632,0.417786,16.248,0.072,5.694,0.459,2.458161e+06,2.458218e+06,0.0


In [29]:
# Select the raw batch request file of the test transient. Filtering (instead of mapping
# non-matching names to None and taking element 0) avoids passing None to os.path.join
# when the first directory entry is not the batch file.
batch_files = [f for f in os.listdir(test_path) if 'batch' in f]
if not batch_files:
    raise FileNotFoundError(f"No batch request file found in {test_path}")
file = batch_files[0]
testdatapath = os.path.join(test_path,file)
clean_data(testdatapath,ztf_id,savepath=test_path,verbose=True)

ZTF18aajupnt: no viable data found in field 1759.0 of filter ZTF_i. Proceeding to next field for this filter.


In [32]:
# Load the cleaned light curve written by clean_data for the test transient and show it
# in time order. The path is derived from sjoertpath rather than repeating the absolute path.
testjson = pd.read_json(os.path.join(sjoertpath,'ZTF18aajupnt','ZTF18aajupnt_clean_data.json'))
testjson.sort_values(by='time')

Unnamed: 0,time,flux,flux_unc,zeropoint,filter
962,2.458302e+06,524.733793,33.544278,26.1564,ZTF_r
0,2.458302e+06,832.981575,33.913608,26.2606,ZTF_g
1,2.458302e+06,845.350094,39.441217,26.2570,ZTF_g
2,2.458302e+06,781.155603,40.041358,26.2334,ZTF_g
963,2.458302e+06,782.811927,39.301814,26.0931,ZTF_r
...,...,...,...,...,...
824,2.460195e+06,56.214740,33.988009,26.3851,ZTF_g
1820,2.460195e+06,-75.998616,39.471409,26.2008,ZTF_r
825,2.460221e+06,-127.913078,101.197538,26.0624,ZTF_g
1821,2.460223e+06,-110.053750,72.606640,26.1893,ZTF_r


In [33]:
# Clean every transient: each "ZTF*" sub-folder of sjoertpath is processed in reverse
# listing order, and every file in it that is not itself a cleaning product is cleaned.
for folder in os.listdir(sjoertpath)[::-1]:
    folderpath = os.path.join(sjoertpath,folder)
    # Skip non-ZTF entries, and plain files that merely have "ZTF" in their name
    # (os.listdir on a file would raise).
    if "ZTF" not in folder or not os.path.isdir(folderpath):
        continue
    ztf_id = folder # the folder name is the ZTF identifier
    for file in os.listdir(folderpath):
        if 'clean' not in file: # do not feed previously written clean data / logs back in
            filepath = os.path.join(folderpath,file)
            clean_data(filepath,ztf_id,savepath=folderpath)
                


ZTF20aagkqky: no viable data found in field 1869.0 of filter ZTF_i. Proceeding to next field for this filter.
ZTF20aabhraq: no viable data found in field 1510.0 of filter ZTF_i. Proceeding to next field for this filter.
ZTF20aabhraq: no viable data found in field 1511.0 of filter ZTF_i. Proceeding to next field for this filter.
ZTF20aabhraq: no viable data found in field 1510.0 of filter ZTF_r. Proceeding to next field for this filter.
ZTF19adcddzk: no viable data found in field 1749.0 of filter ZTF_i. Proceeding to next field for this filter.
ZTF19adawqog: no viable data found in field 1711.0 of filter ZTF_i. Proceeding to next field for this filter.
ZTF19accdntg: no viable data found in field 1593.0 of filter ZTF_i. Proceeding to next field for this filter.
ZTF19acaakoh: no viable data found in field 1831.0 of filter ZTF_i. Proceeding to next field for this filter.
ZTF19abzrhgq: no viable data found in field 1347.0 of filter ZTF_i. Proceeding to next field for this filter.
ZTF19abxwb