# TOTEMS - Tidal Orbital decay Timing Extrapolation & Modelling Software

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

#Import curve fitting (Which is what we need for tidal decay)
from scipy.optimize import curve_fit

#Import pylightcurve, used for BJD HJD conversions - thanks to Angelos Tsiaras
import pylightcurve as plc

In [3]:
%matplotlib notebook
# if this isn't in a separate cell, sometimes doesn't work right
# magic commands are weird, to say the least.

# TODO - use structured or recorded arrays with boolean masks, rather than repeatedly iterating

First function: gets the relevant data for the exoplanet from the [Exoplanet Transit
Database](http://var2.astro.cz/ETD/). Use this if you don't
have any archive data of your own to use, though really any data from a recent paper will probably be better.

In [15]:
def _last_float_token(text):
    """Returns the last whitespace-separated token of text that float() accepts.

    Args:
        text: a string, e.g. one field of the ETD ephemeris header row.

    Returns:
        The matching token, unchanged (i.e. still a string).

    Raises:
        ValueError: if no token in text can be parsed as a float.
    """
    found = None
    for token in text.split():
        try:
            float(token)
        except ValueError:
            continue  # if it's not a number, move on
        found = token  # no early exit: the last numeric token wins
    if found is None:
        raise ValueError("no numeric value found in {!r}".format(text))
    return found


def get_etd_data(right_ascension, declination, url):
    """Imports the data from ETD. Uses RA & Dec to convert HJD_UTC to BJD_TDB.

    Args:
        right_ascension: the right ascension in hh mm ss.ss
        declination: the declination in dd mm ss.ss
        url: the URL of the ETD .csv datafile

    Returns:
        data: a Pandas DataFrame indexed by 'no' with the columns 'tmid',
          'err', 'epoch' and 'DQ'. These are the number (on ETD's list), the
          mid-time in BJD_TDB, the uncertainty of the mid-time, the epoch
          number based on the t_0 and period on ETD, and the ETD Data Quality
          factor, where 1 is best and 5 is worst.
        p_0: the period at epoch=0, as the string token read from the file
          header. Convert with float() before doing arithmetic.
        t_0: the mid-time at epoch=0, as the string token read from the file
          header. It is returned exactly as read; no HJD->BJD conversion is
          applied to it here.

    Raises:
        ValueError: if the header row does not contain a number for the
          period or for the mid-time.
    """

    ### P_0 AND t_0 ###

    # Load in the header row, split in two via delimiter choice. A single
    # character delimiter is used because recent NumPy releases reject
    # multi-character ones; the whitespace left behind after the comma is
    # harmless as each field is tokenised on whitespace below.
    p_str, t_str = np.loadtxt(url, dtype=str, encoding='unicode_escape',
                              delimiter=',', skiprows=2, max_rows=1)

    # Now get just the number, lose everything else.
    p_0 = _last_float_token(p_str)
    t_0 = _last_float_token(t_str)

    ### TRANSIT DATA ###

    # Columns that we want
    csv_headers = ['#', 'HJDmid', 'HJDmid Error', 'Epoch', 'DQ']

    # What we want to name the columns ('#' becomes the index, so it is not
    # listed here).
    cols = ['tmid', 'err', 'epoch', 'DQ']

    # Now to import the actual transit data.
    data = pd.read_csv(url, delimiter=';', header=0, index_col=0,
                       usecols=csv_headers, skiprows=4)

    # Set the columns to the names we want
    data.index.rename('no', inplace=True)  # row header
    data.columns = cols  # columns

    # Convert from the truncated HJD to the full HJD
    data['tmid'] = data['tmid'] + 2400000

    # Assuming (fairly certain) ETD uses HJD_UTC, let's convert to BJD_TDB.
    # Of course, it might be worth checking that each individual data point
    # is actually in HJD, rather than e.g. BJD already, as ETD doesn't check it.
    ra, dec = plc.ra_dec_string_to_deg(right_ascension + " " + declination)

    # Convert to BJD_TDB. N.B. we assume uncertainty remains the same.
    # Iterating the Series directly yields its values and works on every
    # pandas version (Series.iteritems() no longer exists in pandas 2.x).
    bjd = np.array([plc.hjd_utc_to_bjd_tdb(ra, dec, hjd) for hjd in data['tmid']],
                   dtype=float).ravel()

    data['tmid'] = bjd

    return data, p_0, t_0

# Fetch the ETD transit table for WASP-12 b. The RA and Dec strings are passed
# to get_etd_data for the HJD_UTC -> BJD_TDB conversion; the URL is the ETD
# ASCII export for this planet. p_0 and t_0 are the period and reference
# mid-time read from that file's header.
mydf, p_0, t_0 = get_etd_data('06 30 32.79', '+29 40 20.26', 'http://var2.astro.cz/ETD/ascii-etd.php?id=246&STARNAME=WASP-12&PLANET=b&PER=1.0914222&EPOCH=2454508.97605')

# Bare expression: shows the resulting DataFrame as the cell output.
mydf



Unnamed: 0_level_0,tmid,err,epoch,DQ
no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
262,2.459148e+06,0.00073,4250,3
261,2.459089e+06,0.00136,4196,2
260,2.458930e+06,0.00052,4051,1
259,2.458906e+06,0.00060,4029,2
258,2.458895e+06,0.00263,4019,3
...,...,...,...,...
5,2.454841e+06,0.00047,304,1
4,2.454837e+06,0.00130,301,4
3,2.454837e+06,0.00156,301,3
2,2.454836e+06,0.00060,300,2


In [None]:
def filter_etd_data(no, bjd, mid_err, epoch, dq, filter_dq):
    """Filters ETD data. Checks uncertainty exists. Returns only datapoints with DQ equal or better than specified
    value. N.B. DQ 1 is best, 5 is worst.
    Inputs:
    - no, the ETD entry numbers. Accepted for interface compatibility only; not used by the filter.
    - bjd, the BJD mid-times
    - mid_err, the uncertainties of the mid-times
    - epoch, the epoch numbers
    - dq, the ETD Data Quality factors
    - filter_dq, the worst (largest) DQ value that is still accepted
    Outputs:
    - filtered_data, a list of three lists: BJD mid-time, uncertainty, and epoch, holding only the entries that
      have a strictly positive uncertainty and a DQ <= filter_dq. The input order is preserved.
    """

    # the plan: one array, contains three arrays - these three arrays are for mid, mid_err, epoch
    # we don't bother preserving no., it was just imported to be used basically as a debugging tool
    # and dq is pointless once we've filtered. This also makes the addition of non-ETD data far easier
    filtered_data = [[], [], []]

    # zip walks the columns by position, so this also works for pandas Series whose index labels are not 0..n-1
    for tmid, err, transit_epoch, quality in zip(bjd, mid_err, epoch, dq):
        # I don't trust data without a tmid uncertainty, so it must exist and be positive.
        # (0, a negative "missing" marker and NaN all fail this comparison.)
        if not err > 0:
            continue

        # check the DQ vs the specified DQ argument: smaller numbers are better
        if quality <= filter_dq:
            filtered_data[0].append(tmid)
            filtered_data[1].append(err)
            filtered_data[2].append(transit_epoch)

    return filtered_data # return the data array containing only the transits that's been filtered by DQ

## Statistics functions

These functions are for the $\chi^2$ comparison. I sincerely doubt that I haven't accidentally reinvented the wheel - functions for this probably already exist. However, it's so simple, I've not exactly wasted hours on these.

Equations are from "Measurements and Their Uncertainties: A Practical Guide to Modern Error Analysis: Hughes and
Hase 2010".

$$ \chi^2 = \sum_i{\frac{\left(y_i-y(x_i)\right)^2}{\alpha_i^2}}$$

$\nu$ is the degrees of freedom: the number of datapoints minus the number of fitted parameters. We divide $\chi^2$ by
 $\nu$ to obtain the reduced chi-squared, as follows:

$$ \chi^2_\text{reduced} = \frac{\chi^2}{\nu} $$

In [None]:
def chi_squared(y_i,yx,alpha):
    """Sums the squared, uncertainty-weighted residuals (the unreduced chi squared).
    Inputs:
    - y_i, the array of observed y values
    - yx, the array of model y values, one per observation
    - alpha, the array of uncertainties on the observed y values, in the same units as y
    Outputs:
    - the unreduced chi squared value (0 when y_i is empty)
    """

    total = 0
    for index, observed in enumerate(y_i):
        residual = observed - yx[index]
        # each term is residual^2 / uncertainty^2, accumulated in input order
        total = total + (residual**2) / (alpha[index]**2)
    return total

def reduced_chi_squared(y,yx,alpha,m):
    """Calculates the reduced chi squared: the unreduced chi squared divided by the degrees of freedom.
    The degrees of freedom are the number of observations, len(y), minus the number of fitted parameters m.
    Inputs:
    - y, array of observed y values
    - yx, array of y values of the fitted line, one per observation
    - alpha, array of uncertainties in y, in the same units as y
    - m, the number of fitted parameters
    Outputs:
    - the reduced chi squared value
    """
    degrees_of_freedom = len(y) - m
    return chi_squared(y, yx, alpha) / degrees_of_freedom

In [None]:
# TODO stop using globals. Consider OO principles, or structured/recorded arrays.
# TODO error handling and exceptions need to be added, because this too is currently a mess.

def add_own_data(filename):
    """Function to add archive data and your own data. Put it into three columns: tmid, error, and type of JD (Either
     "HJD" or "BJD", assuming HJD_UTC or BJD_TDB). Will be asked for the P_0 if ETD data has not been used; t_0 is
     then taken to be the earliest tmid given. NOTE: if ETD data has been used, it will force the ETD values.

    Reads the module-level names etd_used, ra, dec and url, which must be set before calling.
    When etd_used is true, p0_t0_from_etd(url) is called to obtain (p_0, t_0); both are passed through float()
    here in case that helper returns text.

    Inputs:
    - filename, the name/path of your data table (a headerless csv with exactly three columns)
    Outputs (in this order):
    - bjd, list of BJD_TDB mid-transit-times
    - mid_err, array of the uncertainties of those tmids (-1 where the csv field was empty)
    - epoch, list of transits since t_0, rounded to the nearest whole number
    - p_0, the period at epoch=0
    - t_0, midtime at epoch=0
    Raises:
    - ValueError, if the table is empty, does not have three columns, has a non-numeric tmid/uncertainty, or has a
      type that is neither "BJD" nor "HJD"
    """
    # TODO don't do this
    global url
    global ra
    global dec
    global etd_used

    # import all the data as text: the third column holds strings, which a numeric read would turn into NaN
    table = np.atleast_2d(np.genfromtxt(filename, dtype=str, encoding='unicode_escape', delimiter=',',
                                        autostrip=True))
    if table.size == 0 or table.shape[1] != 3:
        raise ValueError("Expected a non-empty table with three columns: tmid, error, and HJD/BJD.")

    mid = table[:, 0].astype(float)
    # an empty uncertainty field becomes -1, the marker for "no uncertainty given"
    mid_err = np.array([float(value) if value else -1.0 for value in table[:, 1]])
    jd_type = table[:, 2]

    bjd = []
    # check if we need to convert to BJD
    for row, (tmid, kind) in enumerate(zip(mid, jd_type)):
        if kind == "BJD":
            bjd.append(tmid)
        elif kind == "HJD":
            bjd.append(plc.hjd_utc_to_bjd_tdb(ra, dec, tmid))
        else:
            # stopping part-way would leave bjd shorter than mid_err, so refuse the whole table instead
            raise ValueError("Something has gone wrong! Your type value needs to be a string of BJD or HJD. "
                             "(row {}: {!r})".format(row, kind))

    # check if we need to assume a t_0 and p_0, if so, do so
    if not etd_used:
        # use the earliest tmid as t_0
        t_0 = np.amin(bjd)

        # ask for p_0 until we get a finite, positive number
        # TODO: a better check than this!
        p_0 = 0.0
        while not (np.isfinite(p_0) and p_0 > 0):
            try:
                p_0 = float(input("Please input the value of P, in days:"))
            except ValueError:
                p_0 = 0.0  # not a number: ask again
    else:
        # url should be defined already outside this function, so can just call it because we're bad people that
        # aren't obeying proper OO principles
        # TODO: obey proper OO principles
        p_0, t_0 = (float(value) for value in p0_t0_from_etd(url))

    # calculate the epoch system - assume closest rounding is correct via np.rint
    epoch = [np.rint((tmid - t_0) / p_0) for tmid in bjd]

    # return everything: again, three arrays, two floats
    return bjd, mid_err, epoch, p_0, t_0

In [None]:
def sort_etd_own_data(etd_data, own_data):
    """Merges the user's archive data into the filtered ETD data, keeping the ETD ordering.

    Each own datapoint is inserted in front of the first ETD entry whose epoch is less than or equal to its own
    epoch. This keeps the result ordered newest-first provided the ETD data is already ordered newest-first.
    A datapoint with an epoch smaller than every existing entry (or any datapoint when the ETD data is empty)
    is placed at the end, so no datapoint is lost.

    Inputs:
    -etd_data, the ETD data as three sequences: (bjd, mid_err, epoch)
    -own_data, archive data added from literature, may contain user's own datapoint; same three sequences
    Outputs:
    -sorted_data, the combination of these two datasets as [epoch, bjd, mid_err]. Note this order differs from
     the input order.
    """
    own_bjd, own_mid_err, own_epoch = own_data
    etd_bjd, etd_mid_err, etd_epoch = etd_data

    for new_bjd, new_err, new_epoch in zip(own_bjd, own_mid_err, own_epoch):
        # first slot holding an epoch that is not newer than the new point; fall back to the end of the array
        position = next(
            (index for index, existing in enumerate(etd_epoch) if new_epoch >= existing),
            len(etd_epoch),
        )
        # the three arrays are kept in lockstep, so all of them get the same insert position
        etd_epoch = np.insert(etd_epoch, position, new_epoch)
        etd_bjd = np.insert(etd_bjd, position, new_bjd)
        etd_mid_err = np.insert(etd_mid_err, position, new_err)

    return [etd_epoch, etd_bjd, etd_mid_err]