### Run upon export from spreadsheet

In [4]:
import os

from astroquery.mast import Catalogs
import numpy as np
import pandas as pd


# Input CSV locations used by the cells below.
tces_file = '/mnt/tess/labels/s33_cam2ccd14_sample.csv'
ext_data_file = '/mnt/tess/labels/ext_mast_data.csv'
labels_file = '/mnt/tess/labels/labels_ext_mission_test.csv'
splits_file = '/mnt/tess/labels/splits_v3.csv'


# Load the TCE export and add the column names used downstream as aliases of
# the exported ones. The original columns are kept alongside the aliases.
tce_table = pd.read_csv(tces_file, header=0, low_memory=False)
tce_table = tce_table.assign(
    tic_id=tce_table['star_tic'],
    # planet_tdur is divided by 24 (presumably hours -> days; confirm units).
    Duration=tce_table['planet_tdur'] / 24.0,
    Period=tce_table['planet_period'],
    RA=tce_table['star_ra'],
    # sector_id is a space-separated string; Sectors is the number of entries.
    Sectors=tce_table['sector_id'].apply(lambda sectors: len(sectors.split(' '))),
    Transit_Depth=tce_table['planet_depth'],
    Dec=tce_table['star_dec'],
    teff=tce_table['star_teff'],
    SN=tce_table['snr'],
    # No ingress value is available in this export; filled with a constant.
    Qingress=0.0,
    Tmag=tce_table['star_tmag'],
    logg=tce_table['star_logg'],
    Epoc=tce_table['planet_epoch'],
)
# Index by TIC id and discard the unnamed column carried over from the CSV.
tce_table = tce_table.set_index('tic_id').drop(columns=['Unnamed: 0'])

# Drop some common invalid examples before joining anything.
# Rows flagged by Ilabel (orbits falling inside the star).
tce_table = tce_table[~tce_table['Ilabel']]
# Excessively large durations: keep only Duration < 90% of the period.
tce_table = tce_table[tce_table['Duration'].lt(tce_table['Period'] * 0.9)]

# Attach the external MAST columns; the left join keeps TCEs without a match.
ext_table = pd.read_csv(ext_data_file, header=0, low_memory=False).set_index('tic_id')
joined_table = tce_table.join(ext_table, on='tic_id', how='left')

# Keep rows whose objType is either missing or exactly 'STAR'.
joined_table = joined_table[
    joined_table['objType'].isna() | joined_table['objType'].eq('STAR')
]

# Turn tic_id back into a column and keep only the output columns, in order.
joined_table = joined_table.reset_index()[[
    'tic_id', 'RA', 'Dec', 'Tmag', 'Epoc', 'Period', 'Duration',
    'Transit_Depth', 'Sectors', 'star_rad', 'star_mass', 'teff',
    'logg', 'SN', 'Qingress',
]]


labels_table = pd.read_csv(labels_file, header=0, low_memory=False)

# Disposition codes that get a disp_<code> counter column, and the labels-file
# columns holding each user's vote.
disps = ['E', 'J', 'N', 'S', 'B']
users = ['av', 'md', 'ch', 'as', 'mk', 'et']

# Start every disposition counter at zero; set_labels fills them in per row.
labels_table = labels_table.assign(**{f'disp_{code}': 0 for code in disps})

def set_labels(row):
    """Tally disposition votes for one row of the labels table.

    If the row has a non-null ``Final`` value, the matching
    ``disp_<Final>`` entry is set to 1 and the individual votes are ignored.
    Otherwise every non-null, non-empty vote from the module-level ``users``
    list, other than ``'U'``, increments its ``disp_<vote>`` counter.

    Meant to be used as ``DataFrame.apply(set_labels, axis=1)``. The row is
    copied first because pandas does not support functions that mutate the
    object handed to them by ``apply``.

    Args:
        row: pandas Series with a ``Final`` entry, one entry per user in
            ``users`` and the ``disp_*`` counters.

    Returns:
        A new Series with the updated ``disp_*`` counters; the input row is
        left untouched.
    """
    row = row.copy()
    present = row.notna()

    if present['Final']:
        row[f'disp_{row["Final"]}'] = 1
        return row

    # NOTE: an earlier rule that overrode md's 'B'/'N' vote when every other
    # voter chose 'J' was commented out and is not applied here.
    for user in users:
        vote = row[user]
        if present[user] and vote and vote != 'U':
            row[f'disp_{vote}'] += 1

    return row

# Expose 'TIC ID' under the tic_id name, then tally the votes row by row.
labels_table = labels_table.assign(tic_id=labels_table['TIC ID']).apply(set_labels, axis=1)

# Only the id and the disposition counters are needed from the labels.
labels_table = labels_table[['tic_id', *(f'disp_{d}' for d in disps)]].set_index('tic_id')


# Inner join: keep only TCEs that have a row in the labels table.
joined_table = joined_table.set_index('tic_id').join(labels_table, on='tic_id', how='inner')
print(f'Total entries: {len(joined_table)}')
# Discard rows where no disposition received any vote.
joined_table = joined_table.loc[
    lambda table: sum(table[f'disp_{d}'] for d in disps) > 0
]
print(f'Total labeled entries: {len(joined_table)}')


# Keep a handle on the full labeled table before it is narrowed to the splits.
all_table = joined_table
splits_table = pd.read_csv(splits_file, header=0, low_memory=False)
splits_table = splits_table.assign(tic_id=splits_table['TIC ID']).set_index('tic_id')
# Inner join: only TCEs that have an assigned split survive.
joined_table = joined_table.join(splits_table, on='tic_id', how='inner')

# One frame per split, with the split bookkeeping columns removed.
t_train, t_val, t_test = (
    joined_table[joined_table['Split'] == split_name].drop(
        columns=['Hemisphere', 'Seed randbetween(1, 100)', 'Split']
    )
    for split_name in ('train', 'val', 'test')
)
print(f'Split sizes. Train: {len(t_train)}; Valid: {len(t_val)}; Test: {len(t_test)}')


# Per-split exports are currently disabled; only the full table is written.
# t_train.to_csv('/mnt/tess/astronet/tces-v6-train.csv')
# t_val.to_csv('/mnt/tess/astronet/tces-v6-val.csv')
# t_test.to_csv('/mnt/tess/astronet/tces-v6-test.csv')
all_table.to_csv('/mnt/tess/astronet/tces-tmp-ext-all.csv')

Total entries: 590
Total labeled entries: 577
Split sizes. Train: 0; Valid: 0; Test: 0


In [None]:
# Show every column when displaying frames in the notebook.
pd.set_option('display.max_columns', None)
# Cap the sample size: sample(5) raises ValueError when the split has fewer
# than 5 rows (for example when the split is empty).
t_train.sample(min(5, len(t_train)))

In [None]:
# Cap the sample size: sample(5) raises ValueError when the split has fewer
# than 5 rows (for example when the split is empty).
t_val.sample(min(5, len(t_val)))

In [None]:
# Cap the sample size: sample(5) raises ValueError when the split has fewer
# than 5 rows (for example when the split is empty).
t_test.sample(min(5, len(t_test)))

### Run once

In [None]:
def load_tces_old():
    """Load the old TCE table, reduced to the preferred rows per ``tic_id``.

    Reads ``/mnt/tess/astronet/tces.csv`` and, for each ``tic_id``, keeps the
    rows whose ``Sectors`` equals the largest value for that id; among those,
    keeps the rows whose ``row_id`` equals the largest remaining value.

    Returns:
        The filtered DataFrame. The helper columns ``Sectors_max`` and
        ``row_id_max`` added by the joins are still present.
    """
    def keep_rows_at_group_max(table, column):
        # Join each row with the per-tic_id maximum of `column`, then keep
        # only the rows that attain that maximum.
        group_max = table.groupby('tic_id')[column].max()
        with_max = table.join(group_max, on='tic_id', how='right', rsuffix='_max')
        return with_max[with_max[column] == with_max[f'{column}_max']]

    old_tces = pd.read_csv('/mnt/tess/astronet/tces.csv', header=0).set_index('tic_id')

    # Only keep the max sectors read, then the max row ID.
    old_tces = keep_rows_at_group_max(old_tces, 'Sectors')
    return keep_rows_at_group_max(old_tces, 'row_id')

def generate_tce_bls_instar():
    """Combine the new, old and northern TCE tables into one CSV.

    Reads ``tce_bls_instar.csv`` and ``tce_north_instar.csv`` from
    ``/mnt/tess/labels`` and the de-duplicated old table from
    ``load_tces_old()``. The new and old tables are outer-joined on
    ``tic_id``; wherever a listed column is missing in the new data it is
    filled from the corresponding ``*_old`` column. The northern table is
    then appended, missing ``Ilabel`` values are set to ``False`` and the
    result is written to ``/mnt/tess/labels/tce_bls_instar+old.csv``.

    Returns:
        None. The only output is the CSV file.
    """
    tcenew = pd.read_csv('/mnt/tess/labels/tce_bls_instar.csv', header=0).set_index('tic_id')
    tceold = load_tces_old()
    tcenorth = pd.read_csv('/mnt/tess/labels/tce_north_instar.csv', header=0).set_index('tic_id')

    # Copy from old data where it's missing from the new.
    alltce = tcenew.join(tceold, how='outer', on='tic_id', rsuffix='_old')
    alltce = alltce.set_index('tic_id')

    alltce = alltce.drop(columns=['row_id'])

    # Columns that exist in both tables and are backfilled from the old one.
    backfill_columns = (
        'toi_id', 'Disposition', 'RA', 'Dec', 'Tmag', 'Epoc', 'Period',
        'Duration', 'Transit_Depth', 'Sectors', 'camera', 'ccd',
        'star_rad', 'star_mass', 'teff', 'logg', 'SN', 'Qingress',
    )
    for col_name in backfill_columns:
        missing = alltce[col_name].isna()
        alltce.loc[missing, col_name] = alltce.loc[missing, col_name + '_old']

    # NOTE(review): only '*_old' columns are removed here; the 'Sectors_max'
    # and 'row_id_max' helper columns coming from load_tces_old() appear to
    # stay in the output -- confirm that is intended.
    alltce = alltce.drop(columns=[c for c in alltce.columns if c.endswith('_old')])

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent and also works on older versions.
    alltce = pd.concat([alltce, tcenorth])

    alltce['Ilabel'] = alltce['Ilabel'].fillna(False)

    alltce.to_csv('/mnt/tess/labels/tce_bls_instar+old.csv')