In [None]:
import os
import pandas
import math
import itertools
import numpy
import torch

#folder = "data"
#file_type = ".tbl"
#parse_folder("drive/MyDrive/ECE324/data", ".tbl")

def _read_star_name(path):
  """Return the star name declared on the ``\\STAR_ID`` header line of *path*.

  Lines starting with ``\\STAR_ID_DEFINED`` are ignored. The name is the text
  after the first ``=``, with surrounding whitespace and all single/double
  quote characters removed. If no such line exists, a message is printed and
  the empty string is returned.
  """
  with open(path, "r") as handle:
    # Iterating the handle stops only at the real end of file, so blank
    # lines inside the header are not mistaken for EOF.
    for raw_line in handle:
      line = raw_line.strip()
      if line.startswith("\\STAR_ID") and not line.startswith("\\STAR_ID_DEFINED"):
        value = line.split("=", 1)[-1]
        return value.strip().replace("'", "").replace("\"", "")
  print("Star ID not found for file", path)
  return ""


def parse_folder(folder_name, suffix):
  """Load every data series in a folder and pack them into padded batches.

  Each file in *folder_name* whose name ends with *suffix* is read twice:
  once to find its ``\\STAR_ID`` header line, and once with pandas as a
  whitespace-delimited table in which everything after a backslash is
  treated as a comment. Rows with a missing value in column 0 or column 1
  are dropped; missing values in column 2 are replaced by 0. Files left
  with no rows are skipped with a printed message.

  Series are grouped by star name. Every series is lengthened to the
  longest series found by repeating its own values cyclically, and every
  star is given the same number of series by cycling through its own
  series.

  Args:
    folder_name: Directory to scan (not recursive).
    suffix: File-name suffix to select, e.g. ``".tbl"``.

  Returns:
    A tuple ``(stars_list, series_masks, batches_list)``:
      * ``stars_list`` - star names in the order first encountered.
      * ``series_masks`` - dict mapping each star name to a list of
        booleans, one per batch: ``False`` where the slot holds a distinct
        series of that star, ``True`` where the slot was filled by cycling.
      * ``batches_list`` - one double-precision tensor per batch, of shape
        ``(number of stars, 3, longest series length)``; row order matches
        ``stars_list``.
  """
  full_filenames = [
      os.path.join(folder_name, filename)
      for filename in os.listdir(folder_name)
      if filename.endswith(suffix)
  ]
  print("Found", len(full_filenames), "files.\n")

  max_series_length = 0
  max_num_series = 0

  # star name -> list of series; each series is [column 0, column 1, column 2].
  data_dict = dict()

  for file in full_filenames:
    star_name = _read_star_name(file)

    # sep=r"\s+" is the supported spelling of the deprecated
    # delim_whitespace=True option.
    time_series = pandas.read_table(file, comment="\\", header=None, sep=r"\s+", skipinitialspace=True)
    time_series = time_series.dropna(subset=[0, 1])

    if len(time_series) == 0:
      # An empty series cannot be tiled up to the common length (it would
      # divide by zero below), so leave it out.
      print("No usable rows in file", file)
      continue

    max_series_length = max(max_series_length, len(time_series))
    series_data = [
        time_series[0].tolist(),
        time_series[1].tolist(),
        time_series[2].fillna(0).tolist(),
    ]

    star_series = data_dict.setdefault(star_name, [])
    star_series.append(series_data)
    max_num_series = max(max_num_series, len(star_series))

  stars_list = list()
  series_masks = dict()
  batched_series = [list() for batch in range(max_num_series)]

  for star, star_series in data_dict.items():
    stars_list.append(star)
    num_padding = max_num_series - len(star_series)
    series_masks[star] = ([False] * len(star_series)) + ([True] * num_padding)

    # Repeat each axis end-to-end until it reaches the common length.
    padded_series = []
    for data_series in star_series:
      padded_axes = []
      for axis_values in data_series:
        extend_factor = math.ceil(max_series_length / len(axis_values))
        padded_axes.append((axis_values * extend_factor)[:max_series_length])
      padded_series.append(numpy.asarray(padded_axes, dtype=numpy.double))

    # Stars with fewer series than the maximum reuse their own series in
    # order; those reused slots are the ones flagged True in the mask.
    for data_series, into_batch in zip(itertools.cycle(padded_series), range(max_num_series)):
      batched_series[into_batch].append(data_series)

  batches_list = [torch.from_numpy(numpy.asarray(batch, dtype=numpy.double)) for batch in batched_series]

  print("Longest data series:", max_series_length)
  return stars_list, series_masks, batches_list

Found 1071 files.

Longest data series: 678


(['HD 142',
  'HD 4308',
  'HD 4208',
  'HD 3651',
  '54 Psc',
  'HD 2039',
  'HD 2638',
  'HD 37606',
  'BD -17 0063',
  'HD 2952',
  '2MASS J00203151+3058293',
  'GJ 3021',
  'HD 166',
  'HD 1461',
  'HD 10697',
  'HD 11506',
  'HD 11964',
  'HD 10780',
  'HD 12661',
  'HD 10700',
  'HD 10476',
  'HD 10647',
  'HD 9826',
  'ups And',
  'Ups And',
  'HR 458',
  '2MASS J01343317+6856535',
  'HD 8574',
  'HD 6434',
  'HD 5608',
  'HD 7924',
  'HD 5319',
  'HD 4614',
  'HD 4203',
  'HD 4628',
  'HD 4313',
  'HD 4732',
  'HD 22781',
  'HD 22484',
  'HD 23079',
  'HD 22049',
  'HD 23127',
  'Epsilon Eridani',
  'HD 20367',
  'HD 19994',
  'HD 20630',
  'HD 20782',
  'HD 20868',
  'HIP 14810',
  'HD 19373',
  'HD 17156',
  'Iota Hor',
  'HD 17925',
  'HIP 12961',
  'Iota Horologii',
  'HD 16760',
  '81 Cet',
  'HD 17051',
  'HD 16175',
  '30 Ari B',
  'HD 16417',
  'Gl 86',
  '75 Cet',
  'HD 16160',
  'WASP-33',
  'HD 16141',
  '79 Ceti',
  'GJ 86',
  'alpha Ari',
  'HD 38529',
  'HD 37124'