# Developing new functionality for determining file type
### Jameson Carter, 07/10/2024
Requirement: The current implementation of read_feh_data_file() uses an argument, `sample`, to determine whether the file it reads is a starting sample or an output sample. This is unnecessary because the type of a sample can be determined from the header file, as is done in the R implementation of this functionality.

### Get data with read functions

In [17]:
# Get most current read and save functions
%run ../feh_io/read_feh.py
%run ../feh_io/save_feh.py
import io
import struct
import numpy as np
import pyarrow.parquet as pq
import pyarrow as pa

# Each dataset is a header/person/family trio of .dat files in one directory.
_OUT_DIR = '../data/output/run-1006-baseline/base-v8/'
_IN_DIR = '../data/starting-sample/v2/'

# Output-run files: lower-case part names with an "_even" suffix.
out_header_file, out_person_file, out_family_file = (
    _OUT_DIR + 'dynasipp_' + part + '_even.dat'
    for part in ('header', 'person', 'family')
)

# Starting-sample files: upper-case part names, no suffix.
in_header_file, in_person_file, in_family_file = (
    _IN_DIR + 'dynasipp_' + part + '.dat'
    for part in ('HEADER', 'PERSON', 'FAMILY')
)

In [18]:
# Read the person, family and header files for both the output run and the
# starting sample.
# NOTE: the output person file is read with count=-1 (presumably "all records"
# -- confirm against read_feh_data_file); the other data files are limited to
# 10 records each.
perdata_out = read_feh_data_file(out_header_file, out_person_file, file_type='person', count=-1)
famdata_out = read_feh_data_file(out_header_file, out_family_file, file_type='family', count=10)
header_out = read_header_file(out_header_file)

perdata_in = read_feh_data_file(in_header_file, in_person_file, file_type='person', count=10)
famdata_in = read_feh_data_file(in_header_file, in_family_file, file_type='family', count=10)
header_in = read_header_file(in_header_file)

In [19]:
# Save the output person records as person_out.parquet in the run's output directory.
save_feh_parquet(perdata_out, '../data/output/run-1006-baseline/base-v8/', 'person_out')

Saved data as parquet to ../data/output/run-1006-baseline/base-v8/person_out.parquet


In [20]:
# Re-read the output person file with count=-1 (presumably "all records" -- confirm).
perdata_out = read_feh_data_file(out_header_file, out_person_file, file_type='person', count=-1)

In [21]:
# Load person_out.parquet with read_parquet_1 (not defined in this notebook;
# presumably provided by the %run'd feh_io scripts -- confirm).
table_test = read_parquet_1('../data/output/run-1006-baseline/base-v8/person_out.parquet')

In [22]:
# Load person_out.parquet with read_parquet_2 (not defined in this notebook;
# presumably an alternative reader from the %run'd feh_io scripts -- confirm).
table_test_2 = read_parquet_2('../data/output/run-1006-baseline/base-v8/person_out.parquet')

In [23]:
# Load person_out.parquet directly with pyarrow.parquet.read_table.
table_test_3 = pq.read_table('../data/output/run-1006-baseline/base-v8/person_out.parquet')

### Redefine functions to get variable selection

In [62]:
# Read 100 records from the output person file, passing vars=None
# (presumably "no variable selection, return every field" -- confirm).
# problem: the output vars are not the same as the input vars sometimes...
perdata_out = read_feh_data_file(out_header_file, out_person_file, vars = None,
                                 file_type='person', count=100)

In [63]:
perdata_out

array([(2, 1014, 14, 0, 0, 0, 258219, -1878449824, 67, 1, 2, 1,    0, 3, 1,    22, 18, 0, 0, 1,   3331,  -5185,    60,     0, 166,    897, -2933,   36,   36, 0,  11408,    0, -19409,  1093883, 0, 0, 9999,  0,   0,      0, 0,  2097, 0,  0,  0,  0, 0, 2, 0, 10,  0, 0, 0,  0,    74, 1014, 0, 2060,   1487, 0, 76, 0,  0, 1, 0, 0, 2060, 4, 0, 100, 0, 0,     0,   1703,   -192,     27,   4042,  -4549,     0,   0,   0, 0, 0, 1, 261955, 1993, 0, 9999,    92,    92,      0, 0, 1, 261955,     0,      0, 0,  0,  0, 0, 1, 0, 0,  1781, 0, 0, 0, 8, 3, 12, 0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [66]:
# Read 10 records from the output person file, requesting only the SEGTYPE
# and ETHNCTY variables.
perdata_out = read_feh_data_file(out_header_file, out_person_file, vars = ['SEGTYPE', 'ETHNCTY'],
                                 file_type='person', count=10)

In [67]:
perdata_out

array([(2, 1014), (2, 4000), (2, 4000), (2, 1014), (2, 1014), (2, 1014),
       (2, 2000), (2, 2000), (2, 1000), (2, 1000)],
      dtype=[('SEGTYPE', '<i4'), ('ETHNCTY', '<i4')])

In [68]:
# Read 10 records from the starting-sample person file without passing vars.
perdata_in = read_feh_data_file(in_header_file, in_person_file, file_type='person', count=10)


In [69]:
perdata_in

array([(2, 1000, 12, 0, 1,  0,      1,  1000906092, 26, 1, 1, 1, 0, 4, 0,  0,  0, 0, 0, 1, -9034, 13767, 15746,    0,  570,  2358,  3004, 2760, 2760, 0,   2768, 0,  4358,  1894602, 0, 0, 9999, 0,   0,  0, 0,     0,  4, 11, 11, 2, 0, 4, 0,  5, 11, 0, 0,  44,     1, 1024, 0,    0, 94200, 4, 22, 0, 0, 1,    0, 0,    0, 3, 0, 0, 0, 4,  2706, -11665,  -5188,  1052,  2141,   1857,      0,  0, 0, 0, 0, 0, 1, 1980,    0, 9999, 4993, 4993,     434832, 0, 2, 1,  2706,  1052, 0, 0, 0, 0,  4, 4,          0, 0, 0, 0, 0, 0, 12, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,   0,   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,     0,     0,     0,     0,     0,     0,    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,   