Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add functionality for MR metadata reading from SAV #313

Open
wants to merge 35 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
b96798d
Add functionality for MR metadata reading from SAV
slobodan-ilic Apr 24, 2024
850f0df
Try fixing build
slobodan-ilic May 5, 2024
bae8721
Fix issues with null-termination of mr string
slobodan-ilic Jun 3, 2024
55af2f2
Refactor of mr parsing
slobodan-ilic Jun 4, 2024
e471605
Try fixing fuzzifier
slobodan-ilic Jun 4, 2024
789511a
wip
slobodan-ilic Jun 4, 2024
622301c
fixup! Try fixing fuzzifier
slobodan-ilic Jun 4, 2024
8b453bd
fixup! wip
slobodan-ilic Jun 4, 2024
0a83ade
fixup! fixup! wip
slobodan-ilic Jun 4, 2024
26e96c7
Fix error found by fuzzifier
slobodan-ilic Jun 12, 2024
481a7d1
Fix another malloc issue found with fuzzer
slobodan-ilic Jun 13, 2024
ec778f5
Another malloc fix
slobodan-ilic Jun 13, 2024
d30f048
try fix oom found with fuzzer
slobodan-ilic Jun 13, 2024
7211183
free memory
slobodan-ilic Jun 14, 2024
7cc0ef8
Fail early on bad MR string
slobodan-ilic Jun 14, 2024
41a7ac4
Fix Win build
slobodan-ilic Jun 14, 2024
04c3d33
Test fuzzer with freeing memory
slobodan-ilic Jun 14, 2024
a8f252a
Try debug fuzzer on CI (amend)
slobodan-ilic Jun 14, 2024
a1a69bc
Test Fuzzer hatchet style
slobodan-ilic Jun 15, 2024
1b0b133
Fix accidental delete
slobodan-ilic Jun 15, 2024
1c78b3c
Un-hatchet after successful fuzz run
slobodan-ilic Jun 15, 2024
fac517b
Un-hatchet pt2
slobodan-ilic Jun 15, 2024
0a76076
Un-hatchet pt3
slobodan-ilic Jun 15, 2024
213a76a
Fix actual logic
slobodan-ilic Jun 15, 2024
68b2ecb
Rewrite parsing logic with Ragel
slobodan-ilic Jun 20, 2024
12fa4b2
try fixing appveyor build
slobodan-ilic Jun 20, 2024
0a11d5c
Try fix build pt2
slobodan-ilic Jun 20, 2024
8975ade
Try fix build pt3
slobodan-ilic Jun 20, 2024
1c92bd2
Fix attempt pt 4
slobodan-ilic Jun 20, 2024
db6164e
Try fix build pt5
slobodan-ilic Jun 20, 2024
07e323f
Fix build pt6
slobodan-ilic Jun 20, 2024
b0a99ef
Fix functionality
slobodan-ilic Jun 21, 2024
0fbca90
try fix build
slobodan-ilic Jun 21, 2024
fc836e7
Try fix build
slobodan-ilic Jun 21, 2024
6f500cb
Change parser to full-ragel
slobodan-ilic Jun 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions src/readstat.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,16 @@ typedef enum readstat_error_e {

const char *readstat_error_message(readstat_error_t error_code);

/* One multiple-response (MR) set parsed from a single "NAME=T..." line of
 * the SAV multiple-response-sets record. The strings and the subvariables
 * array are heap-allocated by the parser. */
typedef struct mr_set_s {
/* Type character found immediately after '=' in the line; the parser
 * handles 'D' and 'C' explicitly. */
char type;
/* Text preceding '=' in the line (NUL-terminated copy). */
char *name;
/* Length-prefixed label text copied from the line (NUL-terminated). */
char *label;
/* Set to 1 when type is 'D', 0 when type is 'C'. */
int is_dichotomy;
/* Counted value parsed for 'D' sets; -1 for 'C' sets. */
int counted_value;
/* Array of num_subvars NUL-terminated subvariable names. */
char **subvariables;
/* Number of entries in subvariables. */
int num_subvars;
} mr_set_t;

typedef struct readstat_metadata_s {
int64_t row_count;
int64_t var_count;
Expand All @@ -121,6 +131,8 @@ typedef struct readstat_metadata_s {
const char *file_label;
const char *file_encoding;
unsigned int is64bit:1;
size_t multiple_response_sets_length;
mr_set_t *mr_sets;
} readstat_metadata_t;

/* If the row count is unknown (e.g. it's an XPORT or POR file, or an SAV
Expand All @@ -138,6 +150,8 @@ readstat_endian_t readstat_get_endianness(readstat_metadata_t *metadata);
const char *readstat_get_table_name(readstat_metadata_t *metadata);
const char *readstat_get_file_label(readstat_metadata_t *metadata);
const char *readstat_get_file_encoding(readstat_metadata_t *metadata);
const mr_set_t *readstat_get_mr_sets(readstat_metadata_t *metadata);
size_t readstat_get_multiple_response_sets_length(readstat_metadata_t *metadata);

typedef struct readstat_value_s {
union {
Expand Down
8 changes: 8 additions & 0 deletions src/readstat_metadata.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,11 @@ const char *readstat_get_file_encoding(readstat_metadata_t *metadata) {
/* Accessor: returns the table_name pointer stored in the metadata struct.
 * The pointer is returned as-is; no copy is made. */
const char *readstat_get_table_name(readstat_metadata_t *metadata) {
    const char *table_name = metadata->table_name;

    return table_name;
}

/* Accessor: returns the number of multiple-response sets recorded in the
 * metadata struct (the multiple_response_sets_length field). */
size_t readstat_get_multiple_response_sets_length(readstat_metadata_t *metadata) {
    size_t set_count = metadata->multiple_response_sets_length;

    return set_count;
}

/* Accessor: returns the mr_sets pointer stored in the metadata struct.
 * The pointer is returned as-is; no copy is made.
 *
 * NOTE(review): a reviewer on this change suggested that this function be
 * called readstat_get_multiple_response_sets. */
const mr_set_t *readstat_get_mr_sets(readstat_metadata_t *metadata) {
    const mr_set_t *sets = metadata->mr_sets;

    return sets;
}
5 changes: 5 additions & 0 deletions src/spss/readstat_sav.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
//

#include "readstat_spss.h"
#include "../readstat.h"

#pragma pack(push, 1)

Expand Down Expand Up @@ -100,6 +101,9 @@ typedef struct sav_ctx_s {
uint64_t lowest_double;
uint64_t highest_double;

size_t multiple_response_sets_length;
mr_set_t *mr_sets;

double bias;
int format_version;

Expand All @@ -117,6 +121,7 @@ typedef struct sav_ctx_s {

#define SAV_RECORD_SUBTYPE_INTEGER_INFO 3
#define SAV_RECORD_SUBTYPE_FP_INFO 4
#define SAV_RECORD_SUBTYPE_MULTIPLE_RESPONSE_SETS 7
#define SAV_RECORD_SUBTYPE_PRODUCT_INFO 10
#define SAV_RECORD_SUBTYPE_VAR_DISPLAY 11
#define SAV_RECORD_SUBTYPE_LONG_VAR_NAME 13
Expand Down
219 changes: 219 additions & 0 deletions src/spss/readstat_sav_read.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "../readstat_iconv.h"
#include "../readstat_convert.h"
#include "../readstat_malloc.h"
#include "../CKHashTable.h"

#include "readstat_sav.h"
#include "readstat_sav_compress.h"
Expand Down Expand Up @@ -145,6 +146,187 @@ static readstat_error_t sav_parse_variable_display_parameter_record(sav_ctx_t *c
static readstat_error_t sav_parse_machine_integer_info_record(const void *data, size_t data_len, sav_ctx_t *ctx);
static readstat_error_t sav_parse_long_string_value_labels_record(const void *data, size_t size, size_t count, sav_ctx_t *ctx);
static readstat_error_t sav_parse_long_string_missing_values_record(const void *data, size_t size, size_t count, sav_ctx_t *ctx);
static readstat_error_t sav_read_multiple_response_sets(size_t data_len, sav_ctx_t *ctx);

/* Frees every allocation owned by *set and resets it to the all-zero
 * "invalid" state (type '\0', name NULL, no subvariables). */
static void mr_line_discard(mr_set_t *set) {
    static const mr_set_t empty_set = {0};

    for (int i = 0; i < set->num_subvars; i++) {
        free(set->subvariables[i]);
    }
    free(set->subvariables);
    free(set->label);
    free(set->name);
    *set = empty_set;
}

/* Returns a freshly allocated, NUL-terminated copy of the `length` bytes
 * starting at `start`, or NULL if the allocation fails. The caller must
 * guarantee that those bytes contain no NUL. */
static char *mr_line_copy_span(const char *start, size_t length) {
    char *copy = malloc(length + 1);

    if (copy != NULL) {
        memcpy(copy, start, length);
        copy[length] = '\0';
    }
    return copy;
}

/*
 * Parses one multiple-response-set line of the form
 *
 *     NAME=C LEN LABEL SUBVAR SUBVAR ...
 *     NAME=D<WIDTH> <VALUE> LEN LABEL SUBVAR SUBVAR ...
 *
 * where LEN is the decimal byte length of LABEL, and, for 'D' sets, WIDTH is
 * the decimal digit count of the numeric counted VALUE.
 *
 * line: NUL-terminated text of a single set (may be NULL).
 *
 * Returns a set whose name, label and subvariables are heap-allocated and
 * owned by the caller. On ANY failure (NULL input, no '=', malformed fields,
 * allocation failure) every partial allocation is released and an all-zero
 * set is returned, so callers can detect failure with `name == NULL` and can
 * always safely iterate `num_subvars` entries. A diagnostic is written to
 * stderr for malformed fields and allocation failures.
 */
static mr_set_t parse_mr_line(const char *line) {
    mr_set_t result = {0};
    const char *error = NULL;
    const char *equals_pos = (line != NULL) ? strchr(line, '=') : NULL;

    /* A line needs both '=' and a type character right after it. */
    if (equals_pos == NULL || equals_pos[1] == '\0') {
        return result;
    }

    result.name = mr_line_copy_span(line, (size_t)(equals_pos - line));
    if (result.name == NULL) {
        error = "Failed to allocate memory for name";
        goto fail;
    }

    result.type = equals_pos[1];
    result.counted_value = -1; /* stays -1 unless a 'D' set supplies one */

    const char *cursor = equals_pos + 2; /* first byte after the type character */

    if (result.type == 'D') {
        result.is_dichotomy = 1;

        /* WIDTH: everything up to the next space. */
        const char *width_start = cursor;
        while (*cursor != ' ' && *cursor != '\0') {
            cursor++;
        }
        long value_width = strtol(width_start, NULL, 10);
        if (*cursor != ' ') {
            error = "Expected a space after the internal count";
            goto fail;
        }
        cursor++;

        /* VALUE: at most WIDTH decimal digits. */
        const char *value_start = cursor;
        for (long i = 0; i < value_width && isdigit((unsigned char)*cursor); i++) {
            cursor++;
        }
        result.counted_value = (int)strtol(value_start, NULL, 10);
        if (*cursor != ' ' && *cursor != '\0') {
            error = "Expected a space or end of string after the counted value";
            goto fail;
        }
    }

    /* A single space separates the type section from the label length. */
    if (*cursor != ' ') {
        error = "Expected a space before the label length";
        goto fail;
    }
    cursor++;

    const char *length_start = cursor;
    while (isdigit((unsigned char)*cursor)) {
        cursor++;
    }
    if (*cursor != ' ') {
        error = "Expected a space after the digits";
        goto fail;
    }
    size_t label_length = strtoul(length_start, NULL, 10);
    cursor++; /* step over the space that follows the length */

    /* The declared length is untrusted: make sure the label really fits in
     * what is left of the line before copying it. */
    if (strlen(cursor) < label_length) {
        error = "Not enough characters available to read the specified count";
        goto fail;
    }
    result.label = mr_line_copy_span(cursor, label_length);
    if (result.label == NULL) {
        error = "Failed to allocate memory for label";
        goto fail;
    }
    cursor += label_length;

    if (*cursor != ' ') {
        error = "Expected a space after the label";
        goto fail;
    }
    cursor++;

    /* The rest of the line is a space-separated list of subvariable names;
     * runs of spaces are tolerated. */
    while (*cursor != '\0') {
        if (*cursor == ' ') {
            cursor++;
            continue;
        }

        const char *start = cursor;
        while (*cursor != '\0' && *cursor != ' ') {
            cursor++;
        }

        /* Grow the array first so result always owns everything allocated
         * so far and the failure path can release it in one place. */
        char **grown = realloc(result.subvariables,
                ((size_t)result.num_subvars + 1) * sizeof *grown);
        if (grown == NULL) {
            error = "Failed to allocate memory for subvariables array";
            goto fail;
        }
        result.subvariables = grown;

        char *subvariable = mr_line_copy_span(start, (size_t)(cursor - start));
        if (subvariable == NULL) {
            error = "Failed to allocate memory for a subvariable";
            goto fail;
        }
        result.subvariables[result.num_subvars++] = subvariable;
    }

    return result;

fail:
    fprintf(stderr, "%s\n", error);
    mr_line_discard(&result);
    return result;
}

/*
 * Reads the data_len-byte multiple-response-sets record from ctx->io, splits
 * it into per-set lines and stores one parsed mr_set_t per line in
 * ctx->mr_sets, setting ctx->multiple_response_sets_length to the number of
 * entries stored.
 *
 * Returns READSTAT_OK on success, READSTAT_ERROR_PARSE if the record cannot
 * be read in full, or READSTAT_ERROR_MALLOC if memory runs out. On a malloc
 * failure while growing the array, the sets parsed so far remain in
 * ctx->mr_sets and are counted in ctx->multiple_response_sets_length.
 *
 * Tokenization uses strspn/strcspn instead of strtok: strtok keeps hidden
 * static state and is therefore not thread-safe (raised in review), and
 * strtok_r/strtok_s reportedly caused build problems on some toolchains.
 *
 * NOTE(review): '$' is kept as a delimiter to preserve the existing
 * tokenization, but a '$' inside a label or subvariable name would split a
 * set in two; confirm against the file format whether splitting on '\n'
 * alone (and skipping the leading '$') is the intended behavior.
 */
static readstat_error_t sav_read_multiple_response_sets(size_t data_len, sav_ctx_t *ctx) {
    static const char delimiters[] = "$\n";
    readstat_error_t retval = READSTAT_OK;
    size_t num_sets = 0;
    char *mr_string = NULL;
    char *cursor = NULL;

    /* One extra byte is needed for the terminating NUL; reject a length
     * that would wrap around when it is added. */
    if (data_len + 1 < data_len)
        return READSTAT_ERROR_PARSE;

    mr_string = readstat_malloc(data_len + 1);
    if (mr_string == NULL)
        return READSTAT_ERROR_MALLOC;

    /* The cast makes a negative (error) return compare unequal to data_len
     * instead of slipping past a signed/unsigned "<" comparison. */
    if ((size_t)ctx->io->read(mr_string, data_len, ctx->io->io_ctx) != data_len) {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }
    /* The record is raw bytes; terminate it so the string functions below
     * cannot run off the end of the buffer. */
    mr_string[data_len] = '\0';

    cursor = mr_string;
    for (;;) {
        /* Skip delimiters, then carve out the next non-empty token. */
        cursor += strspn(cursor, delimiters);
        if (*cursor == '\0')
            break;

        char *token = cursor;
        cursor += strcspn(cursor, delimiters);
        if (*cursor != '\0')
            *cursor++ = '\0';

        /* Elements are mr_set_t values (not pointers), and the old array
         * must survive a failed realloc. */
        mr_set_t *grown = realloc(ctx->mr_sets, (num_sets + 1) * sizeof *grown);
        if (grown == NULL) {
            retval = READSTAT_ERROR_MALLOC;
            break;
        }
        ctx->mr_sets = grown;
        ctx->mr_sets[num_sets++] = parse_mr_line(token);
    }
    ctx->multiple_response_sets_length = num_sets;

cleanup:
    /* parse_mr_line copies everything it keeps, so the raw record buffer is
     * no longer needed on any path. */
    free(mr_string);
    return retval;
}

static void sav_tag_missing_double(readstat_value_t *value, sav_ctx_t *ctx) {
double fp_value = value->v.double_value;
Expand Down Expand Up @@ -1339,6 +1521,10 @@ static readstat_error_t sav_parse_records_pass1(sav_ctx_t *ctx) {
retval = sav_parse_machine_integer_info_record(data_buf, data_len, ctx);
if (retval != READSTAT_OK)
goto cleanup;
} else if (subtype == SAV_RECORD_SUBTYPE_MULTIPLE_RESPONSE_SETS) {
retval = sav_read_multiple_response_sets(data_len, ctx);
if (retval != READSTAT_OK)
goto cleanup;
} else {
if (io->seek(data_len, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
retval = READSTAT_ERROR_SEEK;
Expand Down Expand Up @@ -1665,6 +1851,8 @@ readstat_error_t readstat_parse_sav(readstat_parser_t *parser, const char *path,
goto cleanup;

metadata.file_label = ctx->file_label;
metadata.multiple_response_sets_length = ctx->multiple_response_sets_length;
metadata.mr_sets = ctx->mr_sets;

if (ctx->handle.metadata(&metadata, ctx->user_ctx) != READSTAT_HANDLER_OK) {
retval = READSTAT_ERROR_USER_ABORT;
Expand All @@ -1678,6 +1866,37 @@ readstat_error_t readstat_parse_sav(readstat_parser_t *parser, const char *path,
if ((retval = sav_handle_variables(ctx)) != READSTAT_OK)
goto cleanup;

ck_hash_table_t *var_dict = ck_hash_table_init(1024, 8);
for (size_t i = 0; i < ctx->varinfo_capacity; i++) {
spss_varinfo_t *current_varinfo = ctx->varinfo[i];
if (current_varinfo != NULL) {
ck_str_hash_insert(current_varinfo->name, current_varinfo, var_dict);
}
}
for (size_t i = 0; i < ctx->multiple_response_sets_length; i++) {
mr_set_t mr = ctx->mr_sets[i];
for (size_t j = 0; j < mr.num_subvars; j++) {
if (mr.type == 'C') {
char* sv_name_upper = malloc(strlen(mr.subvariables[i]) + 1);
for (int c = 0; mr.subvariables[j][c] != '\0'; c++) {
sv_name_upper[c] = toupper((unsigned char) mr.subvariables[j][c]);
}
sv_name_upper[strlen(mr.subvariables[j])] = '\0';
spss_varinfo_t *info = (spss_varinfo_t *)ck_str_hash_lookup(sv_name_upper, var_dict);
if (info) {
free(mr.subvariables[j]);
mr.subvariables[j] = malloc(strlen(info->longname) + 1);
if (mr.subvariables[j] == NULL) {
continue;
}
strcpy(mr.subvariables[j], info->longname);
}
}
}
}
if (var_dict)
ck_hash_table_free(var_dict);

if ((retval = sav_handle_fweight(ctx)) != READSTAT_OK)
goto cleanup;

Expand Down
Loading