diff --git a/CITATION.cff b/CITATION.cff index 381ba9f..3b9e84a 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -5,7 +5,7 @@ authors: given-names: "Otto" orcid: "https://orcid.org/0000-0002-3363-9287" title: "Pyreadstat" -version: 1.2.0 +version: 1.2.1 doi: 10.5281/zenodo.6612282 date-released: 2018-09-24 url: "https://github.com/Roche/pyreadstat" diff --git a/README.md b/README.md index f0050a0..9829335 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ the original applications in this regard.** - [Missing Values](#missing-values) + [SPSS](#spss) + [SAS and STATA](#sas-and-stata) + - [Reading datetime and date columns](#reading-datetime-and-date-columns) - [Other options](#other-options) + [More writing options](#more-writing-options) - [File specific options](#file-specific-options) @@ -637,6 +638,36 @@ This is a list listing all user defined missing values. User defined missing values are currently not supported for file types other than sas7bdat, sas7bcat and dta. +#### Reading datetime and date columns + +SAS, SPSS and STATA represent datetime, date and other similar concepts as a numeric column and then apply a +display format on top. Roughly speaking, internally there are two possible representations: one for concepts with a day or lower +granularity (date, week, quarter, year, etc.) and those with a higher granularity than a day (datetime, time, hour, etc.). +The first group is susceptible to be converted to a python date object and the second to a python datetime object. + +Pyreadstat attempts to read columns with datetime, date and time formats that are convertible +to python datetime, date and time objects automatically. However there are other formats that are not fully convertible to +any of these formats, for example SAS "YEAR" (displaying only the year), "MMYY" (displaying only month and year), etc. 
+Because there are too many of these formats and these keep changing, it is not possible to implement a rule for each of +those, therefore these columns are not transformed and the user will obtain a numeric column. + +In order to cope with this issue, there are two options for each reader function: extra\_datetime\_formats and + extra\_date\_formats that allow the user to +pass these datetime or date formats, to transform the numeric values into datetime or date python objects. Then, the user +can format those columns appropriately; for example extracting the year only to an integer column in the case of 'YEAR' or +formatting it to a string 'YYYY-MM' in the case of 'MMYY'. The choice between datetime or date format depends on the granularity +of the data as explained above. + +These arguments are also useful in the case you have a valid datetime, date or time format that is currently not recognized in pyreadstat. +In those cases, feel free to file an issue to ask those to be added to the list; in the meantime you can use these arguments to do +the conversion. + +```python +import pyreadstat + +df, meta = pyreadstat.read_sas7bdat('/path/to/a/file.sas7bdat', extra_date_formats=["YEAR", "MMYY"]) +``` + #### Other options You can set the encoding of the original file manually. The encoding must be a [iconv-compatible encoding](https://gist.github.com/hakre/4188459). 
diff --git a/change_log.md b/change_log.md index 50dcfb0..6a70c4c 100644 --- a/change_log.md +++ b/change_log.md @@ -1,3 +1,11 @@ +# 1.2.1 (github, pypi and conda 2023.02.22) +* Readstat source updated to version 1.1.9 +* introduced recognition for pandas datatype datetime64[ns, UTC] and other datetime64 types when writing, + so that this column type gets correctly written as datetime +* introduced extra_datetime_formats and extra_date_formats arguments for read functions, cleaned the list of + sas date, datetime and time formats to exclude those not directly convertible to python objects +* improved performance of writer when there are datetime64 columns + # 1.2.0 (github, pypi and conda 2022.10.25) * Fixed #206, #207 * added pyproject.toml diff --git a/docs/_build/doctrees/environment.pickle b/docs/_build/doctrees/environment.pickle index 93f754a..61b86a2 100644 Binary files a/docs/_build/doctrees/environment.pickle and b/docs/_build/doctrees/environment.pickle differ diff --git a/docs/_build/doctrees/index.doctree b/docs/_build/doctrees/index.doctree index 3f1ef6c..a25b445 100644 Binary files a/docs/_build/doctrees/index.doctree and b/docs/_build/doctrees/index.doctree differ diff --git a/docs/_build/html/.buildinfo b/docs/_build/html/.buildinfo index ca965f6..d41041b 100644 --- a/docs/_build/html/.buildinfo +++ b/docs/_build/html/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 
-config: d1957ba96adbb9536e51c852822e9ccb +config: dc63e4405a0437fb9efe8c4f5ffb3848 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/_build/html/_static/documentation_options.js b/docs/_build/html/_static/documentation_options.js index 5be98c4..120e3ee 100644 --- a/docs/_build/html/_static/documentation_options.js +++ b/docs/_build/html/_static/documentation_options.js @@ -1,6 +1,6 @@ var DOCUMENTATION_OPTIONS = { URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), - VERSION: '1.2.0', + VERSION: '1.2.1', LANGUAGE: 'None', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/docs/_build/html/genindex.html b/docs/_build/html/genindex.html index 6382887..459099f 100644 --- a/docs/_build/html/genindex.html +++ b/docs/_build/html/genindex.html @@ -3,7 +3,7 @@ - Index — pyreadstat 1.2.0 documentation + Index — pyreadstat 1.2.1 documentation