Skip to content

Commit

Permalink
Add dummy code method
Browse files Browse the repository at this point in the history
  • Loading branch information
Will-Tyler committed Feb 7, 2024
1 parent 7a418eb commit 53375bf
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 2 deletions.
1 change: 1 addition & 0 deletions hail/python/hail/docs/methods/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ identity by descent [1]_, KING [2]_, and PC-Relate [3]_.
maximal_independent_set
rename_duplicates
segment_intervals
dummy_code

.. [1] Purcell, Shaun et al. “PLINK: a tool set for whole-genome association and
population-based linkage analyses.” American journal of human genetics
Expand Down
3 changes: 2 additions & 1 deletion hail/python/hail/docs/methods/misc.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@ Miscellaneous
.. autofunction:: grep
.. autofunction:: maximal_independent_set
.. autofunction:: rename_duplicates
.. autofunction:: segment_intervals
.. autofunction:: segment_intervals
.. autofunction:: dummy_code
3 changes: 2 additions & 1 deletion hail/python/hail/methods/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
compute_charr,
vep_json_typ,
)
from .misc import rename_duplicates, maximal_independent_set, segment_intervals, filter_intervals
from .misc import rename_duplicates, maximal_independent_set, segment_intervals, filter_intervals, dummy_code
from .relatedness import identity_by_descent, king, pc_relate, simulate_random_mating

__all__ = [
Expand Down Expand Up @@ -133,6 +133,7 @@
'balding_nichols_model',
'ld_prune',
'filter_intervals',
'dummy_code',
'segment_intervals',
'de_novo',
'filter_alleles',
Expand Down
43 changes: 43 additions & 0 deletions hail/python/hail/methods/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,3 +490,46 @@ def segment_intervals(ht, points):
)
ht = ht.annotate(__new_intervals=interval_results, lower=lower, higher=higher).explode('__new_intervals')
return ht.key_by(**{list(ht.key)[0]: ht.__new_intervals}).drop('__new_intervals')


def _ordered_categories(categories):
    """Return ``categories`` as a list in a reproducible order."""
    try:
        # Keying on ``is None`` first keeps a missing category (if any) last
        # and avoids ever comparing ``None`` against a real value.
        return sorted(categories, key=lambda category: (category is None, category))
    except TypeError:
        # Values without a natural ordering: fall back to their repr so the
        # order is at least stable from run to run.
        return sorted(categories, key=repr)


def dummy_code(matrix_table: MatrixTable, *column_field_names: str):
    """Dummy code categorical variables.

    For every requested column field, one integer column field named
    ``<field_name>__<category>`` is added per distinct value observed in that
    field. The new field is ``1`` for columns whose value equals the category
    and ``0`` otherwise.

    Examples
    --------
    >>> mt = hl.balding_nichols_model(1, 50, 100)
    >>> smoking_categories = hl.literal(['current', 'former', 'never'])
    >>> mt = mt.annotate_cols(
    ...     smoking_status = smoking_categories[hl.rand_cat([1, 1, 1])],
    ...     is_case = hl.rand_bool(0.5)
    ... )
    >>> mt, dummy_names, categories = hl.methods.dummy_code(mt, "smoking_status")

    Notes
    -----
    An indicator is created for *every* observed category; no reference
    level is dropped.

    Parameters
    ----------
    matrix_table : :class:`.MatrixTable`
        A matrix table.
    *column_field_names : :obj:`str`
        The names of the column fields in the matrix table to dummy code.

    Returns
    -------
    :class:`.MatrixTable`
        The matrix table with the dummy-coded variables.
    :obj:`list`
        A list of the names of the dummy-coded variables, grouped by field in
        the order the fields were requested and ordered by category within
        each field.
    :obj:`dict`
        A dictionary mapping the column field names to the categories of the column field.
    """
    if not column_field_names:
        # Nothing to encode, so avoid launching an aggregation at all.
        return matrix_table, [], {}

    # Collect the categories of all requested fields in a single pass over the
    # columns instead of running one aggregation per field.
    collected = matrix_table.aggregate_cols(
        hl.struct(**{
            field_name: hl.agg.collect_as_set(matrix_table[field_name])
            for field_name in column_field_names
        })
    )
    field_name_to_categories = {
        field_name: collected[field_name]
        for field_name in column_field_names
    }

    # Iterate categories in a fixed order: iterating the raw set would make
    # the order of the generated fields vary between runs.
    # NOTE(review): if the aggregation can yield a missing (None) category, it
    # is still turned into a '<field>__None' indicator as before -- confirm
    # whether that is desired.
    dummy_variables = {
        f'{field_name}__{category}': hl.int(matrix_table[field_name] == category)
        for field_name in field_name_to_categories
        for category in _ordered_categories(field_name_to_categories[field_name])
    }
    matrix_table = matrix_table.annotate_cols(**dummy_variables)
    dummy_variable_field_names = list(dummy_variables)
    return matrix_table, dummy_variable_field_names, field_name_to_categories

0 comments on commit 53375bf

Please sign in to comment.