In [3]:
import numpy as np
import pandas as pd
import pytest
import random
import string


def get_data():
    """Generate a reproducible synthetic data set of 200 observations.

    Both the NumPy global generator and the stdlib ``random`` module are
    re-seeded on every call, so repeated calls return identical data.

    Returns:
        tuple: ``(ftime, fstatus, cov)`` where

            * ftime is an array of 200 exponential draws,
            * fstatus is a pandas Series of 200 integer codes drawn from 0, 1, 2,
            * cov is a DataFrame with columns ``x1`` (random lowercase letter),
              ``x2`` (exponential), ``x3`` (standard normal) and ``x4``
              (integer drawn from 0..9).
    """
    n_samples = 200
    np.random.seed(42)
    random.seed(34)

    # The NumPy draws below must stay in this order: they all consume the
    # same seeded global stream.
    event_times = np.random.exponential(size=n_samples)
    event_codes = pd.Series(np.random.randint(0, 3, n_samples))
    exp_covariate = np.random.exponential(size=n_samples)
    normal_covariate = np.random.randn(n_samples)

    letters = []
    for _ in range(n_samples):
        letters.append(random.choice(string.ascii_lowercase))

    digit_covariate = np.random.randint(0, 10, n_samples).astype(int)

    covariates = pd.DataFrame({
        'x1': letters,
        'x2': exp_covariate,
        'x3': normal_covariate,
        'x4': digit_covariate,
    })
    return event_times, event_codes, covariates

# Materialise the synthetic data set at notebook level.
ftime, fstatus, cov = get_data()


In [4]:
import pandas as pd

from pandas.api.types import is_numeric_dtype


def to_categorical(df, columns):
    """Return a COPY of ``df`` with the given text columns replaced by integer codes.

    Args:
        df (pd.DataFrame): some data frame; it is not modified.
        columns (list): names of the TEXT columns to be interpreted as categories.

    Returns:
        pd.DataFrame: a copy of ``df`` in which every listed column holds the
        integer category codes produced by ``astype('category').cat.codes``.
    """
    _df = df.copy()
    for c in columns:
        # Plain column assignment replaces the column together with its dtype.
        # `_df.loc[:, c] = ...` writes into the existing column in place on
        # pandas >= 2.0, which leaves a text column with `object` dtype so it
        # would still be reported as non numeric.
        _df[c] = df[c].astype('category').cat.codes
    return _df


def non_numeric_columns(df):
    """Return the names of the columns of ``df`` whose dtype is not numeric.

    Column order follows ``df.columns``.
    """
    names = []
    for name in df.columns:
        if is_numeric_dtype(df[name]):
            continue
        names.append(name)
    return names


def _to_dummies(df, column_name, base=None):
    _df = df.copy()
    _dummies = pd.get_dummies(df[column_name], prefix=column_name).astype('int64')
    if base is not None:
        _dummies = _dummies.drop('%s_%s' % (column_name, base), axis=1)
    _df = _df.drop(column_name, axis=1)
    return pd.concat([_df, _dummies], axis=1)


def as_indicators(df, column_names, bases=None):
    """Expand several columns of ``df`` into indicator columns, one after another.

    Args:
        df (pd.DataFrame): input frame; a copy is worked on.
        column_names (list): columns to expand with ``_to_dummies``.
        bases (list, optional): one base level per entry of ``column_names``
            (``None`` keeps every level). Defaults to no base for any column.
            Columns and bases are paired with ``zip``.

    Returns:
        pd.DataFrame: a new frame with every listed column replaced by its
        indicator columns.
    """
    base_per_column = [None] * len(column_names) if bases is None else bases
    encoded = df.copy()
    for name, reference in zip(column_names, base_per_column):
        encoded = _to_dummies(encoded, name, base=reference)
    return encoded


In [14]:
!conda list | grep rpy2

rpy2                      3.4.5                    pypi_0    pypi


In [15]:
!pip install rpy2

Collecting rpy2
  Using cached rpy2-3.4.5-py3-none-any.whl
Installing collected packages: rpy2
Successfully installed rpy2-3.4.5


In [16]:
import enum

import numpy as np
import pandas as pd
import rpy2 as R
import rpy2.rinterface as rinterface

from rpy2 import robjects
from pandas.api.types import is_numeric_dtype
from rpy2.robjects import r, pandas2ri, numpy2ri
from rpy2.robjects.packages import importr

# Switch on rpy2's pandas and numpy conversion modules for the whole session
# (module-level side effect executed when this cell runs).
pandas2ri.activate()
numpy2ri.activate()


class NotImplementedError(Exception):
    """Raised when a requested conversion is not supported yet.

    NOTE(review): this name shadows the builtin ``NotImplementedError`` for
    every piece of code defined after it in the same namespace.
    """


class InputError(Exception):
    """Project-specific exception type for invalid input."""


class Dtypes(enum.Enum):
    """Enumeration pairing Python scalar type names with ``rinterface.RTYPES`` codes.

    NOTE(review): in the visible part of the notebook this enum is only
    referenced by commented-out code (the disabled ``r_vec`` helper) - confirm
    whether it is still needed.
    """
    # Member names intentionally mirror the Python type names; inside this
    # class body they shadow the builtins of the same name.
    int = rinterface.RTYPES.INTSXP
    float = rinterface.RTYPES.REALSXP
    bool = rinterface.RTYPES.LGLSXP
    str = rinterface.RTYPES.STRSXP


# def r_vec(np_vec, dtype):
#     return rinterface.SexpVector(np_vec, dtype.value)

def r_vector(np_vector):
    """Convert a numpy vector to an R vector.

    Args:
        np_vector (np.array): 1 dimensional array of bool, int, float or str

    Returns:
        rpy2.rinterface vector: ``IntSexpVector``, ``FloatSexpVector``,
        ``BoolSexpVector`` or ``StrSexpVector`` depending on the input dtype

    Raises:
        ValueError: if the input is not 1 dimensional
        NotImplementedError: if the dtype is none of the supported kinds
    """

    if np_vector.ndim != 1:
        msg = 'Input niput dimension is %s and MUST be 1' % np_vector.ndim
        raise ValueError(msg)

    d_type = np_vector.dtype
    if np.issubdtype(d_type, np.integer):
        return rinterface.IntSexpVector(np_vector)
    elif np.issubdtype(d_type, np.floating):
        return rinterface.FloatSexpVector(np_vector)
    # `np.bool_` / `np.str_` are the numpy scalar types. The bare aliases
    # `np.bool` and `np.str` were removed in NumPy 1.24 and made every call
    # that got past the int/float branches fail with AttributeError.
    elif np.issubdtype(d_type, np.bool_):
        return rinterface.BoolSexpVector(np_vector)
    elif np.issubdtype(d_type, np.str_):
        return rinterface.StrSexpVector(np_vector)
    else:
        msg = "Can't convert vectors with dtype %s yet" % d_type
        raise NotImplementedError(msg)


def r_matrix(np_matrix, col_names=None):
    """Convert a 2-D numpy array to an R matrix with named columns.

    Args:
        np_matrix (np.array): 2 dimensional array
        col_names (list, optional): column names to assign; when omitted the
            columns are named ``'x_1', 'x_2', ...``

    Returns:
        the R matrix built by ``robjects.r.matrix`` with ``colnames`` set

    Raises:
        ValueError: if the input is not 2 dimensional
    """
    if np_matrix.ndim != 2:
        raise ValueError('Input input dimension is %s and MUST be 2' % np_matrix.ndim)

    rows, cols = np_matrix.shape
    converted = robjects.r.matrix(np_matrix, nrow=rows, ncol=cols)

    names = col_names
    if names is None:
        # default names are 1-based: x_1 ... x_<ncol>
        names = ['x_%s' % position for position in range(1, cols + 1)]

    converted.colnames = robjects.StrVector(names)
    return converted

from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

def r_dataframe(pd_dataframe):
    """Convert a pandas DataFrame to its rpy2 representation.

    The conversion runs inside a local converter combining rpy2's default
    converter with the pandas one (this replaces the older
    ``pandas2ri.py2ri(pd_dataframe)`` call).

    Args:
        pd_dataframe (pd.DataFrame): frame to convert

    Returns:
        the object produced by ``robjects.conversion.py2rpy``
    """
    pandas_aware = robjects.default_converter + pandas2ri.converter
    with localconverter(pandas_aware):
        return robjects.conversion.py2rpy(pd_dataframe)


def parse_r_list(r_list):
    """Turn a named R list into a dict of numpy arrays.

    Args:
        r_list: iterable exposing a ``names`` attribute aligned with its items

    Returns:
        dict: maps every name to ``np.array(item)`` for the matching item
    """
    return {
        name: np.array(item)
        for name, item in zip(r_list.names, r_list)
    }


MemoryError: Cannot allocate write+execute memory for ffi.callback(). You might be running on a system that prevents this. For more information, see https://cffi.readthedocs.io/en/latest/using.html#callbacks

In [None]:
import numpy as np
import pandas as pd

from collections import namedtuple
from rpy2.robjects import pandas2ri, numpy2ri
from rpy2.robjects.packages import importr as import_R
from scipy.stats import norm as normal


# Switch on rpy2's numpy conversion module for the whole session.
numpy2ri.activate()


# Handle on the R `cmprsk` package, loaded through rpy2's `importr`
# (assumes the package is installed in the R library rpy2 uses - TODO confirm).
r_cmprsk = import_R('cmprsk')


class NonNumericCovariateError(Exception):
    """Raised when a covariate data frame contains non numeric columns."""



def _series_to_array(values):
    """Return the underlying numpy array of a pandas Series; pass anything else through."""
    if isinstance(values, pd.Series):
        return values.values
    return values


def crr(failure_time, failure_status, static_covariates, cengroup=None, failcode=1, cencode=0,
        subset=None, **kwargs):
    """Call ``crr`` from the R ``cmprsk`` package on python inputs.

    Args:
        failure_time (np.array or pandas.Series): vector of failure/censoring times
        failure_status (np.array or pandas.Series): vector with a unique code for each failure type and a separate
            code for censored observations
        static_covariates (pd.DataFrame): time independent covariates. numeric only dataframe

    Keyword Args:
        cengroup (np.array or pandas.Series): vector with different values for
            each group with a distinct censoring distribution
        failcode (int): code of fstatus that denotes the failure type of interest
        cencode (int): code of fstatus that denotes censored observations
        subset (numpy.array or pandas Series): a logical vector specifying a subset of cases
            to include in the analysis
        **kwargs: forwarded unchanged to the R ``crr`` call

    Note:
        na.action is `omit`

    Returns:
        the object returned by the R ``crr`` call. No ``CrrResult`` wrapper is
        defined in this notebook, so the raw result is returned.

    Raises:
        NonNumericCovariateError: if ``static_covariates`` has non numeric columns
    """
    non_numeric_cols = non_numeric_columns(static_covariates)
    if non_numeric_cols:
        msg = """

        Input dataframe contains non numeric columns: {}.
        Please convert text columns using `to_categorical` method first""".format(non_numeric_cols)
        raise NonNumericCovariateError(msg)

    # The conversion helpers live at notebook level; there is no `rpy_utils`
    # module in scope here, so they are called directly.
    r_ftime = r_vector(_series_to_array(failure_time))
    r_fstatus = r_vector(_series_to_array(failure_status))
    r_static_cov = r_dataframe(static_covariates)

    if cengroup is not None:
        # unwrap a Series like every other vector argument
        kwargs['cengroup'] = r_vector(_series_to_array(cengroup))

    if subset is not None:
        kwargs['subset'] = r_vector(_series_to_array(subset))

    r_crr_result = r_cmprsk.crr(r_ftime, r_fstatus, r_static_cov,
                                failcode=failcode, cencode=cencode, **kwargs)
    # Previously the fit was computed and then discarded (implicit `None`).
    return r_crr_result

In [6]:
# Dummy-encode x1 and x4; level 'd' of x1 and level 5 of x4 are the dropped base categories.
cov_1 = as_indicators(cov, ['x1', 'x4'], bases=['d', 5])

In [9]:
cov_1

Unnamed: 0,x2,x3,x1_a,x1_b,x1_c,x1_e,x1_f,x1_g,x1_h,x1_i,...,x1_z,x4_0,x4_1,x4_2,x4_3,x4_4,x4_6,x4_7,x4_8,x4_9
0,0.642785,-1.067620,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0.102957,-0.142379,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0.676518,0.120296,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0.641450,0.514439,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.190195,0.711615,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.810652,-0.922165,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
196,1.465283,0.869606,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197,2.896172,1.355638,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
198,1.894772,0.413435,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
