In [2]:
import pandas as pd
import numpy as np

In [90]:
# Toy fixture with one column per situation the helpers below must handle.
dct = {
    'a': [10, 2, 43, 4, 5, 6],                      # integers, all distinct
    'b': ['2020-01-04', np.nan, '2020-01-03',       # date strings with a gap
          '2020-01-01', '2020-01-02', '2020-01-01'],
    'c': [1.0, np.nan, 0.1, 0.4, 0.6, 1.5],         # floats with a gap
    'd': list('abcabc'),                            # repeating string labels
    'e': [1] * 6,                                   # a single constant value
    'f': [1, 1, 0, 1, 1, np.nan],                   # two values plus a gap
    'g': [np.nan] * 6,                              # entirely missing
}

df = pd.DataFrame(data=dct)

In [91]:
# Show the whole toy frame; at six rows it is small enough to display in full.
df

Unnamed: 0,a,b,c,d,e,f,g
0,10,2020-01-04,1.0,a,1,1.0,
1,2,,,b,1,1.0,
2,43,2020-01-03,0.1,c,1,0.0,
3,4,2020-01-01,0.4,a,1,1.0,
4,5,2020-01-02,0.6,b,1,1.0,
5,6,2020-01-01,1.5,c,1,,


In [93]:
# Column 'b' holds date strings plus one NaN; check that pandas still parses
# the whole column to a datetime64 dtype.
pd.to_datetime(df['b']).dtype

dtype('<M8[ns]')

In [9]:
# Sanity check: booleans cast cleanly to float (True -> 1.0, False -> 0.0),
# which is why bool columns can be treated as numeric.
pd.Series([True, False, True]).astype('float')

0    1.0
1    0.0
2    1.0
dtype: float64

In [57]:
def classify_feature_types(df, feats=None):
    """
    Infer a coarse type for each column by attempting ``Series.astype``.

    Each column is tried against the candidate dtypes below, in this order,
    and is labelled with the first one that it can be cast to:
        - numeric: int, float and bool columns, or anything else that can be
            successfully cast to float64 (an all-NaN column lands here too)
        - datetime: anything that pandas can successfully cast to datetime64
        - object: anything else, typically treated as a string

    Parameters
    ----------
    df : dataframe
    feats : list (optional)
        column names to classify; every column of ``df`` is used when this is
        omitted or empty

    Returns
    -------
    dict
        column name : inferred type (numeric, datetime or object)
    """
    def test_type(ser, _type):
        # A cast that cannot be performed is reported by pandas as either a
        # ValueError (e.g. unparseable strings) or a TypeError (e.g. a
        # datetime64 column asked to become float64). Both simply mean
        # "not this type", so neither should abort the classification.
        try:
            ser.astype(_type)
        except (ValueError, TypeError):
            return False
        return True

    # ``len`` rather than truthiness so that an Index or ndarray of column
    # names is accepted (their truth value is ambiguous and would raise).
    if feats is None or len(feats) == 0:
        feats = df.columns.tolist()

    _types = {}
    # Order matters: the first successful cast wins.
    type_mapping = [('numeric', 'float64'),
                    ('datetime', 'M8[us]'),
                    ('object', 'object')]
    for col in feats:
        for k, _type in type_mapping:
            if test_type(df[col], _type):
                _types[col] = k
                break
    return _types

In [58]:
# Classify every column of the toy frame. Note that the all-NaN column 'g'
# comes back as 'numeric': the float cast is tried first and NaN is a float.
classify_feature_types(df)

{'a': 'numeric',
 'b': 'datetime',
 'c': 'numeric',
 'd': 'object',
 'e': 'numeric',
 'f': 'numeric',
 'g': 'numeric'}

In [46]:
# Casting to str turns the missing entry into the literal text 'nan' instead
# of keeping it null.
df['b'].astype('str')

0    2020-01-04
1           nan
2    2020-01-03
3    2020-01-01
4    2020-01-02
5    2020-01-01
Name: b, dtype: object

In [47]:
# Hand-written column -> type labels for columns 'a' through 'e'.
mapping = dict(
    a='numeric',
    b='datetime',
    c='numeric',
    d='object',
    e='numeric',
)


In [59]:
def classify_value_counts(df, col, unique_thresh=0.05, type_mapping=None):
    """
    Label a column by how many distinct non-null values it holds.

    Parameters
    ----------
    df : dataframe
    col : str
        name of the column to classify
    unique_thresh : float or int
        cardinality cut-off applied to numeric columns; must be > 0. A value
        below 1.0 is read as a fraction of the row count (truncated to an
        integer); any other value is used as an absolute count.
    type_mapping : dict (optional)
        column name : inferred type (numeric, datetime or object). When
        omitted or empty it is inferred for ``col`` with
        ``classify_feature_types``.

    Returns
    -------
    str
        - 'null': no non-null values
        - 'uninformative': exactly one distinct value
        - 'binary': exactly two distinct values
        - 'continuous': numeric with more distinct values than the threshold
        - 'categorical': everything else

    Raises
    ------
    AssertionError
        if ``type_mapping`` contains an unrecognized type label, or if
        ``unique_thresh`` is not positive for a numeric column
    """
    n_unique = df[col].nunique(dropna=True)
    if n_unique == 0:
        return 'null'
    if n_unique == 1:
        return 'uninformative'
    if n_unique == 2:
        return 'binary'

    if not type_mapping:
        type_mapping = classify_feature_types(df[[col]])
    else:
        # Reject labels this function does not understand up front, rather
        # than silently treating them as non-numeric below.
        allowed_types = {'numeric', 'datetime', 'object'}
        diff = set(type_mapping.values()).difference(allowed_types)
        assert not diff, f'{diff} not recognized type mappings: {allowed_types}'

    if type_mapping[col] == 'numeric':
        assert unique_thresh > 0
        if unique_thresh < 1.0:
            # Fractional threshold: convert to a count relative to the rows.
            unique_thresh = int(unique_thresh * df.index.size)

        if n_unique > unique_thresh:
            return 'continuous'
    return 'categorical'
            


In [61]:
# Print the cardinality label of each column, one per line, in column order.
for col in df.columns:
    label = classify_value_counts(df, col)
    print(label)

continuous
categorical
continuous
categorical
uninformative
binary
null


In [37]:
# An all-NaN series leaves nothing to count once nulls are dropped; this is
# the empty result that classify_value_counts reports as 'null'.
pd.Series([np.nan, np.nan, np.nan]).dropna().value_counts()

Series([], dtype: int64)

In [None]:
# Scratch call with an absolute threshold: a value >= 1 is used as a count,
# not as a fraction of the rows. Assuming df is still the toy frame, column
# 'a' has 6 distinct values, so this is expected to return 'continuous'.
# NOTE(review): this cell has no execution count, i.e. it was never run.
col = 'a'
unique_thresh = 4
classify_value_counts(df, col, unique_thresh)

In [38]:
# NOTE(review): in the visible cells `val_counts` is only assigned inside
# classify_value_counts, so this presumably relies on a kernel variable left
# over from a deleted or unshown cell and would raise NameError on a fresh
# run — confirm or remove.
len(val_counts)

6

In [78]:
# Two label sets for exercising the validator: `a` contains 'categorical',
# which is not a recognized type label; `b` holds exactly the recognized ones
# (`b` is not used in the visible cells).
a = {'numeric', 'categorical'}
b = {'numeric', 'object', 'datetime'}

def _validate_mappings(lst):
    allowed_types = set([x[0] for x in type_mapping])
    diff = set(lst).difference(allowed_types)
    assert not diff, f'{diff} not recognized type mappings: {allowed_types}'

# Expected to fail: `a` contains 'categorical', which is not a recognized
# type label, so the assertion fires (see the recorded AssertionError).
_validate_mappings(list(a))

AssertionError: {'categorical'} not recognized type mappings: {'numeric', 'object', 'datetime'}

In [75]:
# An empty dict is falsy, so this assertion passes silently.
assert not {}

In [71]:
# Ordered (label, dtype) pairs pairing each type label with a dtype string.
type_mapping = [
    ('numeric', 'float64'),
    ('datetime', 'M8[us]'),
    ('object', 'object'),
]

# The distinct labels, i.e. the first element of every pair.
{label for label, _dtype in type_mapping}


{'datetime', 'numeric', 'object'}

In [79]:
# Random frame of four independent uniform columns for trying out the
# correlation helper. Every column needs its own key: a repeated key in a dict
# literal silently keeps only the last value, which would drop a column.
# NOTE: this rebinds `df`, replacing the toy frame built earlier.
df = pd.DataFrame({'a': np.random.sample(1000),
                   'b': np.random.sample(1000),
                   'c': np.random.sample(1000),
                   'd': np.random.sample(1000)})

In [94]:
# Example keyword arguments for DataFrame.corr. NOTE(review): get_correlates
# collects its own `**corr_kwargs`, which shadows this module-level dict, and
# the visible call passes method='spearman' directly, so this variable is not
# read anywhere in the visible cells.
corr_kwargs = {'method':'spearman'}
def get_correlates(df, thresh=0.9, feats=None, **corr_kwargs):
    """
    List the column pairs whose absolute correlation exceeds a threshold.

    Parameters
    ----------
    df : dataframe
    thresh : float
        a pair is kept when its absolute correlation is strictly greater
        than this value
    feats : list (optional)
        columns to correlate; when omitted or empty, the numeric and boolean
        columns of ``df`` are used
    **corr_kwargs
        forwarded to ``DataFrame.corr`` (e.g. ``method='spearman'``)

    Returns
    -------
    Series
        absolute correlations indexed by (column, column) pairs, each
        unordered pair listed once, sorted from strongest to weakest
    """
    if feats is None or len(feats) == 0:
        # Quickly and non-exhaustively drop string, datetime and other
        # non-numeric columns, which cannot be correlated.
        feats = df.select_dtypes(include=['number', 'bool']).columns.tolist()

    corr_matrix = df[feats].corr(**corr_kwargs).abs()
    # Keep only the strict upper triangle so that each pair is reported once
    # and the diagonal (a column against itself) is excluded.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    corr_pairs = (corr_matrix.where(upper_mask)
                             .stack()
                             .sort_values(ascending=False))
    # NaN entries (masked cells, undefined correlations) compare False here
    # and are therefore dropped along with the weak pairs.
    return corr_pairs[corr_pairs > thresh]


In [95]:
# NOTE(review): the recorded output below is indexed by a/c/f, which matches
# the toy frame from the first cells rather than the random frame built just
# above — presumably the cells were executed out of order. Re-run from a fresh
# kernel to confirm which frame this result belongs to.
get_correlates(df, thresh=0.01, feats=None, method='spearman')

c  f    0.774597
a  f    0.707107
   c    0.100000
dtype: float64