In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import string
import tweedie
import datetime
#from pandas_prof
import importlib
import os, sys
from pathlib import Path
#sys.path.append(Path(os.getcwd()).parents[1])
import dsutils
from feature_engine import categorical_encoders as ce
from feature_engine import discretisers as dsc
from feature_engine import missing_data_imputers as mdi
from feature_engine import feature_selection as fs
from sklearn.pipeline import Pipeline as pipe
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse

In [2]:
from dsutils.dsutils.transformers import *
#from dsutils.dsutils.MissingHandler import MissingHandler
from dsutils.dsutils.utils import histograms
from dsutils.dsutils.pipeline import Pipeline

In [3]:
# Build a reproducible synthetic data set with mixed column types, blank out
# roughly 20% of its cells, and split it into train / test partitions.
n = 1000
np.random.seed(12345)

# Sampling window for V1, kept both as Timestamps and as whole seconds
# (Timestamp.value is an integer count of nanoseconds).
start = pd.to_datetime('2015-01-01')
end = pd.to_datetime('2018-01-01')
start_u = start.value // 10**9
end_u = end.value // 10**9

# The draws below all consume the global NumPy random stream, so their order
# (V1, W, X, Y, Z, V2 offsets, NA mask) must not change.
df = pd.DataFrame({
    # Random second-resolution timestamps inside the window, stored as strings.
    'V1': pd.DatetimeIndex(
        10**9 * np.random.randint(start_u, end_u, n, dtype=np.int64)
    ).astype('str'),
    # Integers 0..9 stored as floats.
    'W': np.random.choice(range(10), n).astype(float),
    # Single lowercase letters.
    'X': np.random.choice(list(string.ascii_lowercase), n),
    # Standard-normal noise.
    'Y': np.random.normal(size=n),
    # Single lowercase letters (independent of X).
    'Z': np.random.choice(list(string.ascii_lowercase), n),
})

# V2 = V1 shifted by a random number of days in [-5, 100), kept as objects.
day_offsets = 100 * np.random.uniform(low=-0.05, high=1, size=n)
v2_shift = pd.Series([datetime.timedelta(days=d) for d in day_offsets])
df['V2'] = (pd.to_datetime(df['V1']) + v2_shift).astype('object')

# Each cell (V2 included) is independently set to missing with probability 0.2.
na_mask = np.random.binomial(n=1, p=0.2, size=df.shape).astype(bool)
df[na_mask] = np.nan

# Positional split: the first 80% of rows train, the remainder test.
p_trn = 0.8
n_train = int(n * p_trn)
df_train = df.iloc[:n_train, :]
df_test = df.iloc[n_train:, :]

df_train.head(10)

Unnamed: 0,V1,W,X,Y,Z,V2
0,,7.0,w,,h,2017-04-24 17:07:23.900775
1,2015-07-12 15:43:25,,b,-0.208885,,
2,2015-10-11 17:04:01,3.0,,-0.549671,k,
3,2016-02-28 00:13:29,4.0,,-1.25288,,2016-05-26 09:25:13.153521
4,2017-04-28 01:42:22,4.0,k,-1.276761,b,2017-06-10 15:48:14.022613
5,2016-11-23 18:27:10,9.0,b,1.881156,x,2017-01-07 20:18:12.646987
6,2015-09-17 17:41:57,,f,1.108227,l,2015-11-14 01:13:47.159438
7,2015-02-11 03:44:34,2.0,j,-1.751994,f,
8,2015-03-31 05:09:49,0.0,,-0.973899,,
9,2017-07-24 01:10:25,0.0,c,0.908732,g,2017-10-16 19:00:58.317513


In [4]:
# Column label -> dtype lookup for the current frame; displayed as the cell output.
dtype_dict = dict(df.dtypes.items())
dtype_dict

{'V1': dtype('O'),
 'W': dtype('float64'),
 'X': dtype('O'),
 'Y': dtype('float64'),
 'Z': dtype('O'),
 'V2': dtype('O')}

In [5]:
from dsutils.dsutils.utils.formatters import datetime_tester
        
# Keep the column names whose datetime_tester result has a datetime64 dtype.
# NOTE(review): datetime_tester is project code; this presumably returns a
# mapping of column name -> converted values (or dtype) -- confirm.
potential_dts = [
    col_name
    for col_name, tested in datetime_tester(df).items()
    if pd.api.types.is_datetime64_any_dtype(tested)
]

In [6]:
# Preprocessing pipeline assembled from the project's dsutils Pipeline and
# transformers, with feature_engine's DropFeatures as the last step.
# Steps are listed in the order they are handed to Pipeline.
# NOTE(review): the behaviour of each dsutils transformer is inferred from its
# name only -- confirm against the dsutils sources.
p = Pipeline(
    [
        ("num_nan_ind", MissingIndicator(x=['W', 'Y'])),
        ("fill_cat_nas", ReplaceMissingCategorical(x=['X', 'Z'])),
        ("pcb", PercentThresholdBinner(x='X', percent_threshold=0.03)),
        ("max_level_bin", MaxLevelBinner(x='Z', max_levels=15)),
        ("rmmean", ReplaceMissingMean(x=['W', 'Y'])),
        ("date_comp", DateComponents(x=["V1", "V2"])),
        ("drop_date", fs.DropFeatures(features_to_drop=['V1', 'V2'])),
    ]
)

In [7]:
# Fit the pipeline on the training partition and preview the transformed rows.
# NOTE(review): the recorded output of this cell is
# "AttributeError: Can only use .dt accessor with datetimelike values".
# V1 and V2 are object-dtype columns (see the dtype_dict output) and both are
# passed to the "date_comp" step -- presumably DateComponents applies the .dt
# accessor without parsing them first; confirm and convert before fitting.
p.fit_transform(df_train).head(10)

AttributeError: Can only use .dt accessor with datetimelike values

In [41]:
# scikit-learn pipeline mixing feature_engine steps with the project's dsutils
# transformers, ending with one-hot encoding of X and Z.
# NOTE(review): the date steps refer to a column "V" (plus V_YEAR / V_MONTH /
# V_DAY), whereas the df built in cell In [3] has V1 and V2 -- confirm which
# frame this pipeline is meant to run on.
pipe_def = pipe(
    [
        ("num_nan_ind", mdi.AddMissingIndicator(variables=['W', 'Y'])),
        ("fill_cat_nas", ReplaceMissingCategorical(x=['X', 'Z'])),
        ("pcb", PercentThresholdBinner(x='X', percent_threshold=0.03)),
        ("max_level_bin", MaxLevelBinner(x='Z', max_levels=15)),
        (
            "rmmean",
            mdi.MeanMedianImputer(imputation_method='mean', variables=['W', 'Y']),
        ),
        ("date_comp", DateComponents(x="V")),
        (
            "fill_missing_year",
            mdi.ArbitraryNumberImputer(
                arbitrary_number=-1,
                variables=['V_YEAR', 'V_MONTH', 'V_DAY'],
            ),
        ),
        ("drop_date", fs.DropFeatures(features_to_drop=['V'])),
        ("onehot", ce.OneHotCategoricalEncoder(variables=['X', 'Z'])),
    ]
)

In [46]:
# Size in bytes of the fitted-and-transformed frame held as a dense NumPy array.
# NOTE(review): fit_transform re-fits pipe_def on the full df every time this cell runs.
pipe_def.fit_transform(df).to_numpy().nbytes

344000

In [49]:
# Bytes taken by the stored values of the same matrix once converted to CSR.
# NOTE(review): .data holds only the stored values; a CSR matrix also keeps
# .indices and .indptr arrays, so this figure understates the total sparse
# footprint when set against the dense nbytes number.
sparse.csr_matrix(pipe_def.fit_transform(df).to_numpy()).data.nbytes

58832

In [32]:
# Preview the columns that are still object-dtype after the pipeline transform.
# NOTE(review): this ran as In [32], before the pipe_def definition cell was
# last executed (In [41]); the recorded X / Z output presumably comes from an
# earlier version of the pipeline -- re-run in order to confirm.
pipe_def.transform(df).select_dtypes('object').head()

Unnamed: 0,X,Z
0,w,h
1,b,_MISSING_
2,_MISSING_,k
3,i,_OTHER_
4,_OTHER_,b


In [30]:
# Fit scikit-learn's OneHotEncoder on the object-dtype columns of the transformed frame.
ohe = OneHotEncoder().fit(pipe_def.transform(df).select_dtypes('object'))

In [33]:
# List the generated one-hot column names, prefixed with the given feature names.
# NOTE(review): OneHotEncoder.get_feature_names was deprecated in scikit-learn 1.0
# and removed in 1.2 in favour of get_feature_names_out -- switch when the
# environment is upgraded.
ohe.get_feature_names(['X','Z'])

array(['X__MISSING_', 'X__OTHER_', 'X_b', 'X_d', 'X_e', 'X_f', 'X_g',
       'X_h', 'X_i', 'X_l', 'X_n', 'X_o', 'X_p', 'X_r', 'X_s', 'X_u',
       'X_w', 'X_x', 'X_y', 'X_z', 'Z__MISSING_', 'Z__OTHER_', 'Z_a',
       'Z_b', 'Z_d', 'Z_f', 'Z_h', 'Z_k', 'Z_l', 'Z_m', 'Z_r', 'Z_s',
       'Z_t', 'Z_u', 'Z_w', 'Z_x'], dtype=object)

In [11]:
# Small single-column frame with repeated categories and two missing values.
pd.DataFrame(
    {'x':['a','a','b','c','b','a',np.nan,'a',np.nan]}
    )


# df.V.dt.year
# df.V.dt.month
# df.V.dt.day
# df.V.dt.hour

Unnamed: 0,x
0,a
1,a
2,b
3,c
4,b
5,a
6,
7,a
8,


In [22]:
# Minute component of column V after casting it to datetime64[ns].
# NOTE(review): the df built in cell In [3] has V1 / V2 but no V column; this
# cell depends on a df from kernel state that no cell visible here creates.
df.V.astype('datetime64[ns]').dt.minute

0       NaN
1      43.0
2       4.0
3       NaN
4      42.0
       ... 
995    28.0
996    18.0
997     NaN
998    46.0
999    55.0
Name: V, Length: 1000, dtype: float64

In [6]:
# Apply the already-fitted pipeline to the held-out partition and preview it.
# NOTE(review): the recorded output has no date-derived columns, while p as
# defined above includes a "date_comp" step and its recorded fit raised an
# AttributeError -- this output presumably comes from an earlier version of p.
p.transform(df_test).head(10)

Unnamed: 0,W,X,Y,Z,W_NA_IND,Y_NA_IND
800,0.0,_OTHER_,0.013768,a,0,1
801,0.0,y,-0.903354,s,0,0
802,5.0,y,0.184216,_OTHER_,0,0
803,7.0,n,0.013768,_MISSING_,0,1
804,8.0,_OTHER_,0.407492,y,0,0
805,5.0,t,0.812104,a,0,0
806,9.0,_MISSING_,0.66635,_OTHER_,0,0
807,8.0,_MISSING_,0.598134,e,0,0
808,2.0,_OTHER_,0.384338,_MISSING_,0,0
809,4.0,t,-0.086291,s,0,0


In [9]:
# Display the whole frame.
# NOTE(review): the recorded output has a single V column rather than the
# V1 / V2 columns created in cell In [3], so it reflects a different df.
df

Unnamed: 0,V,W,X,Y,Z
0,NaT,7.0,w,,h
1,2015-07-12 15:43:25,9.0,b,,
2,2015-10-11 17:04:01,3.0,,-0.549671,k
3,NaT,4.0,i,,e
4,2017-04-28 01:42:22,4.0,k,-1.276761,b
...,...,...,...,...,...
995,2015-03-21 01:28:43,7.0,,-0.294869,f
996,2017-08-15 17:18:49,0.0,x,-0.200316,a
997,NaT,,j,1.017884,k
998,2017-04-22 16:46:22,4.0,i,-1.377053,


In [36]:
def col_types(df):
    """Report which columns of ``df`` hold numeric data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    dict
        Maps each column label to the result of
        ``pd.api.types.is_numeric_dtype`` for that column.
    """
    # The dict used to be built and then discarded, so the function returned None.
    return {col: pd.api.types.is_numeric_dtype(df.loc[:, col])
            for col in df.columns}


# Print each column label followed, on the next line, by whether it is numeric.
for col in df.columns:
    print(col, pd.api.types.is_numeric_dtype(df.loc[:, col]), sep="\n")

x
False
y
True
z
True


In [40]:
# Summarise column dtypes and non-null counts.
# NOTE(review): the recorded output describes a 100-row frame with columns
# x / y / z, not the 1000-row df created in cell In [3].
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       78 non-null     object 
 1   y       82 non-null     float64
 2   z       81 non-null     float64
dtypes: float64(2), object(1)
memory usage: 2.5+ KB


In [95]:
# Column label -> dtype mapping for whichever df is currently in the kernel.
d = df.dtypes.to_dict()

In [99]:
# Check whether the dtype recorded for column 'w' is numeric.
# NOTE(review): the key is lowercase 'w'; the df from cell In [3] uses 'W' and
# the frame shown by df.info() has only x / y / z, so this relies on yet
# another df from kernel state -- confirm which frame was intended.
pd.api.types.is_numeric_dtype(d['w'])

True

In [38]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output covers columns y / z of a 100-row frame,
# not the df created in cell In [3].
df.describe()

Unnamed: 0,y,z
count,82.0,81.0
mean,0.115844,7.163064
std,1.047992,18.26916
min,-2.979359,0.0
25%,-0.455898,0.0
50%,0.048757,0.0
75%,0.837086,0.867553
max,2.613154,92.670684
