In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import string
import tweedie
#from pandas_prof
import importlib
import os, sys
from pathlib import Path
#sys.path.append(Path(os.getcwd()).parents[1])
import dsutils
from feature_engine import categorical_encoders as ce
from feature_engine import discretisers as dsc
from feature_engine import missing_data_imputers as mdi
from sklearn.pipeline import Pipeline as pipe

In [2]:
from dsutils.dsutils.transformers import *
#from dsutils.dsutils.MissingHandler import MissingHandler
from dsutils.dsutils.helpers import histograms
from dsutils.dsutils.pipeline import Pipeline

In [3]:
# Synthetic demo data: one datetime column (V), two numeric columns (W, Y) and
# two single-letter categorical columns (X, Z), with cells blanked at random.
n = 1000
np.random.seed(12345)

# Sampling window for V, expressed in whole seconds
# (Timestamp.value is in nanoseconds, hence the division by 10**9).
start = pd.to_datetime('2015-01-01')
end = pd.to_datetime('2018-01-01')
start_u = start.value // 10**9
end_u = end.value // 10**9

letters = list(string.ascii_lowercase)

# The draws must stay in V, W, X, Y, Z order so the seeded random stream
# produces the same values for each column.
v_seconds = np.random.randint(start_u, end_u, n, dtype=np.int64)
w_values = np.random.choice(range(10), n).astype(float)
x_values = np.random.choice(letters, n)
y_values = np.random.normal(size=n)
z_values = np.random.choice(letters, n)

df = pd.DataFrame({
    'V': pd.DatetimeIndex(10**9 * v_seconds),  # seconds -> nanoseconds
    'W': w_values,
    'X': x_values,
    'Y': y_values,
    'Z': z_values,
})

# Blank out each cell independently with probability 0.2.
na_mask = np.random.binomial(n=1, p=0.2, size=df.shape).astype(bool)
df[na_mask] = np.nan

# Ordered (unshuffled) split: first 80% of rows for training, the rest for test.
p_trn = 0.8
n_train = int(n * p_trn)
df_train = df.iloc[:n_train, :]
df_test = df.iloc[n_train:, :]

df_train.head(10)

Unnamed: 0,V,W,X,Y,Z
0,NaT,7.0,w,,h
1,2015-07-12 15:43:25,9.0,b,,
2,2015-10-11 17:04:01,3.0,,-0.549671,k
3,NaT,4.0,i,,e
4,2017-04-28 01:42:22,4.0,k,-1.276761,b
5,2016-11-23 18:27:10,9.0,b,,
6,2015-09-17 17:41:57,,f,1.108227,l
7,2015-02-11 03:44:34,2.0,j,-1.751994,f
8,2015-03-31 05:09:49,0.0,b,,f
9,2017-07-24 01:10:25,0.0,,,g


In [4]:
# Summary statistics; only the float columns (W and Y) are reported.
df.describe()

Unnamed: 0,W,Y
count,791.0,781.0
mean,4.522124,-0.011962
std,2.862981,0.998696
min,0.0,-3.184377
25%,2.0,-0.666243
50%,4.0,-0.023674
75%,7.0,0.680251
max,9.0,2.523108


In [5]:
# Column label -> dtype mapping, to check how each generated column was typed.
df.dtypes.to_dict()

{'V': dtype('<M8[ns]'),
 'W': dtype('float64'),
 'X': dtype('O'),
 'Y': dtype('float64'),
 'Z': dtype('O')}

In [6]:
# Preprocessing steps, applied in the listed order. The transformers come from
# dsutils and their internals are not shown here; the per-step notes are read
# off the transformed output displayed by the next cell.
preprocessing_steps = [
    # Adds the W_NA_IND / Y_NA_IND flag columns.
    ("num_nan_ind", MissingIndicator(x=['W', 'Y'])),
    # Missing X / Z values show up as "_MISSING_" afterwards.
    ("fill_cat_nas", ReplaceMissingCategorical(x=['X', 'Z'])),
    # Presumably folds infrequent X levels into "_OTHER_" — TODO confirm.
    ("pcb", PercentThresholdBinner(x='X', percent_threshold=0.03)),
    # Presumably caps Z at 15 levels, the rest becoming "_OTHER_" — TODO confirm.
    ("max_level_bin", MaxLevelBinner(x='Z', max_levels=15)),
    # Missing W / Y values are replaced by a single constant per column.
    ("rmmean", ReplaceMissingMean(x=['W', 'Y'])),
    # Adds V_YEAR / V_MONTH / V_DAY.
    ("date_comp", DateComponents(x="V")),
]
p = Pipeline(preprocessing_steps)

In [7]:
# Fit the pipeline on the training split and preview the first 10 transformed rows.
p.fit_transform(df_train).head(10)

Unnamed: 0,V,W,X,Y,Z,W_NA_IND,Y_NA_IND,V_YEAR,V_MONTH,V_DAY
0,NaT,7.0,w,-0.010568,h,0,1,,,
1,2015-07-12 15:43:25,9.0,b,-0.010568,_MISSING_,0,1,2015.0,7.0,12.0
2,2015-10-11 17:04:01,3.0,_MISSING_,-0.549671,k,0,0,2015.0,10.0,11.0
3,NaT,4.0,i,-0.010568,e,0,1,,,
4,2017-04-28 01:42:22,4.0,_OTHER_,-1.276761,b,0,0,2017.0,4.0,28.0
5,2016-11-23 18:27:10,9.0,b,-0.010568,_MISSING_,0,1,2016.0,11.0,23.0
6,2015-09-17 17:41:57,4.459459,f,1.108227,l,1,0,2015.0,9.0,17.0
7,2015-02-11 03:44:34,2.0,_OTHER_,-1.751994,f,0,0,2015.0,2.0,11.0
8,2015-03-31 05:09:49,0.0,b,-0.010568,f,0,1,2015.0,3.0,31.0
9,2017-07-24 01:10:25,0.0,_MISSING_,-0.010568,_OTHER_,0,1,2017.0,7.0,24.0


In [8]:
# Same idea using sklearn's Pipeline (imported as `pipe`), mixing dsutils
# transformers with feature_engine's AddMissingIndicator.
indicator_steps = [
    ("num_nan_ind", MissingIndicator(x=['W', 'Y'])),
    ("missing_inds", mdi.AddMissingIndicator(variables=['X', 'Z'])),
    ("date_comp", DateComponents(x="V")),
]
pipe_def = pipe(indicator_steps)

In [9]:
# Fit and apply the sklearn pipeline to the full frame (no train/test split here).
pipe_def.fit_transform(df)

Unnamed: 0,V,W,X,Y,Z,W_NA_IND,Y_NA_IND,X_na,Z_na,V_YEAR,V_MONTH,V_DAY
0,NaT,7.0,w,,h,0,1,0,0,,,
1,2015-07-12 15:43:25,9.0,b,,,0,1,0,1,2015.0,7.0,12.0
2,2015-10-11 17:04:01,3.0,,-0.549671,k,0,0,1,0,2015.0,10.0,11.0
3,NaT,4.0,i,,e,0,1,0,0,,,
4,2017-04-28 01:42:22,4.0,k,-1.276761,b,0,0,0,0,2017.0,4.0,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,2015-03-21 01:28:43,7.0,,-0.294869,f,0,0,1,0,2015.0,3.0,21.0
996,2017-08-15 17:18:49,0.0,x,-0.200316,a,0,0,0,0,2017.0,8.0,15.0
997,NaT,,j,1.017884,k,1,0,0,0,,,
998,2017-04-22 16:46:22,4.0,i,-1.377053,,0,0,0,1,2017.0,4.0,22.0


In [7]:



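# Scratch: datetime parts of V are available through the .dt accessor.
# The saved output below matches the hour component (.dt.hour), which was
# presumably the live line when this cell was last run.
# df.V.dt.year
# df.V.dt.month
# df.V.dt.day
# df.V.dt.hour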

0       NaN
1      15.0
2      17.0
3       NaN
4       1.0
       ... 
995     1.0
996    17.0
997     NaN
998    16.0
999    10.0
Name: V, Length: 1000, dtype: float64

In [22]:
# Minute component of V; rows where V is NaT come back as NaN, hence the float dtype.
# NOTE(review): df.dtypes already reports V as datetime64[ns], so the astype
# looks redundant — confirm before removing.
df.V.astype('datetime64[ns]').dt.minute

0       NaN
1      43.0
2       4.0
3       NaN
4      42.0
       ... 
995    28.0
996    18.0
997     NaN
998    46.0
999    55.0
Name: V, Length: 1000, dtype: float64

In [6]:
# Apply the already-fitted pipeline to the held-out rows and preview the first 10.
# NOTE(review): the saved output below has no V / V_YEAR / V_MONTH / V_DAY
# columns, so it appears to predate the date_comp step — re-run to refresh.
p.transform(df_test).head(10)

Unnamed: 0,W,X,Y,Z,W_NA_IND,Y_NA_IND
800,0.0,_OTHER_,0.013768,a,0,1
801,0.0,y,-0.903354,s,0,0
802,5.0,y,0.184216,_OTHER_,0,0
803,7.0,n,0.013768,_MISSING_,0,1
804,8.0,_OTHER_,0.407492,y,0,0
805,5.0,t,0.812104,a,0,0
806,9.0,_MISSING_,0.66635,_OTHER_,0,0
807,8.0,_MISSING_,0.598134,e,0,0
808,2.0,_OTHER_,0.384338,_MISSING_,0,0
809,4.0,t,-0.086291,s,0,0


In [9]:
# Display the frame as it currently stands (pandas truncates the middle rows).
df

Unnamed: 0,V,W,X,Y,Z
0,NaT,7.0,w,,h
1,2015-07-12 15:43:25,9.0,b,,
2,2015-10-11 17:04:01,3.0,,-0.549671,k
3,NaT,4.0,i,,e
4,2017-04-28 01:42:22,4.0,k,-1.276761,b
...,...,...,...,...,...
995,2015-03-21 01:28:43,7.0,,-0.294869,f
996,2017-08-15 17:18:49,0.0,x,-0.200316,a
997,NaT,,j,1.017884,k
998,2017-04-22 16:46:22,4.0,i,-1.377053,


In [36]:
def col_types(df):
    """Map each column label of ``df`` to whether its dtype is numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    dict
        ``{column_label: bool}``, where each value is the result of
        ``pd.api.types.is_numeric_dtype`` for that column's dtype.
        An empty frame yields an empty dict.
    """
    # The mapping used to be built and then discarded, so the function always
    # returned None; return it so callers can use the result.
    return {
        col: pd.api.types.is_numeric_dtype(dtype)
        for col, dtype in df.dtypes.items()
    }


# For every column, print its label and, on the next line, whether its dtype is numeric.
for name in df.columns:
    numeric_flag = pd.api.types.is_numeric_dtype(df.loc[:, name])
    print(name, numeric_flag, sep="\n")

x
False
y
True
z
True


In [40]:
# Per-column non-null counts and dtypes.
# NOTE(review): the saved output below lists columns x, y, z over 100 rows,
# which does not match the 1000-row V/W/X/Y/Z frame built at the top of this
# notebook — presumably stale; re-run to refresh.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       78 non-null     object 
 1   y       82 non-null     float64
 2   z       81 non-null     float64
dtypes: float64(2), object(1)
memory usage: 2.5+ KB


In [95]:
# Column label -> dtype lookup table, read by the next cell.
d = df.dtypes.to_dict()

In [99]:
# is_numeric_dtype accepts a dtype object as well as a column.
# Look the dtype up by the actual column label: the frame built in this
# notebook names its columns 'V', 'W', 'X', 'Y', 'Z', so the lower-case key
# 'w' raises KeyError when the notebook is run top to bottom.
pd.api.types.is_numeric_dtype(d['W'])

True

In [38]:
# Summary statistics for the numeric columns.
# NOTE(review): the saved output below reports columns y and z, which the
# V/W/X/Y/Z frame built at the top of this notebook does not have —
# presumably stale; re-run to refresh.
df.describe()

Unnamed: 0,y,z
count,82.0,81.0
mean,0.115844,7.163064
std,1.047992,18.26916
min,-2.979359,0.0
25%,-0.455898,0.0
50%,0.048757,0.0
75%,0.837086,0.867553
max,2.613154,92.670684
