In [6]:
import pandas as pd
import numpy as np

from scipy.stats import norm, rankdata

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
# Suppress every warning for the rest of the session. This also hides library
# deprecation/performance warnings, so turn it off when debugging.
warnings.filterwarnings('ignore')
# Seed NumPy's legacy global random state.
np.random.seed(123)

import utils

In [2]:
# Input CSV locations, relative to the notebook's working directory.
train_file = '../data/train.csv'
test_file = '../data/test.csv'

# Read the training data and drop the 'ID_code' column.
df_train = pd.read_csv(train_file).drop(columns=['ID_code'])

# pop() removes 'target' from df_train and returns it as a Series.
target = df_train.pop('target')

# Read the test data the same way (no 'target' column is popped here).
df_test = pd.read_csv(test_file).drop(columns=['ID_code'])

In [8]:
def count_negative(series):
    """Return how many entries of ``series`` are less than or equal to zero.

    Despite the name, zeros are included in the count. NaN compares as False
    against 0 and is therefore never counted.

    A closed-range count could be obtained with
    ``series.between(left, right).sum()`` instead; that variant is not used.
    """
    non_positive = series <= 0
    return non_positive.sum()

def row_features(df):
    """Append per-row summary statistics to ``df`` in place and return it.

    Every statistic is computed from the columns that ``df`` has on entry.
    Columns added by this function never feed into later statistics.

    Added columns, in order: ``sum``, ``min``, ``max``, ``mean``, ``std``,
    ``skew``, ``kurt``, ``med``, ``negative_count`` (values ``<= 0``) and
    ``positive_count`` (values ``> 0``). NaN cells fall in neither count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric columns. It is mutated: the new columns are appended.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Snapshot the input columns once. Computing from ``df`` itself after each
    # assignment would let e.g. 'sum' leak into 'min', 'mean', 'std', ...
    base = df[list(df.columns)]

    df['sum']  = base.sum(axis=1)
    df['min']  = base.min(axis=1)
    df['max']  = base.max(axis=1)
    df['mean'] = base.mean(axis=1)
    df['std']  = base.std(axis=1)
    df['skew'] = base.skew(axis=1)
    df['kurt'] = base.kurtosis(axis=1)
    df['med']  = base.median(axis=1)

    # Vectorized sign counts over the original columns only.
    df["negative_count"] = (base <= 0).sum(axis=1)
    df["positive_count"] = (base > 0).sum(axis=1)
    return df


def col_features(df):
    """Append seven derived columns for each column ``df`` has on entry.

    For an input column ``c`` the following are added, in this order:

    * ``sq_c``   - the value squared
    * ``r_c``    - rank of the value within the column (float32)
    * ``n_c``    - standard normal CDF of the value (float32)
    * ``mean_c`` - column mean minus the value
    * ``z_c``    - z-score using the population std (``ddof=0``)
    * ``sqrt_c`` - fourth root of the square, i.e. sqrt(|value|)
    * ``log_c``  - ``log(value**2 + 10) / 2``

    ``df`` is mutated in place and the same object is returned.

    NOTE(review): ``norm.cdf`` is applied to the raw values, not to the
    z-scores; confirm that this is the intended "normal percentile".
    """
    # Freeze the column list so the columns added below are not expanded too.
    for name in list(df.columns):
        values = df[name]
        squared = values * values
        center = values.mean()

        df['sq_' + name] = squared
        df['r_' + name] = rankdata(values).astype('float32')
        df['n_' + name] = norm.cdf(values).astype('float32')
        df['mean_' + name] = center - values
        df['z_' + name] = (values - center) / values.std(ddof=0)
        df['sqrt_' + name] = squared ** (1 / 4)
        df['log_' + name] = np.log(squared + 10) / 2

    return df

In [7]:
# Append the row-wise summary columns to both frames. row_features mutates its
# argument and returns the same frame, so rebinding keeps the names unchanged.
df_train = row_features(df_train)
df_test  = row_features(df_test)

In [9]:
# Persist both frames via the project helper.
# NOTE(review): presumably writes each frame as several pickle chunks under the
# given path (the progress output shows 3 iterations) - confirm in utils.to_pickles.
utils.to_pickles(df_train, '../data/train')
utils.to_pickles(df_test, '../data/test')

3it [00:00,  4.14it/s]
3it [00:00,  4.02it/s]


In [3]:
# Reload both frames from the paths written by utils.to_pickles above.
# NOTE(review): presumably this lets a fresh kernel skip the row-feature step;
# confirm that utils.read_pickles restores the same columns and row order.
df_train = utils.read_pickles('../data/train')
df_test  = utils.read_pickles('../data/test')

100%|██████████| 3/3 [00:04<00:00,  1.51s/it]
100%|██████████| 3/3 [00:04<00:00,  1.38s/it]


In [9]:
# Expand every column currently in each frame with col_features' derived
# columns. col_features mutates its argument and returns the same frame.
df_train = col_features(df_train)
df_test  = col_features(df_test)

In [10]:
# Persist the expanded frames to the same paths used for the earlier save.
# NOTE(review): this presumably overwrites the earlier pickles - confirm in
# utils.to_pickles whether existing files are replaced.
utils.to_pickles(df_train, '../data/train')
utils.to_pickles(df_test, '../data/test')

3it [00:26,  8.80s/it]
3it [00:23,  8.45s/it]
