In [139]:
import pandas as pd
import numpy as np

from shelby import basics as sbasics
from shelby import pull_push_data as sdata
from shelby import data_cleaning as scleaning
from shelby import data_preparation as sprep
from shelby import modeling as smodeling

from sklearn.pipeline import Pipeline

import warnings
# Suppress every Python warning for the rest of the session so cell outputs stay free of warning noise.
# NOTE(review): this also hides deprecation warnings from the libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

import os
# NOTE(review): presumably the usual workaround for the "duplicate OpenMP runtime" initialization
# error — confirm it is still needed, and that setting it after the imports above is early enough.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

## Basic operations


**Using:**
- pull_push_data
    - load data from csv
    
- basics
    - separate columns by types
    - stack dfs
    
- data_cleaning
    - correct the final df's column types
    
**Steps:**
- Load data
- Stack train and test
- Separate columns and correct types of full_df

_Load data_

In [140]:
# Load the train and test CSV files through the project helper,
# passing 'Id' as the index column for both.
df_train, df_test = sdata.read_data(
    train_path='./data/train.csv',
    test_path='./data/test.csv',
    index_col='Id',
)

_Stack train and test_

In [141]:
# Stack train and test into a single frame so the preprocessing below is
# applied to both at once. The two extra return values are handed to
# sprep.ArraysExtractor in the pipeline, whose job is to split the data
# back into train and test.
full_df, target_col, test_start_index = sbasics.train_test_stack(
    df_train=df_train,
    df_test=df_test,
    target_col_name='SalePrice',
)

_Separate columns and correct types of the final df_

In [142]:
# Partition the columns of full_df into categorical and numerical groups.
# NOTE(review): unique_thresh=20 is presumably the unique-value count that
# decides categorical vs. numerical — confirm against sbasics.separate_cols.
cat_cols, num_cols = sbasics.separate_cols(
    full_df,
    unique_thresh=20,
    return_probably_cat=False,
)

# Correct the column dtypes of full_df according to that partition.
full_df = scleaning.TypesCorrector(
    num_cols=num_cols,
    cat_cols=cat_cols,
).fit_transform(full_df)

## Data preprocessing
**Using:**
- data_cleaning
    - filling NaNs
    
- data_preparation
    - remove skew from skewed num_features
    - get dummies for categorical columns
    - get array representation of data and split full_df back to train and test
    
**Steps:**
- Init sklearn.pipeline.Pipeline
- Apply pipeline to full_df
- Log-transformation of the target variable

_Init sklearn.pipeline.Pipeline_

In [143]:
# Preprocessing steps, executed in the order listed. Each step's name states
# its purpose; the final step turns the frame into the arrays used below.
preprocessing_pipeline = Pipeline(steps=[
    (
        'Fill NaNs in numerical columns with 0',
        scleaning.NumNanFiller(num_cols, method=0),
    ),
    (
        'Fill NaNs in categorical columns with NO VALUE',
        scleaning.CatNanFiller(cat_cols, method='indicator'),
    ),
    (
        'Remove Skew from numerical columns',
        sprep.SkewRemover(num_cols, method='log'),
    ),
    (
        'Generate dummies for categorical features',
        sprep.CatDummifier(cat_cols),
    ),
    (
        'Get data in array representation',
        sprep.ArraysExtractor(target_col, test_start_index),
    ),
])

_Apply it!_

In [144]:
# Fit every pipeline step on full_df and transform it in one pass; the result
# of the last step is unpacked into three arrays.
# NOTE(review): presumably X / y are the train features / target and X_finall
# the test features — the shapes printed further down (1460 and 1459 rows)
# are consistent with that.
X, y, X_finall = preprocessing_pipeline.fit_transform(full_df)

_Log-transformation of the target variable_

In [145]:
# Replace the target with log(1 + y).
# NOTE: y is overwritten in place, so running this cell a second time without
# re-running the pipeline cell applies the log transform twice.
# Values on this scale can be mapped back with np.expm1.
y = np.log1p(y)

## Check data
**Steps:**
- Check arrays (just a quick look)
- Check shapes 
- Check NaNs
- Check dtypes

_Check arrays_

In [146]:
# Quick look at X (NumPy abbreviates large arrays when printing).
print('X:\n\n', X)

X:

 [[6.500000e+01 9.042040e+00 2.003000e+03 ... 1.000000e+00 0.000000e+00
  0.000000e+00]
 [8.000000e+01 9.169622e+00 1.976000e+03 ... 0.000000e+00 0.000000e+00
  0.000000e+00]
 [6.800000e+01 9.328213e+00 2.001000e+03 ... 1.000000e+00 0.000000e+00
  0.000000e+00]
 ...
 [6.600000e+01 9.109746e+00 1.941000e+03 ... 0.000000e+00 0.000000e+00
  1.000000e+00]
 [6.800000e+01 9.181735e+00 1.950000e+03 ... 0.000000e+00 0.000000e+00
  1.000000e+00]
 [7.500000e+01 9.204121e+00 1.965000e+03 ... 1.000000e+00 0.000000e+00
  0.000000e+00]]


In [147]:
# Quick look at y, already log-transformed at this point.
print('y:\n\n', y)

y:

 [12.24769912 12.10901644 12.31717117 ... 12.49313327 11.86446927
 11.90159023]


In [148]:
# Quick look at X_finall (NumPy abbreviates large arrays when printing).
print('X_finall:\n\n', X_finall)

X_finall:

 [[8.000000e+01 9.360742e+00 1.961000e+03 ... 0.000000e+00 0.000000e+00
  1.000000e+00]
 [8.100000e+01 9.565775e+00 1.958000e+03 ... 0.000000e+00 0.000000e+00
  1.000000e+00]
 [7.400000e+01 9.534668e+00 1.997000e+03 ... 0.000000e+00 0.000000e+00
  1.000000e+00]
 ...
 [1.600000e+02 9.903538e+00 1.960000e+03 ... 0.000000e+00 0.000000e+00
  0.000000e+00]
 [6.200000e+01 9.253592e+00 1.992000e+03 ... 0.000000e+00 0.000000e+00
  0.000000e+00]
 [7.400000e+01 9.172431e+00 1.993000e+03 ... 0.000000e+00 0.000000e+00
  0.000000e+00]]


_Check shapes_

In [149]:
# Show the shapes of the source frames next to the arrays derived from them.
print(f'df_train shape: {df_train.shape}\nX shape: {X.shape}')
print(f'y shape: {y.shape}\n----')
print(f'df_test shape: {df_test.shape}\nX_finall shape: {X_finall.shape}\n')

# Enforce the invariants the printout is meant to confirm, so a mismatch
# stops a "Restart & Run All" instead of relying on someone reading the output.
assert X.shape[0] == df_train.shape[0], 'X and df_train differ in row count'
assert y.shape[0] == X.shape[0], 'y and X differ in row count'
assert X_finall.shape[0] == df_test.shape[0], 'X_finall and df_test differ in row count'
assert X.shape[1] == X_finall.shape[1], 'X and X_finall differ in feature count'


df_train shape: (1460, 80)
X shape: (1460, 359)
y shape: (1460,)
----
df_test shape: (1459, 79)
X_finall shape: (1459, 359)



_Check NaNs_

In [150]:
# Report whether any of the arrays still contains NaNs after preprocessing.
print(f'X contain NaNs: {np.isnan(X).any()}')
print(f'y contain NaNs: {np.isnan(y).any()}')
print(f'X_finall contain NaNs: {np.isnan(X_finall).any()}')

# Fail fast instead of relying on the printout being read: the pipeline fills
# NaNs, so any NaN left here means a preprocessing step did not do its job.
assert not np.isnan(X).any(), 'X contains NaNs after preprocessing'
assert not np.isnan(y).any(), 'y contains NaNs after preprocessing'
assert not np.isnan(X_finall).any(), 'X_finall contains NaNs after preprocessing'

X contain NaNs: False
y contain NaNs: False
X_finall contain NaNs: False


_Check dtypes_

In [151]:
# Report the dtype of each array, one line per array.
print('\n'.join(
    f'{label} dtype = {array.dtype}'
    for label, array in (('X', X), ('y', y), ('X_finall', X_finall))
))

X dtype = float32
y dtype = float64
X_finall dtype = float32
