# Part 3 - Data Prep

Reference course: <https://www.udemy.com/course/feature-engineering-for-machine-learning>

* Types and characteristics of data
* Missing data imputation
* Categorical encoding
* Variable transformation
* Discretization
* Outliers
* Datetime
* Scaling
* Feature creation

## Load Data

In [None]:
import pandas as pd

# Load the raw dataset and take a first look at its size, schema and first rows.
df = pd.read_csv('created_raw_data.csv')
print(df.shape)
# `DataFrame.info()` prints its summary itself and returns None, so it is
# called bare: wrapping it in print() emits a stray "None" after the summary.
df.info()
df.head()

(1010, 41)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 41 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   binary                 709 non-null    object 
 1   given_name             503 non-null    object 
 2   surname                808 non-null    object 
 3   date_of_birth          960 non-null    object 
 4   phone_number           960 non-null    object 
 5   email                  910 non-null    object 
 6   address                834 non-null    object 
 7   city                   705 non-null    object 
 8   state                  881 non-null    object 
 9   zipcode                700 non-null    float64
 10  duplicate_2            1010 non-null   float64
 11  random label num 3     505 non-null    object 
 12  multicollinearity 1    1010 non-null   float64
 13  constant_1             1010 non-null   object 
 14  correlated w target 2  505 non-null    float6

Unnamed: 0,binary,given_name,surname,date_of_birth,phone_number,email,address,city,state,zipcode,...,random choice 7,random label num 11,informative_1,uniform corr 1,pd qcut2,multicollinearity 3,standard scaling,multicollinearity 4,multicollinearity 2,class
0,binary_2,Troy,Allen,1996-08-01,974-794-0969x179,idillon@example.net,,,,,...,Sunday,label num hi 10,-0.999102,0.353397,,-0.30876,,-0.276196,-0.369239,1
1,,,Fischer,2017-09-18,+1-758-724-2680,johnsmith@example.org,,,,,...,Thursday,label num hi 2,1.246686,0.800222,Q3,-0.276773,,0.154751,-1.074681,1
2,binary_1,Shelley,Russell,1914-12-29,+1-226-526-4206x24792,gentrylaura@example.net,3920 Simmons Village Apt. 675,Johnfurt,CA,94315.0,...,Wednesday,label num hi 7,0.962777,0.760738,,-0.542515,61925.695824,-0.271426,0.350587,1
3,binary_2,,Conrad,1955-08-15,835.630.0141x7025,reeddeborah@example.org,320 Oneal Common,Williamchester,NJ,64638.0,...,Thursday,label num hi 1,-2.957441,0.597439,Q4,0.110135,51333.370521,0.433609,,1
4,binary_2,,,2001-10-24,458.266.1388,travis85@example.net,511 Charles View Apt. 670,,ND,64272.0,...,Tuesday,label num hi 10,1.141165,0.837511,Q3,-1.050837,53993.5403,-1.585647,0.406502,1


In [None]:
import preppy.utils as utils
from preppy.version import __version__

# Show the PrepPy version string imported from `preppy.version`.
print(__version__)

# Print the PrepPy data-prep report for the raw frame.
# NOTE(review): `thresh=.5` is forwarded to `write_report`; it presumably sets
# the quasi-constant cutoff used by the report -- confirm against PrepPy.
utils.report.write_report(df, thresh=.5)

PrepPy Version: 0.1.0
REPORT FOR DATA PREP

#################################################
Columns with Constant Values
#################################################
['constant_1', 'constant_2']

#################################################
Columns with Quasi-Constant Values
#################################################
['class', 'constant_1', 'constant_2', 'random choice 2']

#################################################
Duplicate Rows
#################################################
20

#################################################
Duplicate Columns
#################################################
['informative_2', 'constant_2', 'informative_1']

#################################################
Variables with Noticeably Higher Scales
#################################################
Features with Noticeably Higher Scales (Based on Standard Deviation):
zipcode             28647.576417
standard scaling     9847.664968
Name: std, dtype: float64

Features with 

In [None]:
import preppy.utils as preppy

# Collect the constant, quasi-constant and duplicated column names of the raw
# frame as plain variables so that later cells can reuse them.
consts = preppy.functions.identify_consts(df)
quasi_consts = preppy.functions.identify_quasi_consts(df)
duplicates = preppy.functions.check_col_duplicates(df)

# Echo the groups in this order: duplicates, constants, quasi-constants.
for flagged_columns in (duplicates, consts, quasi_consts):
    print(flagged_columns)

['informative_2', 'constant_2', 'informative_1']
['constant_1', 'constant_2']
['constant_1', 'constant_2']


In [None]:
import pickle

# Columns flagged as constant, quasi-constant or duplicated are going to be
# dropped, so they must also disappear from the per-type column lists.
# `sorted` keeps the printed order stable; iterating a bare `set` of strings
# can change order between kernel sessions.
all_deletes = sorted(set(consts + quasi_consts + duplicates))
for col in all_deletes:
  print(col, df[col].dtype)

# The per-type lists are otherwise only bound by a later cell, so read them
# from the pickle here to keep this cell runnable on a fresh kernel
# (Restart & Run All).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open('var_types.pkl', 'rb') as f:
  var_types = pickle.load(f)

# Build filtered copies rather than calling `list.remove`: a flagged column
# that is absent from a list, or a second run of this cell, cannot raise
# ValueError, and the lists stored inside `var_types` are left untouched.
df_numerical = [c for c in var_types['df_numerical'] if c not in all_deletes]
df_object = [c for c in var_types['df_object'] if c not in all_deletes]
df_discreet = [c for c in var_types['df_discreet'] if c not in all_deletes]
df_categorical_features = [
  c for c in var_types['df_categorical_features'] if c not in all_deletes
]


constant_2 object
constant_1 object
informative_1 float64
informative_2 float64


## PrepPy Pipeline

In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import preppy.utils as preppy

# Cleaning steps, run in the listed order by the sklearn Pipeline.
pipe = [
    ('constants', preppy.classes.RemoveConstants()),
    ('quasiconsts', preppy.classes.RemoveQuasiConstants(thresh=0.8)),
    ('duplicates', preppy.classes.DropDuplicates()),
    ('missing', preppy.classes.HandleMissingValues()),
    # ('encoding', HandleCatEncodeing())
]

pipe_model = Pipeline(steps=pipe)
data = pipe_model.fit_transform(df)

# Re-label the transformed data with every original column that is not in the
# previously computed constant / quasi-constant / duplicate lists.
# NOTE(review): this assumes the pipeline removed exactly those columns. The
# pipeline's quasi-constant step uses thresh=0.8, whereas `quasi_consts` came
# from `identify_quasi_consts(df)` with its defaults -- confirm they agree.
dropped_columns = consts + quasi_consts + duplicates
cols = [name for name in df.columns if name not in dropped_columns]
nu_df = pd.DataFrame(data=data, columns=cols)
nu_df.info()

  dfx[feat] = df[feat].fillna(df[feat].mode()[0])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 37 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   binary                 1000 non-null   object
 1   given_name             1000 non-null   object
 2   surname                1000 non-null   object
 3   date_of_birth          1000 non-null   object
 4   phone_number           1000 non-null   object
 5   email                  1000 non-null   object
 6   address                1000 non-null   object
 7   city                   1000 non-null   object
 8   state                  1000 non-null   object
 9   zipcode                1000 non-null   object
 10  duplicate_2            1000 non-null   object
 11  random label num 3     1000 non-null   object
 12  multicollinearity 1    1000 non-null   object
 13  correlated w target 2  1000 non-null   object
 14  semi_constant_1        1000 non-null   object
 15  corr_feature_class    

In [None]:
import pickle

# Restore the saved mapping of variable-type name -> list of column names.
# pickle is only safe for files you trust: loading can run arbitrary code.
with open('var_types.pkl', mode='rb') as pickle_file:
    var_types = pickle.load(pickle_file)

print(var_types)

{'df_numerical': ['zipcode', 'duplicate_2', 'multicollinearity 1', 'correlated w target 2', 'corr_feature_class', 'uniform corr 2', 'duplicate_1', 'correlated w target 1', 'outliers 1', 'informative_2', 'outliers 2', 'target', 'min max scaling', 'informative_1', 'uniform corr 1', 'multicollinearity 3', 'standard scaling', 'multicollinearity 4', 'multicollinearity 2', 'class'], 'df_object': ['binary', 'given_name', 'surname', 'date_of_birth', 'phone_number', 'email', 'address', 'city', 'state', 'random label num 3', 'constant_1', 'semi_constant_1', 'constant_2', 'pd qcut1', 'random choice 4', 'random choice 2', 'semi_constant_2', 'pd qcut3', 'random choice 7', 'random label num 11', 'pd qcut2'], 'df_discreet': [], 'df_categorical_features': ['binary', 'given_name', 'surname', 'date_of_birth', 'phone_number', 'email', 'address', 'city', 'state', 'random label num 3', 'constant_1', 'semi_constant_1', 'constant_2', 'pd qcut1', 'random choice 4', 'random choice 2', 'semi_constant_2', 'pd qc

In [None]:
# Unpack the per-type column lists, keeping only the columns still present in
# the cleaned frame `nu_df`. Binding the pickled lists directly would bring
# back columns that were dropped as constant or duplicated, and would alias
# the lists stored inside `var_types`; fresh filtered copies avoid both.
surviving_columns = set(nu_df.columns)
df_numerical = [col for col in var_types['df_numerical'] if col in surviving_columns]
df_object = [col for col in var_types['df_object'] if col in surviving_columns]
df_discreet = [col for col in var_types['df_discreet'] if col in surviving_columns]
df_categorical_features = [
    col for col in var_types['df_categorical_features'] if col in surviving_columns
]

In [None]:
# Numeric columns that are still present in `nu_df`, in `nu_df` column order.
numeric_names = var_types['df_numerical']
df_numerical = [name for name in nu_df.columns if name in numeric_names]

# The info() output of the pipeline result lists these columns as `object`,
# so cast them back to float before further numeric work.
nu_df[df_numerical] = nu_df[df_numerical].astype('float64')
nu_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 37 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   binary                 1000 non-null   object 
 1   given_name             1000 non-null   object 
 2   surname                1000 non-null   object 
 3   date_of_birth          1000 non-null   object 
 4   phone_number           1000 non-null   object 
 5   email                  1000 non-null   object 
 6   address                1000 non-null   object 
 7   city                   1000 non-null   object 
 8   state                  1000 non-null   object 
 9   zipcode                1000 non-null   float64
 10  duplicate_2            1000 non-null   float64
 11  random label num 3     1000 non-null   object 
 12  multicollinearity 1    1000 non-null   float64
 13  correlated w target 2  1000 non-null   float64
 14  semi_constant_1        1000 non-null   object 
 15  corr_

## Feature Engineering

### Feature Combination

In [None]:
# Feature combination: replace the two scaling columns with their element-wise
# sum. Rows where either input is missing end up as NaN in the new column.
scaling_inputs = ['standard scaling', 'min max scaling']
df['scaling_combined'] = df[scaling_inputs[0]].add(df[scaling_inputs[1]])
df.drop(columns=scaling_inputs, inplace=True)

### Categorical Encoding

In [None]:
# code along
import preppy.utils as utils

# Impute missing values before applying do_OHE: fill every object column with
# its most frequent value.
for col in df.select_dtypes(include=['object']).columns:
    modes = df[col].mode()
    # `mode()` returns an empty Series when the column has no non-null values;
    # taking element 0 would raise, so such a column is left untouched.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Encode the frame with PrepPy's `do_OHE` helper and inspect the result.
df = utils.functions.do_OHE(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 45 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   binary                             709 non-null    float64
 1   given_name                         1010 non-null   float64
 2   surname                            1010 non-null   float64
 3   date_of_birth                      1010 non-null   float64
 4   phone_number                       1010 non-null   float64
 5   email                              1010 non-null   float64
 6   address                            1010 non-null   float64
 7   city                               1010 non-null   float64
 8   state                              1010 non-null   float64
 9   zipcode                            700 non-null    float64
 10  duplicate_2                        1010 non-null   float64
 11  multicollinearity 1                1010 non-null   float

In [None]:
# Persist the prepared frame; index=False keeps the row index out of the CSV.
df.to_csv('prepared_data.csv', index=False)