# Chapter 07 Preprocessing Data

In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import (
    ensemble,
    model_selection,    
    preprocessing,
    tree,
)

In [8]:
import warnings
# Blanket filter: with no category or module argument this hides every
# warning raised for the rest of the session, deprecation notices included.
warnings.filterwarnings('ignore')

In [None]:
from sklearn.experimental import (
    enable_iterative_imputer,
)

In [9]:
# Location of the Titanic passenger spreadsheet (.xls).
# NOTE(review): this is fetched over plain HTTP each time the cell runs —
# confirm the host is still reachable, or keep a local copy for re-runs.
url = (
    "http://biostat.mc.vanderbilt.edu/"
    "wiki/pub/Main/DataSets/titanic3.xls"
)
# Load the spreadsheet into df, the frame the later name/cabin cells work on.
df = pd.read_excel(url)

In [10]:
# Small two-column frame used by the scaling examples: "a" is evenly
# spaced, while "b" spans a much wider and more lopsided range.
X2 = pd.DataFrame(
    dict(
        a=range(5),
        b=[-100, -50, 0, 200, 1000],
    )
)
X2

Unnamed: 0,a,b
0,0,-100
1,1,-50
2,2,0
3,3,200
4,4,1000


## Standardize

In [11]:
from sklearn import preprocessing
# Fit a StandardScaler on X2 and return the scaled values in one call;
# std keeps the fitted scaler so its learned attributes can be inspected.
std = preprocessing.StandardScaler()
std.fit_transform(X2)

array([[-1.41421356, -0.75995002],
       [-0.70710678, -0.63737744],
       [ 0.        , -0.51480485],
       [ 0.70710678, -0.02451452],
       [ 1.41421356,  1.93664683]])

In [12]:
# Per-column statistics learned by the fitted scaler. A cell echoes only its
# final expression, so just std.var_ shows up in the output below.
std.scale_
std.mean_
std.var_

array([2.000e+00, 1.664e+05])

In [13]:
# pandas version
# StandardScaler divides by the population standard deviation (ddof=0);
# std.var_ above is 2.0 for range(5), which is the population variance.
# DataFrame.std() defaults to the sample estimate (ddof=1), so ddof=0 is
# passed explicitly to reproduce the scikit-learn values.
X_std = (X2 - X2.mean()) / X2.std(ddof=0)
X_std
X_std.mean()
# Check with the same estimator that was used for scaling: each column
# should now have unit standard deviation.
X_std.std(ddof=0)

a    1.0
b    1.0
dtype: float64

## Scale to range

In [16]:
from sklearn import preprocessing
# Learn each column's minimum and maximum from X2, then rescale X2 with
# them; the output below runs from 0 to 1 in both columns.
mms = preprocessing.MinMaxScaler()
mms.fit(X2)
mms.transform(X2)

array([[0.        , 0.        ],
       [0.25      , 0.04545455],
       [0.5       , 0.09090909],
       [0.75      , 0.27272727],
       [1.        , 1.        ]])

In [17]:
# pandas equivalent of the min-max scaling above: shift each column by its
# minimum, then divide by that column's range.
(X2 - X2.min()) / (X2.max() - X2.min())

Unnamed: 0,a,b
0,0.0,0.0
1,0.25,0.045455
2,0.5,0.090909
3,0.75,0.272727
4,1.0,1.0


## Dummy variables

In [18]:
# Two-row frame of string columns used by the encoding examples.
X_cat = pd.DataFrame(
    dict(
        name=["George", "Paul"],
        inst=["Bass", "Guitar"],
    )
)
X_cat

Unnamed: 0,inst,name
0,Bass,George
1,Guitar,Paul


In [19]:
# Indicator-encode the string columns. drop_first=True omits the first level
# of each column, so only inst_Guitar and name_Paul remain in the output.
pd.get_dummies(X_cat, drop_first=True)

Unnamed: 0,inst_Guitar,name_Paul
0,0,0
1,1,1


## Label encoder

In [21]:
from sklearn import preprocessing
# Fit a label encoder on the name column and return the integer code
# assigned to each row (see the array output below).
lab = preprocessing.LabelEncoder()
lab.fit_transform(X_cat.name)

array([0, 1], dtype=int64)

In [22]:
# Use the fitted encoder to turn integer codes back into the original labels.
lab.inverse_transform([1, 1, 0])

array(['Paul', 'Paul', 'George'], dtype=object)

In [23]:
# pandas-only alternative: convert to an ordered categorical and take its
# integer codes, adding 1 so the codes shown below start at 1.
X_cat.name.astype(
    "category"
).cat.as_ordered().cat.codes + 1

0    1
1    2
dtype: int8

## Frequency encoding

In [24]:
# Count how often each name occurs, then replace every name by its count.
mapping = X_cat.name.value_counts()
X_cat.name.map(mapping)

0    1
1    1
Name: name, dtype: int64

In [None]:
## Pulling categories from strings

In [25]:
from collections import Counter
# Running tally of every three-character window seen by triples().
c = Counter()
def triples(val):
    """Tally each three-character substring of ``val`` in the global counter ``c``.

    Only full-width windows are counted, so a string shorter than three
    characters contributes nothing.

    Parameters
    ----------
    val : str
        Text to scan.

    Returns
    -------
    None
        The result is recorded as a side effect on ``c``.
    """
    # Stop two characters before the end: slicing past the end would yield
    # one- and two-character tails, which are not triples.
    for i in range(len(val) - 2):
        c[val[i : i + 3]] += 1
# Run triples over every passenger name for its side effect on the counter c;
# the Series that apply returns is discarded.
df.name.apply(triples)
# Show the ten most frequent fragments.
c.most_common(10)

[(', M', 1282),
 (' Mr', 954),
 ('r. ', 830),
 ('Mr.', 757),
 ('s. ', 460),
 ('n, ', 320),
 (' Mi', 283),
 ('iss', 261),
 ('ss.', 261),
 ('Mis', 260)]

In [26]:
## Regular expression

In [27]:
# Capture the run of letters immediately before a period in each name and
# preview the first rows. The pattern is a raw string so that "\." reaches
# the regex engine as an escaped literal dot instead of being treated as an
# invalid string escape sequence.
df.name.str.extract(
    r"([A-Za-z]+)\.", expand=False
).head()

0      Miss
1    Master
2      Miss
3        Mr
4       Mrs
Name: name, dtype: object

In [28]:
# Count how often each captured prefix (letters directly before a period)
# occurs. The pattern is a raw string so that "\." reaches the regex engine
# as an escaped literal dot instead of being treated as an invalid string
# escape sequence.
df.name.str.extract(
    r"([A-Za-z]+)\.", expand=False
).value_counts()

Mr          757
Miss        260
Mrs         197
Master       61
Dr            8
Rev           8
Col           4
Major         2
Ms            2
Mlle          2
Don           1
Jonkheer      1
Lady          1
Dona          1
Countess      1
Capt          1
Sir           1
Mme           1
Name: name, dtype: int64

## Other types of encoders

## Date feature engineering

## Add col-na feature

In [34]:
from pandas.api.types import is_numeric_dtype
def fix_missing(df, col, name, na_dict):
    """Fill gaps in a numeric column and flag which rows were filled.

    Non-numeric columns are left alone. A numeric column is processed when
    it contains at least one null or when ``name`` already has an entry in
    ``na_dict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place; gains ``name + "_na"`` and has ``name``
        overwritten with the filled values.
    col : pandas.Series
        The column's current values.
    name : str
        Column label in ``df``.
    na_dict : dict
        Fill value per column name. An existing entry is reused; otherwise
        the column median is used and stored under ``name``.

    Returns
    -------
    dict
        The same ``na_dict`` object, updated when the column was processed.
    """
    if not is_numeric_dtype(col):
        return na_dict

    missing_mask = pd.isnull(col)
    has_known_filler = name in na_dict
    if not (missing_mask.sum() or has_known_filler):
        return na_dict

    # Record the gaps before they are filled.
    df[name + "_na"] = missing_mask
    if has_known_filler:
        filler = na_dict[name]
    else:
        filler = col.median()
    df[name] = col.fillna(filler)
    na_dict[name] = filler
    return na_dict
# Demo frame: one numeric column with a single missing entry.
data = pd.DataFrame({"A": [0, None, 5, 100]})
# fix_missing modifies data in place; the dict it returns is discarded here.
fix_missing(data, data.A, "A", {})
data

Unnamed: 0,A,A_na
0,0.0,False
1,5.0,True
2,5.0,False
3,100.0,False


In [None]:
## Manual feature engineering

In [35]:
# Hand-written missing-value handling: flag the gaps first, then fill them
# with the column median. The order matters, because after filling there
# would be no nulls left to flag.
data = pd.DataFrame({"A": [0, None, 5, 100]})
data["A_na"] = pd.isnull(data["A"])
data["A"] = data["A"].fillna(data["A"].median())

In [36]:
# Display the frame after the manual flag-and-fill steps.
data

Unnamed: 0,A,A_na
0,0.0,False
1,5.0,True
2,5.0,False
3,100.0,False


In [37]:
# Per-cabin summary statistics, joined back onto each passenger row.
# Only numeric columns are aggregated: "mean" and "sum" are undefined for
# the text columns, and recent pandas versions raise on them instead of
# silently dropping them.
agg = (
    df.select_dtypes(include="number")
    .groupby(df["cabin"])
    .agg(["min", "max", "mean", "sum"])
    .reset_index()
)
# Flatten the (column, statistic) MultiIndex into names such as "fare_mean";
# strip("_") tidies the key column, whose second level is empty.
agg.columns = [
    "_".join(col_parts).strip("_")
    for col_parts in agg.columns.values
]
# Default inner merge: passengers whose cabin has no group in agg (for
# example a missing cabin) are not carried into agg_df.
agg_df = df.merge(agg, on="cabin")

In [39]:
# Preview the first rows of the merged frame, including the per-cabin columns.
agg_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,...,parch_mean,parch_sum,fare_min,fare_max,fare_mean,fare_sum,body_min,body_max,body_mean,body_sum
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,...,0.5,1,211.3375,211.3375,211.3375,422.675,,,,0.0
1,1,1,"Madill, Miss. Georgette Alexandra",female,15.0,0,1,24160,211.3375,B5,...,0.5,1,211.3375,211.3375,211.3375,422.675,,,,0.0
2,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,...,2.0,8,151.55,151.55,151.55,606.2,135.0,135.0,135.0,135.0
3,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,...,2.0,8,151.55,151.55,151.55,606.2,135.0,135.0,135.0,135.0
4,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,...,2.0,8,151.55,151.55,151.55,606.2,135.0,135.0,135.0,135.0


In [41]:
# Column labels after the merge: the original columns plus the aggregates.
agg_df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest', 'pclass_min',
       'pclass_max', 'pclass_mean', 'pclass_sum', 'survived_min',
       'survived_max', 'survived_mean', 'survived_sum', 'age_min', 'age_max',
       'age_mean', 'age_sum', 'sibsp_min', 'sibsp_max', 'sibsp_mean',
       'sibsp_sum', 'parch_min', 'parch_max', 'parch_mean', 'parch_sum',
       'fare_min', 'fare_max', 'fare_mean', 'fare_sum', 'body_min', 'body_max',
       'body_mean', 'body_sum'],
      dtype='object')

In [42]:
# Column labels of the original frame, for comparison with agg_df.columns.
df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [43]:
# completely not proper here