### Preprocess Data

In [2]:
import pandas as pd
import numpy as np

In [3]:

# Two-column demo frame: "a" is 0..4, "b" covers a much wider range of values.
X2 = pd.DataFrame(
    data={
        "a": range(5),
        "b": [-100, -50, 0, 200, 1000],
    }
)
X2

Unnamed: 0,a,b
0,0,-100
1,1,-50
2,2,0
3,3,200
4,4,1000


### Standardize

In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X2 and return the standardized values as a NumPy array
# (each column is centered on its mean and divided by its standard deviation).
std = StandardScaler()
std.fit_transform(X2)

array([[-1.41421356, -0.75995002],
       [-0.70710678, -0.63737744],
       [ 0.        , -0.51480485],
       [ 0.70710678, -0.02451452],
       [ 1.41421356,  1.93664683]])

In [5]:
# Per-column divisor used by the fitted scaler (the square root of var_).
std.scale_

array([  1.41421356, 407.92156109])

In [6]:
# Per-column mean learned during fit.
std.mean_

array([  2., 210.])

In [7]:
# Per-column variance learned during fit (population variance, i.e. ddof=0).
std.var_

array([2.000e+00, 1.664e+05])

In [9]:
# Standardize with pandas alone: center each column, then divide by its std.
# NOTE: DataFrame.std() defaults to the sample std (ddof=1), whereas
# StandardScaler divides by the population std (ddof=0). The values here are
# therefore slightly smaller in magnitude than the StandardScaler output;
# use X2.std(ddof=0) to reproduce StandardScaler exactly.
X_std = X2.sub(X2.mean()).div(X2.std())
X_std

Unnamed: 0,a,b
0,-1.264911,-0.67972
1,-0.632456,-0.570088
2,0.0,-0.460455
3,0.632456,-0.021926
4,1.264911,1.73219


In [10]:
# Sanity check: every column mean should be (numerically) zero after standardizing.
X_std.mean()

a    4.440892e-17
b    0.000000e+00
dtype: float64

In [11]:
# Sanity check: every column's sample standard deviation should be 1 after standardizing.
X_std.std()

a    1.0
b    1.0
dtype: float64

### Scale To Range

In [12]:
from sklearn.preprocessing import MinMaxScaler
# Learn each column's min and max from X2, then rescale every column onto [0, 1].
mms = MinMaxScaler()
mms.fit(X2)
mms.transform(X2)

array([[0.        , 0.        ],
       [0.25      , 0.04545455],
       [0.5       , 0.09090909],
       [0.75      , 0.27272727],
       [1.        , 1.        ]])

In [13]:
# Min-max scaling by hand in pandas: shift each column by its minimum,
# then divide by the column's range (max - min).
X2.sub(X2.min()).div(X2.max() - X2.min())

Unnamed: 0,a,b
0,0.0,0.0
1,0.25,0.045455
2,0.5,0.090909
3,0.75,0.272727
4,1.0,1.0


### Dummy Variables

In [14]:
# Tiny all-categorical frame (one row per person) used by the encoding examples.
X_cat = pd.DataFrame(
    [
        ("George", "Bass"),
        ("Paul", "Guitar"),
    ],
    columns=["name", "inst"],
)
X_cat

Unnamed: 0,name,inst
0,George,Bass
1,Paul,Guitar


In [15]:
# One-hot encode the string columns; drop_first=True omits the first level of
# each column, leaving one indicator column per remaining level.
pd.get_dummies(X_cat, drop_first=True)

Unnamed: 0,name_Paul,inst_Guitar
0,False,False
1,True,True


In [16]:
import janitor as jn

# Frame with a missing numeric value and a comma-delimited, multi-valued string column.
X_cat2 = pd.DataFrame(
    data={
        "A": [1, None, 3],
        "names": ["Fred,George", "George", "John,Paul"],
    }
)
X_cat2

Unnamed: 0,A,names
0,1.0,"Fred,George"
1,,George
2,3.0,"John,Paul"


In [18]:
# Split "names" on commas and add one 0/1 indicator column per distinct value,
# keeping the original columns alongside the new ones.
jn.expand_column(X_cat2, "names", sep=",")

Unnamed: 0,A,names,Fred,George,John,Paul
0,1.0,"Fred,George",1,1,0,0
1,,George,0,1,0,0
2,3.0,"John,Paul",0,0,1,1


### Label Encoder

In [24]:
# Map each distinct string in "inst" to an integer code (0, 1, ...).
# NOTE: scikit-learn documents LabelEncoder as a target (y) encoder;
# OrdinalEncoder is its counterpart for feature columns.
from sklearn.preprocessing import LabelEncoder
lab = LabelEncoder()
lab.fit_transform(X_cat["inst"])

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([0, 1])

In [25]:
# Decode integer codes back into the original string labels learned during fit.
lab.inverse_transform([1,1,0])

array(['Guitar', 'Guitar', 'Bass'], dtype=object)

In [27]:
# Label-encode with pandas alone: convert to an ordered categorical and read its
# integer codes. Codes start at 0, so add 1 to get 1-based labels.
(
    X_cat["name"]
    .astype("category")
    .cat.as_ordered()
    .cat.codes
    + 1
)

0    1
1    2
dtype: int8

### Frequency Encoding

In [29]:
# Frequency encoding: replace each name with the number of times it occurs in the column.
mapping = X_cat["name"].value_counts()
X_cat["name"].map(mapping)

0    1
1    1
Name: name, dtype: int64

In [31]:
# Display the raw "name" column of X_cat.
X_cat.name

0    George
1      Paul
Name: name, dtype: object

### Pulling Categories from Strings

In [33]:
# Titanic passenger workbook hosted on hbiostat.org (needs network access to load).
url = ("http://hbiostat.org/data/repo/titanic3.xls")

df = pd.read_excel(url)
# Keep an independent snapshot of the raw data. A plain `orig_df = df` only
# creates a second name for the same object, so any in-place change to df
# would silently change orig_df as well.
orig_df = df.copy()

In [34]:
from collections import Counter

# Module-level tally of every 3-character substring seen by triples().
c = Counter()


def triples(val):
    """Count each 3-character window of ``val`` in the global counter ``c``.

    Only full-length windows are tallied: a string shorter than three
    characters contributes nothing, and the trailing 2- and 1-character
    fragments at the end of a string are not counted as "triples".
    Non-string values (e.g. NaN for a missing cell) are ignored so the
    function can be applied to a column that contains missing data.

    Parameters
    ----------
    val : str
        Text to scan.

    Returns
    -------
    None
        The result is accumulated in ``c`` as a side effect.
    """
    if not isinstance(val, str):
        return
    # len(val) - 2 start positions yield exactly the full 3-character windows.
    c.update(val[start:start + 3] for start in range(len(val) - 2))
        
# Feed every passenger name through triples(), then list the ten most frequent substrings.
for passenger_name in df["name"]:
    triples(passenger_name)
c.most_common(10)


[(', M', 1282),
 (' Mr', 954),
 ('r. ', 830),
 ('Mr.', 757),
 ('s. ', 460),
 ('n, ', 320),
 (' Mi', 283),
 ('iss', 261),
 ('ss.', 261),
 ('Mis', 260)]

In [39]:
# Pull the title out of each name: the first run of letters that sits directly before a ".".
(
    df["name"]
    .str.extract("([A-Za-z]+)\\.", expand=False)
    .head()
)

0      Miss
1    Master
2      Miss
3        Mr
4       Mrs
Name: name, dtype: object

In [40]:
# Extract the title (letters directly before a ".") from each name and count how often each occurs.
(
    df["name"]
    .str.extract("([A-Za-z]+)\\.", expand=False)
    .value_counts()
)

name
Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Mlle          2
Ms            2
Major         2
Capt          1
Sir           1
Dona          1
Jonkheer      1
Countess      1
Don           1
Mme           1
Lady          1
Name: count, dtype: int64

### Other Categorical Encoding

In [43]:
#pip install category_encoders

In [44]:
import category_encoders as ce
# Hash the categorical columns into a fixed set of integer columns
# (col_0 ... col_7 in the output) instead of one column per category.
he = ce.HashingEncoder(verbose=1)
he.fit_transform(X_cat)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,0,0,0,1,0,1,0,0
1,0,2,0,0,0,0,0,0


In [45]:
# Re-display X_cat, the frame that was passed to the hashing encoder.
X_cat

Unnamed: 0,name,inst
0,George,Bass
1,Paul,Guitar


In [48]:
# Ordinal encoding with an explicit, hand-written order for the "size" column.
# "xxl" is not in the mapping, so the encoder emits -1 for it (see the output).
size_df = pd.DataFrame(
    {
        "name": ["Fred", "John", "Matt"],
        "size": ["small", "med", "xxl"],
    }
)
size_order = {"small": 1, "med": 2, "lg": 3}
ore = ce.OrdinalEncoder(mapping=[{"col": "size", "mapping": size_order}])
ore.fit_transform(size_df)


Unnamed: 0,name,size
0,Fred,1.0
1,John,2.0
2,Matt,-1.0


### Date Feature Engineering
 * Skipped — this section covers using fastai to extract date information (p. 86).

### Add col_na Feature
 * Skipped — this section covers adding a column that indicates a value was missing when the missing value is filled in (p. 87).
 * I did one of these in one of my projects.

### Manual Feature Engineering

In [63]:
# Per-cabin age statistics (min/max/mean/sum), flattened to single-level column
# names such as "age_min", then joined back onto every passenger row.
# NOTE(review): merge() defaults to an inner join and groupby() skips missing
# keys, so passengers without a cabin value are presumably dropped from add_df;
# pass how="left" to merge() if they should be kept.
agg = (
    df.groupby("cabin")
    .agg({"age": ["min", "max", "mean", "sum"]})
    .reset_index()
)
agg.columns = ["_".join(level_names).strip("_") for level_names in agg.columns.values]
add_df = df.merge(agg, on="cabin")

In [65]:
# Preview the first rows, now carrying the per-cabin age_* aggregate columns.
add_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,age_min,age_max,age_mean,age_sum
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO",15.0,29.0,22.0,44.0
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON",0.9167,30.0,14.479175,57.9167
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0.9167,30.0,14.479175,57.9167
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",0.9167,30.0,14.479175,57.9167
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0.9167,30.0,14.479175,57.9167
