### Preprocess Data

In [1]:
import pandas as pd
import numpy as np

### Pulling Categories from Strings

In [2]:
# Titanic passenger spreadsheet (titanic3.xls) published on hbiostat.org.
url = "http://hbiostat.org/data/repo/titanic3.xls"

df = pd.read_excel(url)
# Keep an independent snapshot of the raw data. A plain `orig_df = df` would
# only bind a second name to the same object, so any in-place change made to
# `df` later on would silently show up in `orig_df` as well.
orig_df = df.copy()

In [3]:
from collections import Counter

# Module-level tally of every 3-character window seen by triples().
c = Counter()


def triples(val):
    """Add each 3-character window of ``val`` to the global counter ``c``.

    Only complete windows are counted: a string of length ``n`` contributes
    ``n - 2`` entries, and strings shorter than three characters contribute
    none. (Iterating over every start index would also record the 2- and
    1-character fragments at the end of the string, which are not triples.)

    Parameters
    ----------
    val : str
        Text to scan.

    Returns
    -------
    None
        The function is called for its side effect on ``c``.
    """
    # range() is empty when len(val) < 3, so short strings are skipped.
    c.update(val[start:start + 3] for start in range(len(val) - 2))
        
# Feed every passenger name through triples(); only the side effect on the
# counter matters, so nothing is collected from the calls.
for full_name in df["name"]:
    triples(full_name)

# Ten most frequent fragments and their counts.
c.most_common(10)


[(', M', 1282),
 (' Mr', 954),
 ('r. ', 830),
 ('Mr.', 757),
 ('s. ', 460),
 ('n, ', 320),
 (' Mi', 283),
 ('iss', 261),
 ('ss.', 261),
 ('Mis', 260)]

In [4]:
# Capture the first run of ASCII letters that is immediately followed by a
# "." in each name, and preview the first five results.
(
    df["name"]
    .str.extract(pat="([A-Za-z]+)\\.", expand=False)
    .head()
)

0      Miss
1    Master
2      Miss
3        Mr
4       Mrs
Name: name, dtype: object

In [5]:
# Same extraction as above (letters directly before a "."), this time
# tallying how many names yield each captured value.
(
    df["name"]
    .str.extract(pat="([A-Za-z]+)\\.", expand=False)
    .value_counts()
)

name
Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Mlle          2
Ms            2
Major         2
Capt          1
Sir           1
Dona          1
Jonkheer      1
Countess      1
Don           1
Mme           1
Lady          1
Name: count, dtype: int64

### Manual Feature Engineering

In [25]:
# Per-cabin age statistics; reset_index() turns the "cabin" group key back
# into an ordinary column so it can be used as the merge key below.
agg = (
    df.groupby("cabin")
    .agg({"age": ["min", "max", "mean", "sum"]})
    .reset_index()
)
# Flatten the two-level column labels: ("age", "min") -> "age_min", while
# ("cabin", "") -> "cabin" because the trailing "_" is stripped.
agg.columns = ["_".join(levels).strip("_") for levels in agg.columns]
# Inner join (the merge default, spelled out here). groupby() skips missing
# cabin values by default, so passengers without a cabin find no match in
# `agg` and are absent from agg_df.
agg_df = df.merge(agg, on="cabin", how="inner")

In [26]:
# First five rows of the merged frame, including the new age_* columns.
agg_df.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,age_min,age_max,age_mean,age_sum
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO",15.0,29.0,22.0,44.0
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON",0.9167,30.0,14.479175,57.9167
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0.9167,30.0,14.479175,57.9167
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",0.9167,30.0,14.479175,57.9167
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0.9167,30.0,14.479175,57.9167


### Collinear Columns

In [27]:
# Remove the columns that are not carried into the encoded frame.
# The cell rebinds agg_df to its own output, so on a second run these columns
# are already gone; errors="ignore" makes the cell safe to re-run instead of
# raising KeyError. Trade-off: a misspelled column name is skipped silently,
# so check the preview below after editing this list.
agg_df = agg_df.drop(
    columns=[
        "name",
        "ticket",
        "home.dest",
        "boat",
        "body",
        "cabin",
    ],
    errors="ignore",
)


In [28]:
# First five rows once the unwanted columns have been removed.
agg_df.head(n=5)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked,age_min,age_max,age_mean,age_sum
0,1,1,female,29.0,0,0,211.3375,S,15.0,29.0,22.0,44.0
1,1,1,male,0.9167,1,2,151.55,S,0.9167,30.0,14.479175,57.9167
2,1,0,female,2.0,1,2,151.55,S,0.9167,30.0,14.479175,57.9167
3,1,0,male,30.0,1,2,151.55,S,0.9167,30.0,14.479175,57.9167
4,1,0,female,25.0,1,2,151.55,S,0.9167,30.0,14.479175,57.9167


In [29]:
# One-hot encode the remaining non-numeric columns. drop_first=True omits the
# first level of each encoded column, so a k-level column yields k-1 indicators.
agg_df = pd.get_dummies(data=agg_df, drop_first=True)

In [30]:
# First five rows of the encoded frame (indicator columns appear at the end).
agg_df.head(n=5)

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,age_min,age_max,age_mean,age_sum,sex_male,embarked_Q,embarked_S
0,1,1,29.0,0,0,211.3375,15.0,29.0,22.0,44.0,False,False,True
1,1,1,0.9167,1,2,151.55,0.9167,30.0,14.479175,57.9167,True,False,True
2,1,0,2.0,1,2,151.55,0.9167,30.0,14.479175,57.9167,False,False,True
3,1,0,30.0,1,2,151.55,0.9167,30.0,14.479175,57.9167,True,False,True
4,1,0,25.0,1,2,151.55,0.9167,30.0,14.479175,57.9167,False,False,True


In [31]:
def correlated_columns(df, threshold=0.5):
    """List column pairs whose absolute Pearson correlation exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated pairwise via ``df.corr()``.
    threshold : float, default 0.5
        A pair is reported when ``abs(correlation) > threshold``.

    Returns
    -------
    pandas.DataFrame
        Columns ``level_0``, ``level_1`` and ``pearson``. Each row is one pair
        taken from the strict lower triangle of the correlation matrix, so
        ``level_0`` is the column that comes later in ``df`` and ``level_1``
        the earlier one. Rows are then kept only when their ``level_0`` value
        does not occur anywhere in the ``level_1`` column. The row labels are
        those assigned before that last filter, so they may have gaps.
    """
    corr = df.corr()

    def _without_names(labels):
        # reset_index() only emits the "level_0"/"level_1" columns that the
        # final query relies on when the index levels are unnamed. A named
        # column axis (e.g. from a pivot) would otherwise give both levels the
        # same name and make reset_index() fail on the duplicate.
        return labels.set_names([None] * labels.nlevels)

    # Strict lower triangle (k=-1 zeroes the diagonal and everything above it)
    # so each pair appears once and self-correlations are excluded. The labels
    # come from the correlation matrix itself rather than from df, so the
    # shapes always agree even if corr() returns fewer columns than df has.
    lower = pd.DataFrame(
        np.tril(corr, k=-1),
        index=_without_names(corr.index),
        columns=_without_names(corr.columns),
    )

    pairs = lower.stack().rename("pearson")
    strong = pairs[pairs.abs() > threshold].reset_index()
    return strong.query("level_0 not in level_1")

# Report the strongly correlated pairs in the encoded frame (|r| > 0.5).
correlated_columns(agg_df, threshold=0.5)

Unnamed: 0,level_0,level_1,pearson
3,age_mean,age,0.862039
4,age_mean,age_min,0.914408
5,age_mean,age_max,0.835706
6,age_sum,sibsp,0.579841
7,age_sum,age_max,0.641381
8,sex_male,survived,-0.581471


In [34]:
# Second approach: name every column that correlates strongly with at least
# one column appearing before it in the frame.
limit = 0.5
corr = agg_df.corr()
# True only strictly above the diagonal (k=1), so self-correlations and the
# mirrored lower half are blanked out by where() below.
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr_no_diag = corr.where(mask)
# In the upper triangle, column j holds its correlations with columns 0..j-1;
# flag it when any of those exceeds the limit in absolute value.
coll = [
    label
    for label, flagged in corr_no_diag.abs().gt(limit).any().items()
    if flagged
]
coll

['age_min', 'age_max', 'age_mean', 'age_sum', 'sex_male']