In [4]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

## 1. Data retrieval

In [5]:
# Relative folder that receives every CSV written by this notebook
DATASETS_DIR = 'datasets/' 
# File name (inside DATASETS_DIR) of the cleaned data written by data_retrieval
RETRIEVED_DATA = 'raw-data.csv'

def data_retrieval(url):
    """Download the raw passenger data, tidy it up and store it as a CSV file.

    Steps:
      1. read the CSV located at ``url``;
      2. turn the '?' placeholder into NaN and cast 'age' / 'fare' to float;
      3. keep only the first cabin listed for each passenger;
      4. derive a 'title' column from 'name';
      5. drop the columns listed in DROP_COLS;
      6. write the result to DATASETS_DIR + RETRIEVED_DATA.

    Parameters
    ----------
    url : str
        Location handed to ``pd.read_csv``.

    Returns
    -------
    None
        The result is the CSV written to disk; the destination is printed.
    """
    import os  # local import: only needed to make sure the output folder exists

    # helper function 1
    def get_first_cabin(row):
        """Return the first whitespace-separated cabin, or NaN when there is none."""
        # Missing cabins arrive as NaN (a float), not as a string
        if not isinstance(row, str):
            return np.nan
        cabins = row.split()
        return cabins[0] if cabins else np.nan

    # helper function 2
    def get_title(passenger):
        """Return the first matching title found in the passenger name, else 'Other'."""
        # 'Mrs' must be tested before 'Mr', which is a prefix of it
        for title in ('Mrs', 'Mr', 'Miss', 'Master'):
            if re.search(title, passenger):
                return title
        return 'Other'

    # Loading data from specific url
    data = pd.read_csv(url)

    # Uncovering missing data
    data = data.replace('?', np.nan)
    data['age'] = data['age'].astype('float')
    data['fare'] = data['fare'].astype('float')

    # Keep only one cabin | Extract the title from 'name'
    data['cabin'] = data['cabin'].apply(get_first_cabin)
    data['title'] = data['name'].apply(get_title)

    # Dropping irrelevant columns (keyword form: pandas 2.x no longer accepts
    # the axis as a positional argument)
    DROP_COLS = ['boat','body','home.dest','ticket','name']
    data = data.drop(columns=DROP_COLS)

    # Make sure the destination folder exists before writing
    out_path = DATASETS_DIR + RETRIEVED_DATA
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    data.to_csv(out_path, index=False)

    print('Data stored in {}'.format(out_path))

In [6]:
# OpenML CSV export that data_retrieval reads with pd.read_csv
URL = 'https://www.openml.org/data/get_csv/16826755/phpMYEkMl'
data_retrieval(URL)

Data stored in datasets/raw-data.csv




In [7]:
# Reload the cleaned data from the CSV written by data_retrieval
df = pd.read_csv(DATASETS_DIR + RETRIEVED_DATA)
df.shape

(1309, 10)

In [8]:
# Peek at 5 random rows (no random_state is passed, so the rows shown change between runs)
df.sample(5)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked,title
657,3,1,female,0.75,2,1,19.2583,,C,Miss
281,1,1,female,43.0,1,0,55.4417,C116,C,Mrs
1186,3,0,male,,2,0,21.6792,,C,Mr
434,2,1,female,7.0,0,2,26.25,,S,Miss
465,2,1,female,23.0,0,0,13.7917,D,C,Mrs


In [9]:
# First 5 rows of the cleaned data
df.head(5)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked,title
0,1,1,female,29.0,0,0,211.3375,B5,S,Miss
1,1,1,male,0.9167,1,2,151.55,C22,S,Master
2,1,0,female,2.0,1,2,151.55,C22,S,Miss
3,1,0,male,30.0,1,2,151.55,C22,S,Mr
4,1,0,female,25.0,1,2,151.55,C22,S,Mrs


## 2. Train-test split

In [10]:
SEED_SPLIT = 404
TRAIN_DATA_FILE = DATASETS_DIR + 'train.csv'
TEST_DATA_FILE  = DATASETS_DIR + 'test.csv'

# Hold out 20% of the rows; the seed keeps the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='survived'),
    df['survived'],
    test_size=0.2,
    random_state=SEED_SPLIT,
)

# Only the feature frames are written to disk (the targets are not saved)
for frame, path in ((X_train, TRAIN_DATA_FILE), (X_test, TEST_DATA_FILE)):
    frame.to_csv(path, index=False)

X_train.shape, X_test.shape

((1047, 9), (262, 9))

In [11]:
target = 'survived'
# Split the feature names by dtype: object columns are categorical, the rest numeric
cat_vars = [col for col, dtype in X_train.dtypes.items() if dtype == object]
num_vars = [col for col, dtype in X_train.dtypes.items() if dtype != object and col != target]

In [12]:
# Names of the non-object (numeric) feature columns
num_vars

['pclass', 'age', 'sibsp', 'parch', 'fare']

In [13]:
# Name of the column being predicted
target

'survived'

In [14]:
# Validation step: every column of df must be numeric, categorical or the
# target (the "+ 1"), so the three counts have to add up to df's column count
len(num_vars) + len(cat_vars) + 1 == df.shape[1]

True

## 3. Feature engineering

### 3.1. Without persisting information

**Numerical variables**

- Create missing value indicator: only for numeric variables

In [15]:
def missing_indicator(data, col_name):
    """Add a 0/1 column named '<col_name>_nan' that flags missing values.

    ``data`` is modified in place and nothing is returned.
    """
    flag_col = col_name + '_nan'
    is_missing = data[col_name].isna()
    data[flag_col] = is_missing.astype(int)

In [16]:
# Flag missing values of every numeric variable, train first and then test.
# missing_indicator works in place and returns None, which is what gets printed.
for var in num_vars:
    for frame in (X_train, X_test):
        print(missing_indicator(frame, var))

None
None
None
None
None
None
None
None
None
None


In [17]:
# Check that the '<column>_nan' indicator columns were appended
X_train.head(2)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked,title,pclass_nan,age_nan,sibsp_nan,parch_nan,fare_nan
1162,3,male,,0,0,7.75,,Q,Mr,0,1,0,0,0
899,3,female,27.0,0,2,11.1333,,S,Mrs,0,0,0,0,0


**Categorical variables**

- Keep only the letter in cabin
- Fill NaN with label "missing"

In [18]:
def extract_letter_from_cabin(x):
    """Keep only the letters of a cabin code (e.g. 'C22' -> 'C').

    Non-string values, such as the NaN used for a missing cabin, are
    returned unchanged.
    """
    # isinstance (instead of an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which would otherwise pass through unstripped
    if isinstance(x, str):
        return ''.join(re.findall("[a-zA-Z]+", x))
    return x

# Reduce every cabin code to its letter(s) in both sets
for frame in (X_train, X_test):
    frame['cabin'] = frame['cabin'].map(extract_letter_from_cabin)

In [19]:
# Distinct cabin values left in each set after keeping only the letters
X_train['cabin'].unique(), X_test['cabin'].unique()

(array([nan, 'C', 'E', 'B', 'F', 'D', 'A', 'G'], dtype=object),
 array(['D', nan, 'A', 'B', 'C', 'E', 'G', 'T', 'F'], dtype=object))

In [20]:
# Missing categorical values become their own 'missing' label
for frame in (X_train, X_test):
    frame[cat_vars] = frame[cat_vars].fillna('missing')

### 3.2. With persisting information

**Numerical variables**

- Fill NaN with median

In [21]:
# Learn the per-column medians from the training set only
imp_median = SimpleImputer(strategy='median')
imp_median.fit(X_train[num_vars])

SimpleImputer(strategy='median')

In [22]:
# Values the fitted imputer will use, one per column of num_vars
imp_median.statistics_

array([ 3.    , 28.    ,  0.    ,  0.    , 14.4542])

In [23]:
# Fill the numeric gaps of both sets with the medians learned on X_train
for frame in (X_train, X_test):
    frame[num_vars] = imp_median.transform(frame[num_vars])

**Categorical variables**

- Remove rare labels
- One hot encoding
- Add to the test set any one-hot-encoded columns that exist only in the training set

In [24]:
def find_rare_labels(data, col, perc):
    """Return the labels of ``col`` that cover less than ``perc`` of the rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column; it is only read, never modified.
    col : str
        Name of the column to inspect.
    perc : float
        Share threshold (0.05 means 5% of the rows of ``data``).

    Returns
    -------
    pandas.Index
        Labels whose share of the rows is strictly below ``perc``.
    """
    # The share is taken over all rows of data (data.shape[0]), including
    # rows where col is missing. Nothing is mutated, so no copy is needed.
    share = data.groupby(col)[col].count() / data.shape[0]
    return share[share < perc].index

# Rare labels (below 5% of the training rows) are learned on X_train only ...
rare_labels_ = {col: find_rare_labels(X_train, col, 0.05) for col in cat_vars}

# ... and then collapsed into a single 'Rare' label in both sets
for col, rare in rare_labels_.items():
    for frame in (X_train, X_test):
        frame[col] = np.where(frame[col].isin(rare), 'Rare', frame[col])

In [25]:
# Labels left in the second categorical variable after grouping the rare ones
# (presumably 'cabin', judging by the values shown; TODO confirm)
X_train[cat_vars[1]].unique()

array(['missing', 'C', 'Rare', 'B'], dtype=object)

In [26]:
# One-hot encode the categorical variables and append the dummy columns.
# axis is passed by keyword: pandas 2.x no longer accepts it positionally.
X_train = pd.concat([X_train, pd.get_dummies(X_train[cat_vars], drop_first=True)], axis=1)
X_test  = pd.concat([X_test, pd.get_dummies(X_test[cat_vars], drop_first=True)], axis=1)

# The original categorical columns are no longer needed once encoded
X_train = X_train.drop(columns=cat_vars)
X_test  = X_test.drop(columns=cat_vars)

  """Entry point for launching an IPython kernel.
  
  after removing the cwd from sys.path.
  """


In [27]:
# Validation step: columns produced for the training set that the test set lacks
set(X_train.columns) - set(X_test.columns)

{'embarked_Rare'}

In [28]:
# A dummy column absent from X_test means none of its rows carries that label,
# so the column is added filled with zeros
for col in X_train.columns.difference(X_test.columns):
    X_test[col] = 0

**Aligning columns of X_train and X_test**

In [29]:
# The training column order is the reference; the test set is reordered to match
ordered_vars = X_train.columns.tolist()

X_train, X_test = X_train[ordered_vars], X_test[ordered_vars]

**Scaling**

In [30]:
# The scaling ranges are learned on the training set and reused for the test set
scaler = MinMaxScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## 4. Training model

In [31]:
SEED_MODEL = 404

# C is scikit-learn's inverse regularisation strength, so 0.0005 is a heavy
# penalty (the predicted probabilities shown further down all sit close to 0.5).
# NOTE(review): confirm this value is intentional.
model = LogisticRegression(C=0.0005, class_weight='balanced', random_state=SEED_MODEL)
model.fit(X_train, y_train)

LogisticRegression(C=0.0005, class_weight='balanced', random_state=404)

In [32]:
# Report ROC-AUC (from the positive-class probabilities) and accuracy
# (from the predicted classes) for each split
for split_name, (features, labels) in (('train', (X_train, y_train)), ('test', (X_test, y_test))):
    class_pred = model.predict(features)
    proba_pred = model.predict_proba(features)[:, 1]
    print('{} roc-auc : {}'.format(split_name, roc_auc_score(labels, proba_pred)))
    print('{} accuracy: {}'.format(split_name, accuracy_score(labels, class_pred)))
    print()

train roc-auc : 0.8470412710714978
train accuracy: 0.7831900668576887

test roc-auc : 0.8163583073823043
test accuracy: 0.7748091603053435



In [33]:
# Scaled test features side by side with the true label, the predicted class
# and the predicted probability of the positive class
tmp = pd.DataFrame(X_test, columns=ordered_vars).assign(
    y_true=np.array(y_test),
    y_pred=model.predict(X_test),
    proba_pred=model.predict_proba(X_test)[:, 1],
)

tmp.head(10)

Unnamed: 0,pclass,age,sibsp,parch,fare,pclass_nan,age_nan,sibsp_nan,parch_nan,fare_nan,...,cabin_missing,embarked_Q,embarked_Rare,embarked_S,title_Mr,title_Mrs,title_Rare,y_true,y_pred,proba_pred
0,0.0,0.724426,0.0,0.222222,0.221098,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1,0.502177
1,0.5,0.386221,0.125,0.111111,0.051237,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0,0,0.481497
2,1.0,0.223382,0.0,0.0,0.015379,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1,0.513358
3,0.5,0.423799,0.125,0.0,0.040989,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0,0,0.481422
4,0.5,0.48643,0.0,0.0,0.050749,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0,0,0.481452
5,1.0,0.298538,0.0,0.0,0.01394,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1,0,0.47703
6,0.5,0.160751,0.0,0.111111,0.038061,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1,1,0.514231
7,0.0,0.611691,0.125,0.0,0.111118,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,0.501921
8,0.0,0.398747,0.0,0.0,0.148911,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.534687
9,0.0,0.26096,0.25,0.222222,0.512122,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.531581
