# 1. Feature construction

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import seaborn as sns

In [4]:
# Load train.csv and keep four predictor columns plus the target.
# Column order matters: later cells slice X and y out of df by position.
selected_columns = ['Age', 'Pclass', 'SibSp', 'Parch', 'Survived']
df = pd.read_csv('train.csv')[selected_columns]

In [5]:
# Preview the first five rows of the reduced frame.
df.head(n=5)

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [6]:
# Share of missing values per column, expressed as a percentage.
missing_fraction = df.isnull().mean()
missing_fraction * 100

Age         19.86532
Pclass       0.00000
SibSp        0.00000
Parch        0.00000
Survived     0.00000
dtype: float64

In [7]:
# Drop every row that contains at least one missing value.
df = df.dropna()

In [9]:
# Compare model performance before and after feature construction.

In [26]:
# Feature matrix: the first four columns of df (Age, Pclass, SibSp, Parch).
# Take an explicit copy: later cells add columns to X, and doing that on a
# slice of df triggers pandas' SettingWithCopyWarning.
X = df.iloc[:, 0:4].copy()
# Target vector: the last column of df (Survived).
y = df.iloc[:, -1]

In [27]:
# Quick look at the first two rows of the feature matrix.
X.head(n=2)

Unnamed: 0,Age,Pclass,SibSp,Parch
0,22.0,3,1,0
1,38.0,1,1,0


In [28]:
# Baseline: mean accuracy over 10-fold cross-validation of a default
# LogisticRegression on the original features.
baseline_scores = cross_val_score(
    LogisticRegression(),
    X,
    y,
    scoring='accuracy',
    cv=10,
)
cr_val = np.mean(baseline_scores)
cr_val

0.6921165884194054

### Now applying feature construction

In [29]:
# Total family size: SibSp + Parch, plus 1 (presumably to count the
# passenger themselves).
relatives_aboard = X['SibSp'] + X['Parch']
X['family_size'] = relatives_aboard + 1
X.head(n=3)

Unnamed: 0,Age,Pclass,SibSp,Parch,family_size
0,22.0,3,1,0,2
1,38.0,1,1,0,2
2,26.0,3,0,0,1


In [30]:
# Custom function that encodes the type of family from its size.
def myfunc(num):
    """Map a family size to a family-type code.

    Parameters
    ----------
    num : number
        Family size.

    Returns
    -------
    int
        0 when ``num`` is exactly 1 (travelling alone),
        1 when ``1 < num <= 4`` (small family),
        2 for every other value (large family; this fallback also catches
        any value below 1).
    """
    if num == 1:
        return 0
    if 1 < num <= 4:
        return 1
    return 2

In [31]:
# Encode each row's family size as a family-type code via myfunc.
family_sizes = X['family_size']
X['family_type'] = family_sizes.apply(myfunc)

In [32]:
# Spot-check four random rows. A fixed random_state makes the displayed
# sample identical on every run, so the saved output stays reproducible.
X.sample(4, random_state=42)

Unnamed: 0,Age,Pclass,SibSp,Parch,family_size,family_type
210,24.0,3,0,0,1,0
426,28.0,2,1,0,2,1
228,18.0,2,0,0,1,0
315,26.0,3,0,0,1,0


In [33]:
# The columns used to build family_type are no longer needed, so remove them.
X = X.drop(columns=['SibSp', 'Parch', 'family_size'])
X.head(n=5)

Unnamed: 0,Age,Pclass,family_type
0,22.0,3,1
1,38.0,1,1
2,26.0,3,0
3,35.0,1,1
4,35.0,3,0


In [35]:
# Same evaluation as the baseline (default LogisticRegression, 10-fold CV,
# accuracy), now on the constructed feature set.
constructed_scores = cross_val_score(
    LogisticRegression(),
    X,
    y,
    scoring='accuracy',
    cv=10,
)
new_cross_val = np.mean(constructed_scores)
new_cross_val

0.6963810641627542

In [36]:
# Only a slight improvement in accuracy (about 0.692 -> 0.696).

# 2. Feature splitting

In [37]:
# Reload train.csv with all of its columns. This rebinds df, replacing the
# reduced frame (with missing rows dropped) that the cells above worked on.
df = pd.read_csv('train.csv')

In [39]:
# Preview the first two rows of the full frame.
df.head(n=2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [41]:
# Inspect the raw Name strings to see how they are laid out.
passenger_names = df['Name']
passenger_names.head(n=3)

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
Name: Name, dtype: object

In [43]:
# We need the salutation (title) from each name.
# Step 1: split on ", " and keep the second piece, i.e. the text that
# follows the first ", " (e.g. "Mr. Owen Harris").
name_pieces = df['Name'].str.split(', ', expand=True)
df['title'] = name_pieces[1]

In [45]:
# Step 2: keep only the text before the first "." (e.g. "Mr").
# A single-character pattern is split on literally, not as a regex.
title_pieces = df['title'].str.split('.', expand=True)
df['title'] = title_pieces[0]

In [46]:
# Display the extracted title column.
df.loc[:, 'title']

0        Mr
1       Mrs
2      Miss
3       Mrs
4        Mr
       ... 
886     Rev
887    Miss
888    Miss
889      Mr
890      Mr
Name: title, Length: 891, dtype: object

In [59]:
# Survival rate per title, sorted ascending and shown as a percentage.
survival_rate_by_title = df.groupby('title')['Survived'].mean()
survival_rate_by_title.sort_values() * 100

title
Capt              0.000000
Don               0.000000
Jonkheer          0.000000
Rev               0.000000
Mr               15.667311
Dr               42.857143
Col              50.000000
Major            50.000000
Master           57.500000
Miss             69.780220
Mrs              79.200000
Mme             100.000000
Sir             100.000000
Ms              100.000000
Lady            100.000000
Mlle            100.000000
the Countess    100.000000
Name: Survived, dtype: float64

In [61]:
# Binary flag intended to separate married from unmarried passengers.
# NOTE(review): only the exact title 'Mrs' is set to 1; every other title
# is set to 0, so this really marks "titled Mrs" - confirm this is the
# intended definition of married.
has_mrs_title = df['title'] == 'Mrs'
df['is_married'] = np.where(has_mrs_title, 1, 0)
df.head(n=2)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,is_married
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,1
