In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

import seaborn as sns

In [4]:
# Load the training data and keep only the four numeric features plus the
# target. The column order matters: X and y are later split by position.
df = pd.read_csv('trainData-random.csv')[
    ['Age', 'Pclass', 'SibSp', 'Parch', 'Survived']
]

In [7]:
# (rows, columns) of the loaded frame.
# NOTE(review): the row count shown here equals the one reported after the
# dropna step further down — confirm this cell was not executed out of order.
df.shape

(714, 5)

In [8]:
# Peek at five random rows. The fixed seed keeps the displayed sample
# reproducible across kernel restarts (Restart & Run All).
df.sample(5, random_state=42)

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
248,37.0,1,1,1,1
183,1.0,2,2,1,1
853,16.0,1,0,1,1
628,26.0,3,0,0,0
714,52.0,2,0,0,0


##### Dropping missing data.

In [14]:
# Drop every row that has a missing value in any column of df.
# Rebinding the result (instead of inplace=True) keeps the step chainable
# and avoids mutating a frame that an earlier cell already displayed.
df = df.dropna()

In [16]:
# Split by position: the first four columns are the features, the last
# column is the target.
# The explicit .copy() makes X an independent frame, so the columns that are
# added to (and dropped from) X later do not raise SettingWithCopyWarning or
# depend on whether the slice happens to be a view of df.
X = df.iloc[:, 0:4].copy()
y = df.iloc[:, -1]

In [17]:
# Check that the feature matrix and the target have matching row counts.
X.shape,y.shape

((714, 4), (714,))

In [18]:
# First rows of the feature matrix before any constructed columns are added.
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch
0,22.0,3,1,0
1,38.0,1,1,0
2,26.0,3,0,0
3,35.0,1,1,0
4,35.0,3,0,0


##### Checking the model's score without feature construction.

In [19]:
# Baseline: mean accuracy of a default LogisticRegression over a 20-fold
# cross-validation on the features currently in X.
np.mean(
    cross_val_score(
        estimator=LogisticRegression(),
        X=X,
        y=y,
        scoring='accuracy',
        cv=20,
    )
)

0.6933333333333332

### Applying feature construction.

##### Adding 1 because the passenger is also counted as a member of the family.

In [24]:
# Family size is SibSp + Parch, plus one for the passenger themself.
X['Family_size'] = 1 + X['SibSp'] + X['Parch']

In [25]:
# Confirm that the Family_size column is now part of X.
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size
0,22.0,3,1,0,2
1,38.0,1,1,0,2
2,26.0,3,0,0,1
3,35.0,1,1,0,2
4,35.0,3,0,0,1


In [26]:
def myfunc(num):
    """Map a family size to a family-type code.

    Returns:
        0 if ``num`` equals 1 (alone),
        1 if ``num`` is greater than 1 and at most 4 (small family),
        2 for every other value (large family; this is also the fallback
        for any value below 1).
    """
    # Guard clauses instead of an if/elif/else ladder.
    if num == 1:
        return 0  # alone
    if 1 < num <= 4:
        return 1  # small family
    return 2  # large family

In [27]:
# Boundary check: a family of 4 is still coded as a small family (1).
myfunc(4)

1

##### We can run the function above on every value by calling `apply()` and passing the function as its argument.

In [28]:
# Encode each family size as 0/1/2 by running myfunc on every value of the
# Family_size column.
X['Family_type'] = X['Family_size'].apply(myfunc)

In [29]:
# Confirm that the Family_type column is now part of X.
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size,Family_type
0,22.0,3,1,0,2,1
1,38.0,1,1,0,2,1
2,26.0,3,0,0,1,0
3,35.0,1,1,0,2,1
4,35.0,3,0,0,1,0


In [30]:
# Keep only the constructed Family_type column: SibSp, Parch and the
# intermediate Family_size are removed.
# Rebinding the result (instead of inplace=True) avoids mutating a frame that
# earlier cells already displayed and keeps the step chainable.
X = X.drop(columns=['SibSp', 'Parch', 'Family_size'])

In [31]:
# Final feature set after dropping the source columns.
X.head()

Unnamed: 0,Age,Pclass,Family_type
0,22.0,3,1
1,38.0,1,1
2,26.0,3,0
3,35.0,1,1
4,35.0,3,0


In [32]:
# Same evaluation as the baseline (default LogisticRegression, 20-fold
# cross-validation, accuracy), now on the feature set currently in X.
np.mean(
    cross_val_score(
        estimator=LogisticRegression(),
        X=X,
        y=y,
        scoring='accuracy',
        cv=20,
    )
)

0.7003174603174602

##### As we can see, the mean cross-validated accuracy increased slightly (from about 0.693 to about 0.700) with feature construction.

### Feature Splitting

In [33]:
# Reload the CSV with all of its columns for the feature-splitting section.
# Note: this rebinds df, replacing the column-subset frame used above.
df = pd.read_csv('trainData-random.csv')

In [34]:
# Preview the full set of columns of the reloaded frame.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
