# Feature Construction and Feature Splitting

## FEATURE CONSTRUCTION

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('train.csv')[['Age', 'Pclass', 'SibSp', 'Parch', 'Survived']]

In [3]:
df.sample(8)

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
242,29.0,2,0,0,0
183,1.0,2,2,1,1
129,45.0,3,0,0,0
251,29.0,3,1,1,0
517,,3,0,0,0
889,26.0,1,0,0,1
721,17.0,3,1,0,0
821,27.0,3,0,0,1


In [4]:
df.dropna(inplace= True)
df.sample(8)

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
499,24.0,3,0,0,0
79,30.0,3,0,0,1
123,32.5,2,0,0,1
275,63.0,1,1,0,1
630,80.0,1,0,0,1
404,20.0,3,0,0,0
81,29.0,3,0,0,1
800,34.0,2,0,0,0


In [10]:
x = df.drop(columns = 'Survived' )
y = df['Survived']

In [12]:
np.mean(cross_val_score(LogisticRegression(), x,y, scoring = 'accuracy', cv = 20))

0.6933333333333332

## Applying Feature Construction

In [24]:
x['Family_size'] = x['SibSp'] + x['Parch'] + 1
x

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size,Family_type
0,22.0,3,1,0,2,0
1,38.0,1,1,0,2,0
2,26.0,3,0,0,1,2
3,35.0,1,1,0,2,0
4,35.0,3,0,0,1,2
...,...,...,...,...,...,...
885,39.0,3,0,5,6,2
886,27.0,2,0,0,1,2
887,19.0,1,0,0,1,2
889,26.0,1,0,0,1,2


## Define a new function 

In [25]:
def myfunc(num):
    if num == 1:
        #Travelling Alone
        return 0
    elif num > 1 and num <=4:
        #Small Family
        return 1
    else:
        #Large Family
        return 2

In [26]:
myfunc(3)

1

## Applying the new function

In [27]:
x['Family_type'] = x['Family_size'].apply(myfunc)

In [28]:
x

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size,Family_type
0,22.0,3,1,0,2,1
1,38.0,1,1,0,2,1
2,26.0,3,0,0,1,0
3,35.0,1,1,0,2,1
4,35.0,3,0,0,1,0
...,...,...,...,...,...,...
885,39.0,3,0,5,6,2
886,27.0,2,0,0,1,0
887,19.0,1,0,0,1,0
889,26.0,1,0,0,1,0


In [29]:
x.drop(columns = ['SibSp', 'Parch', 'Family_size'])

Unnamed: 0,Age,Pclass,Family_type
0,22.0,3,1
1,38.0,1,1
2,26.0,3,0
3,35.0,1,1
4,35.0,3,0
...,...,...,...
885,39.0,3,2
886,27.0,2,0
887,19.0,1,0
889,26.0,1,0


## Reviewing the Accuracy after Feature Construction

In [30]:
np.mean(cross_val_score(LogisticRegression(), x, y, scoring = 'accuracy', cv = 20))

0.7031746031746031

## FEATURE SPLITTING

In [32]:
df1 = pd.read_csv('train.csv')
df1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [36]:
 df1['Title'] = df1['Name'].str.split(', ', expand= True)[1].str.split('.', expand = True)[0]
df1['Title']

0        Mr
1       Mrs
2      Miss
3       Mrs
4        Mr
       ... 
886     Rev
887    Miss
888    Miss
889      Mr
890      Mr
Name: Title, Length: 891, dtype: object

In [37]:
df1

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,Rev
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Miss
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Miss
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Mr
