# <b style="color:aqua">Feature Engineering</b>
1. Feature Transformation
   - Missing Value Imputation
   - Handling Categorical Features
   - Outlier Detection
   - Feature Scaling
2. Feature Construction
3. Feature Selection
4. Feature Extraction

1. **Feature Construction**
2. **Feature Splitting**
   - Tidy Data : Each row of each column have atomic value. \
     | Mr. Ankit | >--->>> | Mr. | Ankit |

## <b style="color:green"> Feature Construction</b>


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [3]:
df = pd.read_csv("../data/train.csv",)[['Age', 'Pclass', 'SibSp', 'Parch', 'Survived']]
df.sample(4)

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
156,16.0,3,0,0,1
500,17.0,3,0,0,0
883,28.0,2,0,0,0
207,26.0,3,0,0,1


In [4]:
df.shape

(891, 5)

In [5]:
df.isnull().sum()

Age         177
Pclass        0
SibSp         0
Parch         0
Survived      0
dtype: int64

In [6]:
df.dropna(inplace=True)
df.shape

(714, 5)

In [7]:
df.isnull().sum()

Age         0
Pclass      0
SibSp       0
Parch       0
Survived    0
dtype: int64

In [8]:
df.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [9]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch
0,22.0,3,1,0
1,38.0,1,1,0
2,26.0,3,0,0
3,35.0,1,1,0
4,35.0,3,0,0


In [10]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [11]:
score = np.mean(cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=20))
score*100

69.33333333333333

### **Applying Feature Construction**

In [12]:
# Create a new column using SibSp and Parch column
X['Family_size'] = X['SibSp'] + X['Parch'] + 1
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size
0,22.0,3,1,0,2
1,38.0,1,1,0,2
2,26.0,3,0,0,1
3,35.0,1,1,0,2
4,35.0,3,0,0,1


In [13]:
def myfunc(num):
    return (0 if num==1 else 1 if (num>1 and num<=4) else 2)

In [14]:
X['Family_type'] = X['Family_size'].apply(lambda num : 0 if num==1 else 1 if (num>1 and num<=4) else 2)
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size,Family_type
0,22.0,3,1,0,2,1
1,38.0,1,1,0,2,1
2,26.0,3,0,0,1,0
3,35.0,1,1,0,2,1
4,35.0,3,0,0,1,0


In [15]:
X.drop(columns=['SibSp', 'Parch', 'Family_size'], inplace=True)
X.head()

Unnamed: 0,Age,Pclass,Family_type
0,22.0,3,1
1,38.0,1,1
2,26.0,3,0
3,35.0,1,1
4,35.0,3,0


In [16]:
score = np.mean(cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=20))
score*100

70.03174603174602

### **Feature Splitting**

In [17]:
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [18]:
df['Title'] = df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [19]:
df[['Title', 'Name']]

Unnamed: 0,Title,Name
0,Mr,"Braund, Mr. Owen Harris"
1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,Miss,"Heikkinen, Miss. Laina"
3,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,Mr,"Allen, Mr. William Henry"
...,...,...
886,Rev,"Montvila, Rev. Juozas"
887,Miss,"Graham, Miss. Margaret Edith"
888,Miss,"Johnston, Miss. Catherine Helen ""Carrie"""
889,Mr,"Behr, Mr. Karl Howell"


In [31]:
df['Survived'].unique()

array([0, 1], dtype=int64)

In [33]:
df['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [34]:
df['Survived'] = pd.to_numeric(df['Survived'], errors='coerce')

In [38]:
(df.groupby('Title')['Survived'].mean()).sort_values(ascending=False)

Title
the Countess    1.000000
Mlle            1.000000
Sir             1.000000
Ms              1.000000
Lady            1.000000
Mme             1.000000
Mrs             0.792000
Miss            0.697802
Master          0.575000
Col             0.500000
Major           0.500000
Dr              0.428571
Mr              0.156673
Jonkheer        0.000000
Rev             0.000000
Don             0.000000
Capt            0.000000
Name: Survived, dtype: float64

In [41]:
df['Is_Married'] = 0
df['Is_Married'].loc[df['Title'] == 'Mrs'] == 1
df['Is_Married']

0      0
1      0
2      0
3      0
4      0
      ..
886    0
887    0
888    0
889    0
890    0
Name: Is_Married, Length: 891, dtype: int64

In [42]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Is_Married
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,0
