# Feature Extraction

**Import Libraries**

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.proportion import proportions_ztest

In [2]:
df = pd.read_csv('titanic.csv')

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Binary Feature

**Create new feature called new_cabin_bool**

In [9]:
df['new_cabin_bool'] = df['Cabin'].notnull().astype(int)

In [10]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,new_cabin_bool
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [11]:
df.groupby('new_cabin_bool').agg({'Survived': 'mean'})

Unnamed: 0_level_0,Survived
new_cabin_bool,Unnamed: 1_level_1
0,0.299854
1,0.666667


**Z Test<BR>
H0: There is no statistical difference between the new_cabin=0 and new_cabin=1**

In [25]:
test_stat, p_value = proportions_ztest(count=[df.loc[df['new_cabin_bool'] == 1, 'Survived'].sum(),
                                              df.loc[df['new_cabin_bool'] == 0, 'Survived'].sum()],
                                      nobs=[df.loc[df['new_cabin_bool'] == 1, 'Survived'].shape[0],
                                              df.loc[df['new_cabin_bool'] == 0, 'Survived'].shape[0]])
print('Test Stat = %.4f, p_value = %.4f' % (test_stat, p_value))

Test Stat = 9.4597, p_value = 0.0000


p value is less than 0.05 we reject the H0 hypothesis

**Create new feature called new_is_alone**

In [26]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,new_cabin_bool
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [27]:
df.loc[(df['SibSp'] + df['Parch']) > 0, 'new_is_alone'] = 'NO'
df.loc[(df['SibSp'] + df['Parch']) == 0, 'new_is_alone'] = 'YES'

In [28]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,new_cabin_bool,new_is_alone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,NO
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,NO
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,YES
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,NO
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,YES


In [29]:
df.groupby('new_is_alone').agg({'Survived': 'mean'})

Unnamed: 0_level_0,Survived
new_is_alone,Unnamed: 1_level_1
NO,0.50565
YES,0.303538


**Z Test<BR>
H0: There is no statistical difference between the new_is_alone='YES' and new_is_alone='NO'**

In [30]:
test_stat, p_value = proportions_ztest(count=[df.loc[df['new_is_alone'] == 'YES', 'Survived'].sum(),
                                              df.loc[df['new_is_alone'] == 'NO', 'Survived'].sum()],
                                      nobs=[df.loc[df['new_is_alone'] == 'YES', 'Survived'].shape[0],
                                              df.loc[df['new_is_alone'] == 'NO', 'Survived'].shape[0]])
print('Test Stat = %.4f, p_value = %.4f' % (test_stat, p_value))

Test Stat = -6.0704, p_value = 0.0000


p value is less than 0.05 we reject the H0 hypothesis

## Text Feature

In [31]:
df = pd.read_csv('titanic.csv')

In [32]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**Letter Count**

In [33]:
df['new_name_count'] = df['Name'].str.len()

In [34]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,new_name_count
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,44
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,24


**Word Count**

In [35]:
df['new_word_count'] = df['Name'].str.split().str.len()

In [36]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,new_name_count,new_word_count
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23,4
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51,7
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22,3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,44,7
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,24,4


**Catch the special word like dr**

In [38]:
df['new_name_dr'] = df['Name'].apply(lambda x: len([x for x in x.split() if x.startswith('Dr')]) )

In [39]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,new_name_count,new_word_count,new_name_dr
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23,4,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51,7,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22,3,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,44,7,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,24,4,0


In [42]:
df.groupby('new_name_dr').agg({'Survived': ['mean', 'count']})

Unnamed: 0_level_0,Survived,Survived
Unnamed: 0_level_1,mean,count
new_name_dr,Unnamed: 1_level_2,Unnamed: 2_level_2
0,0.38252,881
1,0.5,10


## Regex Feature

In [43]:
df['title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

In [44]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,new_name_count,new_word_count,new_name_dr,title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23,4,0,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51,7,0,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22,3,0,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,44,7,0,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,24,4,0,Mr


In [48]:
df[['title', 'Survived', 'Age']].groupby('title').agg({'Survived': ['mean'], 'Age': ['count', 'mean']})

Unnamed: 0_level_0,Survived,Age,Age
Unnamed: 0_level_1,mean,count,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Capt,0.0,1,70.0
Col,0.5,2,58.0
Countess,1.0,1,33.0
Don,0.0,1,40.0
Dr,0.428571,6,42.0
Jonkheer,0.0,1,38.0
Lady,1.0,1,48.0
Major,0.5,2,48.5
Master,0.575,36,4.574167
Miss,0.697802,146,21.773973


## Date Feature

In [95]:
df = pd.read_csv('course_reviews.csv')

In [96]:
df = df[['Rating', 'Timestamp']]
df.head()

Unnamed: 0,Rating,Timestamp
0,5.0,2021-02-05 07:45:55
1,5.0,2021-02-04 21:05:32
2,4.5,2021-02-04 20:34:03
3,5.0,2021-02-04 16:56:28
4,4.0,2021-02-04 15:00:24


In [97]:
df.dtypes

Rating       float64
Timestamp     object
dtype: object

In [98]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

In [99]:
df.head()

Unnamed: 0,Rating,Timestamp
0,5.0,2021-02-05 07:45:55
1,5.0,2021-02-04 21:05:32
2,4.5,2021-02-04 20:34:03
3,5.0,2021-02-04 16:56:28
4,4.0,2021-02-04 15:00:24


In [100]:
df['year'] = df['Timestamp'].dt.year

In [101]:
df['month'] = df['Timestamp'].dt.month

In [102]:
df['day'] = df['Timestamp'].dt.day

In [103]:
df['day_name'] = df['Timestamp'].dt.day_name()

In [104]:
df.head()

Unnamed: 0,Rating,Timestamp,year,month,day,day_name
0,5.0,2021-02-05 07:45:55,2021,2,5,Friday
1,5.0,2021-02-04 21:05:32,2021,2,4,Thursday
2,4.5,2021-02-04 20:34:03,2021,2,4,Thursday
3,5.0,2021-02-04 16:56:28,2021,2,4,Thursday
4,4.0,2021-02-04 15:00:24,2021,2,4,Thursday


## Feature Interaction

In [105]:
df = pd.read_csv('titanic.csv')

In [106]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [108]:
df.loc[(df['Sex'] == 'male') & (df['Age'] > 50), 'new_age_cat'] = 'senior_male'

In [109]:
df.loc[(df['Sex'] == 'male') & (df['Age'] < 25), 'new_age_cat'] = 'young_male'
df.loc[(df['Sex'] == 'female') & (df['Age'] < 25), 'new_age_cat'] = 'young_female'
df.loc[(df['Sex'] == 'female') & (df['Age'] > 50), 'new_age_cat'] = 'senior_female'

In [110]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,new_age_cat
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,young_male
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,


In [111]:
df.groupby('new_age_cat').agg({'Survived': 'mean'})

Unnamed: 0_level_0,Survived
new_age_cat,Unnamed: 1_level_1
senior_female,0.941176
senior_male,0.12766
young_female,0.726496
young_male,0.204969
