## Apply function 
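`apply` runs a function on the values of a pandas object: `Series.apply(func)` calls `func` once for each element, while `DataFrame.apply(func)` calls it once for each column (or once for each row with `axis=1`).

Note that this is different from `filter`, which subsets the rows or columns of a dataframe according to the labels in the specified index. `filter` does not look at a dataframe's contents; it is applied to the labels of the index, whereas `apply` works on the contents themselves.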

In [105]:
import pandas as pd
import numpy as np

In [106]:
# Read the training data from a CSV file (path relative to the working
# directory) into the DataFrame used by every cell below.
data = pd.read_csv('data/train.csv')

In [107]:
# Preview the first 4 rows.
data.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [108]:
# Series.apply calls the given function once per element, so passing the
# built-in len yields the number of characters in each passenger's name.
data["Name_length"] = data["Name"].apply(len)

In [109]:
# .loc slices by label and includes both endpoints, so 0:5 selects the rows
# labelled 0 through 5.
data.loc[0:5, ["Name", "Name_length"]]

Unnamed: 0,Name,Name_length
0,"Braund, Mr. Owen Harris",23
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51
2,"Heikkinen, Miss. Laina",22
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44
4,"Allen, Mr. William Henry",24
5,"Moran, Mr. James",16


In [110]:
# let's get the mean price on fare column
# NOTE(review): Series.apply calls np.mean on each fare individually, and the
# mean of a single number is that number, so Fare_mean ends up identical to
# Fare (see the output below) instead of holding the column-wide average.
# data.Fare.mean() would give the overall mean.
data["Fare_mean"] = data.Fare.apply(np.mean)

In [111]:
# Show Fare next to Fare_mean for the rows labelled 0 through 5.
data.loc[0:5, ["Fare", "Fare_mean"]]

Unnamed: 0,Fare,Fare_mean
0,7.25,7.25
1,71.2833,71.2833
2,7.925,7.925
3,53.1,53.1
4,8.05,8.05
5,8.4583,8.4583


In [112]:
# Try the extraction on the first name only: split it on '.', keep the piece
# before the first '.', then split that piece on ',' and keep what follows
# the comma.
data.Name.str.split('.')[0][0].split(',')[1]

' Mr'

In [113]:
# let's get the name prefix (title), like Mr, Mrs, Miss, Ms...
# Each name is split on '.'; the piece before the first '.' is then split on
# ',' and the text after the comma is kept.
# NOTE(review): the result keeps the space that follows the comma, so the
# stored values look like ' Mr' rather than 'Mr'.
data['prefix'] = (
    data['Name']
    .str.split('.')
    .apply(lambda pieces: pieces[0].split(',')[1])
)

In [114]:
# Compare each name with the prefix extracted from it.
data.loc[0:10, ['Name', 'prefix']]

Unnamed: 0,Name,prefix
0,"Braund, Mr. Owen Harris",Mr
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs
2,"Heikkinen, Miss. Laina",Miss
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs
4,"Allen, Mr. William Henry",Mr
5,"Moran, Mr. James",Mr
6,"McCarthy, Mr. Timothy J",Mr
7,"Palsson, Master. Gosta Leonard",Master
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",Mrs
9,"Nasser, Mrs. Nicholas (Adele Achem)",Mrs


In [115]:
# NOTE(review): none of the cells shown here creates a 'dummy_prefix' column,
# so this statement raises the KeyError recorded below.
del data['dummy_prefix']

KeyError: 'dummy_prefix'

In [117]:
# Preview the last rows, which now include the Name_length, Fare_mean and
# prefix columns.
data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_length,Fare_mean,prefix
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S,21,13.0,Rev
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S,28,30.0,Miss
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S,40,23.45,Miss
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C,21,30.0,Mr
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q,19,7.75,Mr


In [116]:
# let's get the unique prefixes
# (the array printed below shows that every value still starts with a space)
data['prefix'].unique()

array([' Mr', ' Mrs', ' Miss', ' Master', ' Don', ' Rev', ' Dr', ' Mme',
       ' Ms', ' Major', ' Lady', ' Sir', ' Mlle', ' Col', ' Capt',
       ' the Countess', ' Jonkheer'], dtype=object)

In [118]:
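# let's use the apply function to combine the prefixes into two groups:
# male = Mr, Master, Don, Rev, Dr, Sir, Col, Capt == 0
# female = Mrs, Miss, Mme, Ms, Lady, Mlle, the Countess, Jonkheer == 1
# NOTE(review): 'Major' appears in the unique prefixes but is in neither group,
# and 'Jonkheer' is normally a male title; confirm the mapping before using it.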

In [119]:
# Group the rows by their prefix value; dummy_pre is the resulting GroupBy
# object.
dummy_pre = data.groupby('prefix')

In [120]:
#list(data.groupby('prefix'))

In [121]:
# Count the non-missing entries in each column for every prefix group.
dummy_pre.count()

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_length,Fare_mean
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Capt,1,1,1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Col,2,2,2,2,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0
Don,1,1,1,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
Dr,7,7,7,7,7.0,6.0,7.0,7.0,7.0,7.0,3.0,7.0,7.0,7.0
Jonkheer,1,1,1,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
Lady,1,1,1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Major,2,2,2,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
Master,40,40,40,40,40.0,36.0,40.0,40.0,40.0,40.0,7.0,40.0,40.0,40.0
Miss,182,182,182,182,182.0,146.0,182.0,182.0,182.0,182.0,47.0,181.0,182.0,182.0
Mlle,2,2,2,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [122]:
# Bind the prefix column to get_dummy.
# NOTE(review): despite the name, this does not create dummy variables, and
# the cells shown here never use get_dummy afterwards.
get_dummy = data.prefix

In [126]:
# One-hot encode the prefix column: one indicator column per distinct prefix.
# The result is only displayed; it is not stored back into data.
pd.get_dummies(data['prefix'])

Unnamed: 0,Capt,Col,Don,Dr,Jonkheer,Lady,Major,Master,Miss,Mlle,Mme,Mr,Mrs,Ms,Rev,Sir,the Countess
0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [125]:
# Preview the first rows together with the columns added above.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_length,Fare_mean,prefix
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23,7.25,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51,71.2833,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22,7.925,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,44,53.1,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,24,8.05,Mr
