In [1]:
import pandas as pd

In [2]:
titanic = pd.read_csv("data/titanic.csv")
titanic["age"] = titanic["age"].replace(['?'], None).astype('float')
titanic["fare"] = titanic["fare"].replace(['?'], None).astype('float')

In [3]:
def years_to_days(yrs):
    return yrs*365

titanic["age"].apply(years_to_days)

0       10585.0000
1         334.5955
2         730.0000
3       10950.0000
4        9125.0000
           ...    
1304     5292.5000
1305     5292.5000
1306     9672.5000
1307     9855.0000
1308    10585.0000
Name: age, Length: 1309, dtype: float64

In [11]:
def get_age_group(age):
    if age < 2:
        return "infant"
    elif age < 12:
        return "child"
    elif age < 18:
        return "teen"
    elif age < 45:
        return "adult"
    else:
        return "senior"

In [12]:
titanic["age"].apply(get_age_group)

0        adult
1       infant
2        child
3        adult
4        adult
         ...  
1304      teen
1305      teen
1306     adult
1307     adult
1308     adult
Name: age, Length: 1309, dtype: object

In [13]:
titanic["age_group"] = titanic["age"].apply(get_age_group)

In [14]:
titanic

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,age_group
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO",adult
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON",infant
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON",child
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON",adult
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON",adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,?,C,?,328,?,teen
1305,3,0,"Zabour, Miss. Thamine",female,14.5000,1,0,2665,14.4542,?,C,?,?,?,teen
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,?,C,?,304,?,adult
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,?,C,?,?,?,adult


In [15]:
titanic.age_group.value_counts()

adult     902
senior    219
teen       88
child      75
infant     25
Name: age_group, dtype: int64

In [20]:
titanic.groupby([ "age_group","sex"]).survived.mean()

age_group  sex   
adult      female    0.716612
           male      0.178151
child      female    0.500000
           male      0.435897
infant     female    0.888889
           male      0.562500
senior     female    0.863014
           male      0.143836
teen       female    0.731707
           male      0.170213
Name: survived, dtype: float64

In [22]:
titanic["fare"].apply(lambda x: f"${x*24}")

0                   $5072.1
1       $3637.2000000000003
2       $3637.2000000000003
3       $3637.2000000000003
4       $3637.2000000000003
               ...         
1304              $346.9008
1305              $346.9008
1306    $173.39999999999998
1307    $173.39999999999998
1308                 $189.0
Name: fare, Length: 1309, dtype: object

In [23]:
def convert_currency(num, multiplier):
    return f"${num*multiplier}"

In [24]:
convert_currency(2,23)

'$46'

In [25]:
titanic["fare"].apply(convert_currency, args=(24,))

0                   $5072.1
1       $3637.2000000000003
2       $3637.2000000000003
3       $3637.2000000000003
4       $3637.2000000000003
               ...         
1304              $346.9008
1305              $346.9008
1306    $173.39999999999998
1307    $173.39999999999998
1308                 $189.0
Name: fare, Length: 1309, dtype: object

In [30]:
df = titanic[["pclass", "survived", "age", "fare"]]

In [31]:
df

Unnamed: 0,pclass,survived,age,fare
0,1,1,29.0000,211.3375
1,1,1,0.9167,151.5500
2,1,0,2.0000,151.5500
3,1,0,30.0000,151.5500
4,1,0,25.0000,151.5500
...,...,...,...,...
1304,3,0,14.5000,14.4542
1305,3,0,14.5000,14.4542
1306,3,0,26.5000,7.2250
1307,3,0,27.0000,7.2250


In [32]:
def get_range(s):
    return s.max() - s.min()

In [33]:
df.apply(get_range)

pclass        2.0000
survived      1.0000
age          79.8333
fare        512.3292
dtype: float64

In [34]:
df.apply(get_range, axis=1)

0       210.3375
1       150.6333
2       151.5500
3       151.5500
4       151.5500
          ...   
1304     14.5000
1305     14.5000
1306     26.5000
1307     27.0000
1308     29.0000
Length: 1309, dtype: float64

In [35]:
def get_fam_size(s):
    fam_size = s.sibsp + s.parch
    if fam_size == 0:
        return "solo"
    elif fam_size < 5:
        return "average"
    else:
        return "large"

In [37]:
titanic.apply(get_fam_size, axis=1)

0          solo
1       average
2       average
3       average
4       average
         ...   
1304    average
1305    average
1306       solo
1307       solo
1308       solo
Length: 1309, dtype: object

In [38]:
titanic["fam_size"] = titanic.apply(get_fam_size, axis=1)

In [40]:
titanic["fam_size"].value_counts()

solo       790
average    459
large       60
Name: fam_size, dtype: int64

In [42]:
titanic.groupby("fam_size").survived.mean()

fam_size
average    0.549020
large      0.150000
solo       0.302532
Name: survived, dtype: float64

In [43]:
titanic.groupby(["fam_size", "sex"]).survived.mean()

fam_size  sex   
average   female    0.771429
          male      0.294393
large     female    0.296296
          male      0.030303
solo      female    0.731959
          male      0.162752
Name: survived, dtype: float64

In [44]:
titanic["pclass"].map({1: "1st", 2: "2nd", 3: "3rd"})

0       1st
1       1st
2       1st
3       1st
4       1st
       ... 
1304    3rd
1305    3rd
1306    3rd
1307    3rd
1308    3rd
Name: pclass, Length: 1309, dtype: object

In [45]:
titanic["age"].map(lambda a: a < 18)

0       False
1        True
2        True
3       False
4       False
        ...  
1304     True
1305     True
1306    False
1307    False
1308    False
Name: age, Length: 1309, dtype: bool

In [46]:
titanic[["name", "sex", "age_group"]].applymap(str.upper)

Unnamed: 0,name,sex,age_group
0,"ALLEN, MISS. ELISABETH WALTON",FEMALE,ADULT
1,"ALLISON, MASTER. HUDSON TREVOR",MALE,INFANT
2,"ALLISON, MISS. HELEN LORAINE",FEMALE,CHILD
3,"ALLISON, MR. HUDSON JOSHUA CREIGHTON",MALE,ADULT
4,"ALLISON, MRS. HUDSON J C (BESSIE WALDO DANIELS)",FEMALE,ADULT
...,...,...,...
1304,"ZABOUR, MISS. HILENI",FEMALE,TEEN
1305,"ZABOUR, MISS. THAMINE",FEMALE,TEEN
1306,"ZAKARIAN, MR. MAPRIEDEDER",MALE,ADULT
1307,"ZAKARIAN, MR. ORTIN",MALE,ADULT


In [47]:
titanic[["name", "sex", "age_group"]].applymap(len)

Unnamed: 0,name,sex,age_group
0,29,6,5
1,30,4,6
2,28,6,5
3,36,4,5
4,47,6,5
...,...,...,...
1304,20,6,4
1305,21,6,4
1306,25,4,5
1307,19,4,5
