import all packages here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

load the titanic3.csv

In [2]:
tic = pd.read_csv('titanic3.csv')

tic.head(5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


look at the info

In [3]:
tic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1310 entries, 0 to 1309
Data columns (total 14 columns):
pclass       1309 non-null float64
survived     1309 non-null float64
name         1309 non-null object
sex          1309 non-null object
age          1046 non-null float64
sibsp        1309 non-null float64
parch        1309 non-null float64
ticket       1309 non-null object
fare         1308 non-null float64
cabin        295 non-null object
embarked     1307 non-null object
boat         486 non-null object
body         121 non-null float64
home.dest    745 non-null object
dtypes: float64(7), object(7)
memory usage: 143.4+ KB


convert data types of required columns

In [4]:
tic.survived = tic.survived.astype('category')
tic.sex = tic.sex.astype('category')
tic.pclass = tic.pclass.astype('category')
tic.embarked = tic.embarked.astype('category')

look for missing values

In [5]:
tic.isnull().sum()

pclass          1
survived        1
name            1
sex             1
age           264
sibsp           1
parch           1
ticket          1
fare            2
cabin        1015
embarked        3
boat          824
body         1189
home.dest     565
dtype: int64

drop columns not relevant 

In [6]:
tic.drop(['home.dest', 'body', 'boat', 'cabin'], axis = 1, inplace = True)

tic.head(4)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,S
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,S
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,S
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,S


look for missing values and replace them

drop missing in pclass

In [7]:
tic.pclass.value_counts(dropna = False)

 3.0    709
 1.0    323
 2.0    277
NaN       1
Name: pclass, dtype: int64

In [8]:
tic[tic['pclass'].isnull()]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked
1309,,,,,,,,,,


In [9]:
tic[1309:]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked
1309,,,,,,,,,,


In [10]:
tic.drop(tic.index[1309:], inplace = True)
tic.tail(4)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked
1305,3.0,0.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,C
1306,3.0,0.0,"Zakarian, Mr. Mapriededer",male,26.5,0.0,0.0,2656,7.225,C
1307,3.0,0.0,"Zakarian, Mr. Ortin",male,27.0,0.0,0.0,2670,7.225,C
1308,3.0,0.0,"Zimmerman, Mr. Leo",male,29.0,0.0,0.0,315082,7.875,S


replace missing in age and fare

In [11]:
# age has too many missing values, replace with mean
print(tic.age.mean())

tic.age = tic.age.fillna(tic.age.mean())

29.8811345124283


In [12]:
# same for fare
tic.fare = tic.fare.fillna(tic.fare.mean())

Replace missing values in embarked with most popular

In [13]:
tic.embarked.value_counts(dropna = False)

S      914
C      270
Q      123
NaN      2
Name: embarked, dtype: int64

In [14]:
tic.embarked = tic.embarked.fillna('S')

In [15]:
tic.isnull().sum()

pclass      0
survived    0
name        0
sex         0
age         0
sibsp       0
parch       0
ticket      0
fare        0
embarked    0
dtype: int64

Extract the titles from name and store in title column

In [16]:
tic['title'] = tic.name.str.extract('([A-Za-z]+)\.', expand = False)

In [17]:
tic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked,title
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,S,Miss
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,S,Master
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,S,Miss
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,S,Mr
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,S,Mrs


In [18]:
pd.crosstab(tic.title, tic.sex)#, rownames = ['x'], colnames = ['y'])

sex,female,male
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,4
Countess,1,0
Don,0,1
Dona,1,0
Dr,1,7
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,61


Look for the female doctor

In [19]:
#tic[tic['title'] == 'Dr' & tic['sex'] == 'female']

tic[np.logical_and(tic.title == 'Dr', tic.sex == 'female')]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked,title
181,1.0,1.0,"Leader, Dr. Alice (Farnham)",female,49.0,0.0,0.0,17465,25.9292,S,Dr


Create “gender_name” variable

In [47]:
tic['gender_name'] = tic['title'].replace(['Capt', 'Col', 'Don', 'Dr',
        'Jonkheer', 'Major', 'Mr', 'Rev', 'Sir'], 'Mr', regex = True)

tic['gender_name'] = tic['gender_name'].replace(['Dona', 'Lady', 'Mme', 'Mrs', 'Countess'], 'Mrs', regex = True)

tic['gender_name'] = tic['gender_name'].replace(['Miss', 'Mlle', 'Ms'], 'Miss', regex = True)

tic['gender_name'] = tic['gender_name'].replace('Master', 'Mast', regex = True)

In [46]:
#def replace_all(text, dic):
 #   for i, j in dic.iteritems():
   #     text = text.replace(i, j)
  #  return text

Change female doctor to Mrs

In [53]:
tic.gender_name[181] = 'Mrs'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [55]:
pd.crosstab(tic.sex, tic.gender_name)

gender_name,Mast,Miss,Mr,Mrs
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0,264,0,202
male,61,0,782,0


Convert gender name to factor

In [56]:
tic.gender_name = tic.gender_name.astype('category')

Are those who pay less than the average for a ticket less likely to survive? Mean fare for each pclass

In [61]:
class1 = tic[tic.pclass == 1]
class2 = tic[tic.pclass == 2]
class3 = tic[tic.pclass == 3]

In [68]:
fare1 = class1.fare.mean()
fare2 = class2.fare.mean()
fare3 = class3.fare.mean()

In [71]:
print(fare1, fare2, fare3)

87.50899164086687 21.1791963898917 13.331086994755056


#Create fare_avg column

In [None]:
tic.fare_avg = tic.pclass

In [130]:
tic.fare_avg = tic.fare_avg.astype(float)

In [141]:
tic.fare_avg[tic.fare_avg == 1] = fare1
tic.fare_avg[tic.fare_avg == 2] = fare2
tic.fare_avg[tic.fare_avg == 3] = fare3

In [144]:
tic.fare_avg.value_counts()

13.331087    709
87.508992    323
21.179196    277
Name: pclass, dtype: int64

Create fare-distance metric for titanic fare-distance = fare - mean(fare of pclass)

In [136]:
tic['fare_distance'] = tic.fare - tic.fare_avg

Add family column

In [97]:
tic['family'] = np.nan