In [34]:
import pandas as pd

In [35]:
# Load the raw Titanic CSV rather than the KNIME-preprocessed copy, because
# the Python workflow below needs its own set of transformations.
titanic = pd.read_csv('titanic_data.csv')
# Show the (rows, columns) shape, then the first five rows, one per line.
print(titanic.shape, titanic.head(), sep='\n')

(1309, 14)
   pclass  survived                                             name     sex  \
0       1         1                    Allen, Miss. Elisabeth Walton  female   
1       1         1                   Allison, Master. Hudson Trevor    male   
2       1         0                     Allison, Miss. Helen Loraine  female   
3       1         0             Allison, Mr. Hudson Joshua Creighton    male   
4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   

     age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
0  29.00      0      0   24160  211.3375       B5        S    2    NaN   
1   0.92      1      2  113781  151.5500  C22 C26        S   11    NaN   
2   2.00      1      2  113781  151.5500  C22 C26        S  NaN    NaN   
3  30.00      1      2  113781  151.5500  C22 C26        S  NaN  135.0   
4  25.00      1      2  113781  151.5500  C22 C26        S  NaN    NaN   

                         home.dest  
0                     St L

In [36]:
# Replace every NaN in the frame with 0 so no column contains missing values.
# Note: text columns such as 'cabin' and 'boat' then hold the integer 0
# wherever a value was missing, alongside their original strings.
titanic=titanic.fillna(0)
titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,0.0,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11,0.0,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,0,0.0,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,0,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,0,0.0,"Montreal, PQ / Chesterville, ON"


In [37]:
# one quick trick is to print out the value cabin in the function... this could cause a problem for super large data,
# but can also help debug
def convertCabin(cabin):
    """Encode a cabin label as 1 if it contains the letter 'C', otherwise 2.

    The incoming value is echoed with ``print`` first as a debugging aid.
    ``cabin`` must support the ``in`` operator (e.g. be a string); a plain
    number such as 0 raises ``TypeError``.
    """
    print(cabin)
    return 1 if 'C' in cabin else 2

In [39]:
# Run the converter over every cabin value. This attempt stops with a
# TypeError at the first 0: that entry is a number, not a string, so the
# `in` test inside convertCabin cannot be applied to it.
test = titanic['cabin'].apply(convertCabin)
test.head()

B5
C22 C26
C22 C26
C22 C26
C22 C26
E12
D7
A36
C101
0


TypeError: argument of type 'int' is not iterable

In [40]:
# let's rewrite the function, but first force 'cabin' to always be a string
def convertCabin(cabin):
    """Encode a cabin value as 1 if its text contains 'C', otherwise 2.

    The value is converted with ``str`` before the check, so non-string
    entries (such as the integer 0) are handled instead of raising.
    """
    return 1 if 'C' in str(cabin) else 2

In [41]:
# Store the encoded cabin as a new 'cabin1' column; with the str-based
# converter the numeric 0 entries no longer raise.
titanic['cabin1'] = titanic['cabin'].apply(convertCabin)
titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,cabin1
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,0.0,"St Louis, MO",2
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11,0.0,"Montreal, PQ / Chesterville, ON",1
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,0,0.0,"Montreal, PQ / Chesterville, ON",1
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,0,135.0,"Montreal, PQ / Chesterville, ON",1
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,0,0.0,"Montreal, PQ / Chesterville, ON",1


In [42]:
# also let's convert gender to a number
def convertGender(gender):
    """Encode gender as a number: 1 when its text is exactly 'male', else 0.

    The value is passed through ``str`` first, so any input type is accepted.
    """
    return 1 if str(gender) == 'male' else 0

In [43]:
# Add a numeric 'gender' column derived from the text 'sex' column.
titanic['gender'] = titanic['sex'].apply(convertGender)

In [44]:
# Check the dtype of each column to confirm the converted ones are numeric.
# (Numeric means ints, floats, doubles; non-numeric is usually object or String.)
for column_name, column_dtype in titanic.dtypes.items():
    print(column_name, column_dtype)

pclass int64
survived int64
name object
sex object
age float64
sibsp int64
parch int64
ticket object
fare float64
cabin object
embarked object
boat object
body float64
home.dest object
cabin1 int64
gender int64


In [46]:
# Build the feature matrix from a few numeric columns. These particular
# columns are just one choice; another combination may give better results.
x=titanic[['fare','age','cabin1','gender']]
# the target ('y', the class vector) is the 'survived' column
y=titanic['survived']
x.head()

Unnamed: 0,fare,age,cabin1,gender
0,211.3375,29.0,2,0
1,151.55,0.92,1,1
2,151.55,2.0,1,0
3,151.55,30.0,1,1
4,151.55,25.0,1,0


In [47]:
# Convert x and y from pandas objects to NumPy arrays (via .values); from
# there we can do machine learning.
# NOTE(review): x and y are rebound to the arrays here, so this cell expects
# the pandas objects built in the feature-selection cell -- re-run that cell
# first if this one needs to be executed again.
x=x.values
y=y.values

In [49]:
type(y)

numpy.ndarray

In [18]:
y.shape

(1309,)

In [15]:
def whereDidTheyEmbark(embark):
    """Map an embarkation code to a number: 'S' -> 1, 'C' -> 2, 'Q' -> 3.

    Any other value, including non-string input, yields 0.
    """
    # Codes are compared in order; the 1-based position is the result.
    for code, port in enumerate(('S', 'C', 'Q'), start=1):
        if embark == port:
            return code
    return 0
    
    

In [84]:
whereDidTheyEmbark(45)

0