# Read data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load each split and tag every row with the split it came from, so the two
# frames can later be combined and separated again via the IsTest flag.
train_data_orig = pd.read_csv('../data/titanic/train.csv').assign(IsTest=False)
print(len(train_data_orig))

test_data_orig = pd.read_csv('../data/titanic/test.csv').assign(IsTest=True)
print(len(test_data_orig))

891
418


In [3]:
# Stack train and test so that every transformation is applied to both at
# once; PassengerId becomes the row index of the combined frame.
data = pd.concat([train_data_orig, test_data_orig]).set_index('PassengerId')
print(len(data))

1309


In [4]:
# Inspect the combined train + test frame (test rows have no Survived value)
data

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,IsTest
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,False
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,False
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,False
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,False
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,True
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,True
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,True
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,True


In [5]:
# Discard the columns that are not used as model features
data = data.drop(columns=['Name', 'SibSp', 'Parch', 'Fare', 'Cabin'])

In [6]:
# Bucket ages into decades and replace the numeric Age column with the bucket.
#
# right=False makes every bin closed on the left and open on the right
# ([0, 10), [10, 20), ...), which is what the labels promise: an age of
# exactly 10 belongs to '10-19', not '0-9'. With pandas' default
# (right=True) the bins would be (0, 10], (10, 20], ... and every age that
# is an exact multiple of 10 would receive the label of the decade below.
# Missing ages stay NaN.
data['AgeClass'] = pd.cut(
    data.Age,
    [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    labels=['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89',  '90-99'],
    right=False,
)
data = data.drop(['Age'], axis=1)

In [7]:
# Check the frame after dropping columns and replacing Age with AgeClass
data

Unnamed: 0_level_0,Survived,Pclass,Sex,Ticket,Embarked,IsTest,AgeClass
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.0,3,male,A/5 21171,S,False,20-29
2,1.0,1,female,PC 17599,C,False,30-39
3,1.0,3,female,STON/O2. 3101282,S,False,20-29
4,1.0,1,female,113803,S,False,30-39
5,0.0,3,male,373450,S,False,30-39
...,...,...,...,...,...,...,...
1305,,3,male,A.5. 3236,S,True,
1306,,1,female,PC 17758,C,True,30-39
1307,,3,male,SOTON/O.Q. 3101262,S,True,30-39
1308,,3,male,359309,S,True,


In [8]:
# Source column -> prefix used for the one-hot columns generated from it
mapping = dict(
    Pclass='class',
    Sex='sex',
    Ticket='ticket',
    Embarked='from',
    AgeClass='age',
)

In [9]:
#pd.options.display.max_columns = 999
# One-hot encode every column named in `mapping`; the generated columns are
# prefixed with the corresponding mapped value (e.g. Pclass -> class_1).
data_enc = pd.get_dummies(data, columns=list(mapping), prefix=mapping)
data_enc

Unnamed: 0_level_0,Survived,IsTest,class_1,class_2,class_3,sex_female,sex_male,ticket_110152,ticket_110413,ticket_110465,...,age_0-9,age_10-19,age_20-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,age_80-89,age_90-99
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,False,0,0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1.0,False,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1.0,False,0,0,1,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1.0,False,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0.0,False,0,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,True,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1306,,True,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1307,,True,0,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1308,,True,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Training rows are the ones not flagged as test; the flag itself is not a feature
train_df = data_enc.loc[~data_enc['IsTest']].drop(columns=['IsTest'])

In [11]:
# Preview the encoded training rows (Survived is still present here)
train_df

Unnamed: 0_level_0,Survived,class_1,class_2,class_3,sex_female,sex_male,ticket_110152,ticket_110413,ticket_110465,ticket_110469,...,age_0-9,age_10-19,age_20-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,age_80-89,age_90-99
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1.0,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1.0,0,0,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1.0,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0.0,0,0,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0.0,0,1,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
888,1.0,1,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
889,0.0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
890,1.0,1,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [12]:
# Test rows: drop the split flag and the Survived column, leaving only features
test_df = data_enc.loc[data_enc['IsTest']].drop(columns=['IsTest', 'Survived'])

In [13]:
# Preview the encoded test rows (feature columns only)
test_df

Unnamed: 0_level_0,class_1,class_2,class_3,sex_female,sex_male,ticket_110152,ticket_110413,ticket_110465,ticket_110469,ticket_110489,...,age_0-9,age_10-19,age_20-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,age_80-89,age_90-99
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
893,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
894,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
895,0,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
896,0,0,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1306,1,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1307,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1308,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Summarise row count, column count, dtypes and memory use of the training frame
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Columns: 948 entries, Survived to age_90-99
dtypes: float64(1), uint8(947)
memory usage: 837.9 KB


In [15]:
from sklearn import svm

In [16]:
# Support-vector classifier created with no arguments, i.e. library defaults
clf = svm.SVC()

In [19]:
# Feature matrix: every training column except the label
X = train_df.drop(columns=['Survived'])
X

Unnamed: 0_level_0,class_1,class_2,class_3,sex_female,sex_male,ticket_110152,ticket_110413,ticket_110465,ticket_110469,ticket_110489,...,age_0-9,age_10-19,age_20-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,age_80-89,age_90-99
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
888,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
889,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
890,1,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [20]:
# Label vector aligned with X by PassengerId
y = train_df['Survived']
y

PassengerId
1      0.0
2      1.0
3      1.0
4      1.0
5      0.0
      ... 
887    0.0
888    1.0
889    0.0
890    1.0
891    0.0
Name: Survived, Length: 891, dtype: float64

In [23]:
# Train the classifier on the one-hot features X against the Survived labels y
clf.fit(X, y)

SVC()

In [41]:
# Submission table: one row per test passenger, pairing the PassengerId taken
# from the index with the classifier's prediction cast to an integer.
res = pd.DataFrame({
    'PassengerId': test_df.index,
    'Survived': clf.predict(test_df),
}).astype({'Survived': 'int'})
res

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [42]:
# Write the predictions to CSV without the positional index column
res.to_csv('titanic_out.csv', index=False)

In [22]:
# NOTE(review): interactive IPython help lookup left over from development;
# it is not needed to reproduce the results and can be removed before sharing.
?svm.SVC

[0;31mInit signature:[0m
[0msvm[0m[0;34m.[0m[0mSVC[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mC[0m[0;34m=[0m[0;36m1.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkernel[0m[0;34m=[0m[0;34m'rbf'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdegree[0m[0;34m=[0m[0;36m3[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgamma[0m[0;34m=[0m[0;34m'scale'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcoef0[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshrinking[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprobability[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtol[0m[0;34m=[0m[0;36m0.001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcache_size[0m[0;34m=[0m[0;36m200[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mclass_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mFalse[

In [29]:
# Inspect the PassengerId index of the test rows
test_df.index

Int64Index([ 892,  893,  894,  895,  896,  897,  898,  899,  900,  901,
            ...
            1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309],
           dtype='int64', name='PassengerId', length=418)

In [31]:
# NOTE(review): interactive IPython help lookup left over from development;
# it is not needed to reproduce the results and can be removed before sharing.
?pd.DataFrame

[0;31mInit signature:[0m
[0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdata[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mCollection[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mCollection[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mForwardRef[0m[0;34m([0m[0;34m'ExtensionDtype'[0m[0;34m)[0m[0;34m,[0m [0mstr[0m[0;34m,[0m [0mnumpy[0m[0;34m.[0m[0mdtype[0m[0;34m,[0m [0mType[0m[0;34m[[0m[0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mfloat[0m[0;34m,[0m [0mint[0m[0;34m,[0m [0mcomplex[0m[0;34m,[0m [0mbool[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNo