In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [3]:
# Load the raw competition files from the working directory.
train_import = pd.read_csv('./train.csv')
test_import = pd.read_csv('./test.csv')

# Empty placeholders for the model-ready matrices; both names are reassigned
# once the feature columns have been built.
train = np.zeros(shape=(0, 0))
test = np.zeros(shape=(0, 0))

In [4]:
# Keep both frames in one list so every cleaning step can be applied to each in a loop.
# The order is relied on later: index 0 is the test frame, index 1 is the training frame.
data_clean = [test_import, train_import]

In [5]:
# For each frame, print a per-column flag telling whether any value is missing.
for frame in data_clean:
    print(frame.isna().any())

PassengerId    False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin           True
Embarked       False
dtype: bool
PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool


In [6]:
for dataset in data_clean:
    # The frames are missing values in Age, Fare, Cabin and Embarked.
    #
    # Each filled column is assigned back to the frame. Calling
    # fillna(inplace=True) on a selected column is chained assignment: pandas
    # warns about it, and with copy-on-write enabled it leaves the frame unchanged.

    # Numeric columns: fill with the median of the frame being cleaned.
    for column in ('Age', 'Fare'):
        dataset[column] = dataset[column].fillna(dataset[column].median())

    # Categorical columns: fill with the most common value of the frame being cleaned.
    for column in ('Embarked', 'Cabin'):
        dataset[column] = dataset[column].fillna(dataset[column].mode()[0])

In [7]:
# PassengerId is only a row index, and Cabin and Ticket are considered irrelevant
# next to the features that are kept (such as Fare), so all three are removed.
unused_columns = ['PassengerId', 'Cabin', 'Ticket']

for dataset in data_clean:
    dataset.drop(columns=unused_columns, inplace=True)

In [9]:
for dataset in data_clean:
    # FamilySize is SibSp + Parch plus one (presumably for the passenger themself).
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # IsAlone starts at 1 for every row and is cleared to 0 where FamilySize > 1.
    # The write goes through a single dataset.loc[rows, column] call: assigning via
    # dataset['IsAlone'].loc[...] is chained indexing, which may modify a temporary
    # copy instead of the frame.
    dataset['IsAlone'] = 1
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0

    # Title is the part of Name after the first ", " and before the next ".".
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

    # Group uncommon titles: any title occurring fewer than 10 times in this
    # frame is replaced by 'Misc'. title_names maps title -> True when it is rare.
    title_names = (dataset['Title'].value_counts() < 10)
    dataset['Title'] = dataset['Title'].apply(lambda title: 'Misc' if title_names.loc[title] else title)

    # Bucket Fare and Age into 4 quantile bins each. The bin edges are computed
    # from the frame being processed, so they can differ between the two frames.
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)
    dataset['AgeBin'] = pd.qcut(dataset['Age'], 4)

In [10]:
label = LabelEncoder()

# Add integer-coded versions of the categorical columns. The encoder is refit for
# every column of every frame, so a code is only meaningful within its own frame.
# None of these *_Code columns are part of the feature lists below.
for dataset in data_clean:
    dataset['Sex_Code'] = label.fit_transform(dataset['Sex'])
    dataset['Embarked_Code'] = label.fit_transform(dataset['Embarked'])
    dataset['Title_Code'] = label.fit_transform(dataset['Title'])
    dataset['AgeBin_Code'] = label.fit_transform(dataset['AgeBin'])
    dataset['FareBin_Code'] = label.fit_transform(dataset['FareBin'])

# Feature selection and one-hot encoding sit outside the loop so they run once,
# after both frames have been processed, rather than on every iteration.
featuresTest = ['Sex','Pclass', 'Embarked', 'Title','SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone']
featuresTrain = ['Survived'] + featuresTest

# data_clean[0] is the test frame and data_clean[1] is the training frame.
test = pd.get_dummies(data_clean[0][featuresTest])
train = pd.get_dummies(data_clean[1][featuresTrain])

# get_dummies builds its columns from the categories present in each frame, so the
# two matrices are not guaranteed to have the same layout. Give the test matrix
# exactly the training feature columns, in the same order, using 0 for any dummy
# column that does not occur in the test data.
train_feature_columns = [column for column in train.columns if column != 'Survived']
test = test.reindex(columns=train_feature_columns, fill_value=0)

In [11]:
# Separate the target column from the predictor columns.
Y_train = train['Survived']
X_train = train.drop(columns='Survived')

In [12]:
# Standardise the features, then classify with an SVM.
svc = make_pipeline(StandardScaler(), SVC(random_state=1))

# Candidate values searched for both C and gamma.
r = [0.0001, 0.001, 0.1, 1, 10, 50, 100]

# Two sub-grids: a linear kernel tuned over C, and an RBF kernel tuned over C and gamma.
linear_grid = {'svc__C': r, 'svc__kernel': ['linear']}
rbf_grid = {'svc__C': r, 'svc__gamma': r, 'svc__kernel': ['rbf']}
PSVM = [linear_grid, rbf_grid]

GSSVM = GridSearchCV(
    estimator=svc,
    param_grid=PSVM,
    scoring='accuracy',
    cv=2,
)

# Nested cross-validation: the 2-fold grid search is run inside each of 5 outer
# folds, and the mean outer-fold accuracy is printed.
scores_svm = cross_val_score(
    GSSVM,
    X_train.astype(float),
    Y_train,
    scoring='accuracy',
    cv=5,
)
print(np.mean(scores_svm))

0.8215849991506451


In [14]:
# Read the test file again to recover PassengerId, which is no longer present
# in the working frames.
fresh_import = pd.read_csv('./test.csv')

# Run the grid search on the full training set and predict for the test rows.
pred = GSSVM.fit(X_train, Y_train).predict(test)

# Write one row per test passenger: its id and the predicted Survived value.
submission_columns = {'PassengerId': fresh_import['PassengerId'], 'Survived': pred}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)