## Import Packages

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

**Load dataset**

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
data = pd.concat([train, test], sort=False)

## Pre-processing

In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Information about features
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    1309 non-null int64
Survived       891 non-null float64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1046 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1308 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


In [5]:
# stattistics desci
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


### Data Types

In [6]:
data.select_dtypes(include = 'object').head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


In [7]:
data.select_dtypes(include = ['int64','float64']).head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0.0,3,22.0,1,0,7.25
1,2,1.0,1,38.0,1,0,71.2833
2,3,1.0,3,26.0,0,0,7.925
3,4,1.0,1,35.0,1,0,53.1
4,5,0.0,3,35.0,0,0,8.05


### Missing values

In [8]:
data.select_dtypes(include = 'object').isnull().sum()[data.select_dtypes(include = 'object').isnull().sum()>0]

Cabin       1014
Embarked       2
dtype: int64

In [9]:
data.select_dtypes(include = ['int64','float64']).isnull().sum()[data.select_dtypes(include = ['int64','float64']).isnull().sum()>0]

Survived    418
Age         263
Fare          1
dtype: int64

### Imputation

**Fare**

In [10]:
train.Fare = train.Fare.fillna(train.Fare.mean())
test.Fare = test.Fare.fillna(test.Fare.mean())

**Embarked**

In [11]:
train.Embarked = train.Embarked.fillna(train.Embarked.mode()[0])
test.Embarked = test.Embarked.fillna(test.Embarked.mode()[0])

**Cabin**

In [12]:
train["Cabin"].fillna("None", inplace = True) 
test["Cabin"].fillna("None", inplace = True)

**Age**

In [13]:
train['title']=train.Name.apply(lambda x: x.split('.')[0].split(',')[1].strip())
test['title']=test.Name.apply(lambda x: x.split('.')[0].split(',')[1].strip())

In [14]:
train.title.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [15]:
newTitles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royalty",
    "Don":        "Royalty",
    "Sir" :       "Royalty",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royalty",
    "Dona":       "Royalty",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royalty"}

In [16]:
train['title'] = train.title.map(newTitles)
test['title'] = test.title.map(newTitles)

In [17]:
train.groupby(['title','Sex']).Age.mean()

title    Sex   
Master   male       4.574167
Miss     female    21.804054
Mr       male      32.368090
Mrs      female    35.718182
Officer  female    49.000000
         male      46.562500
Royalty  female    40.500000
         male      42.333333
Name: Age, dtype: float64

In [18]:
# x = train[train.Age.isnull()][["title","Sex","Age"]]
# y = test[test.Age.isnull()][["title","Sex","Age"]]

In [19]:
# x.drop_duplicates(inplace=True) 
# y.drop_duplicates(inplace=True) 

In [20]:
def newAge (cols):
    title=cols[0]
    Sex=cols[1]
    Age=cols[2]
    if pd.isnull(Age):
        if title=='Master' and Sex=="male":
            return 4.57
        elif title=='Miss' and Sex=='female':
            return 21.8
        elif title=='Mr' and Sex=='male': 
            return 32.37
        elif title=='Mrs' and Sex=='female':
            return 35.72
        elif title=='Officer' and Sex=='female':
            return 49
        elif title=='Officer' and Sex=='male':
            return 46.56
        elif title=='Royalty' and Sex=='female':
            return 40.50
        else:
            return 42.33
    else:
        return Age

In [21]:
train.Age = train[['title','Sex','Age']].apply(newAge, axis=1)
test.Age = test[['title','Sex','Age']].apply(newAge, axis=1)

In [22]:
train.drop(['PassengerId','Name','Ticket','Ticket'],axis=1,inplace=True)
test.drop(['PassengerId','Name','Ticket','Ticket'],axis=1,inplace=True)

In [23]:
Total_df = pd.concat([train, test], sort=False)
Total_df = pd.get_dummies(Total_df)
train = Total_df[:len(train)]
test = Total_df[len(train):]

In [24]:
# convert target feature into int
train.Survived=train.Survived.astype('int64')
train.Survived.dtype # check dtype

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


dtype('int64')

In [25]:
xtrain=train.drop("Survived",axis=1) # drop Srvived feature
ytrain=train['Survived']

In [26]:
# drop Srvived feature
xtest = test.drop("Survived",axis=1)

## Model

**RandomForestClassifier**

In [27]:
#import packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score

# model object
RF = RandomForestClassifier(random_state=1)

#parameter for RF
PRF = [{'n_estimators':[10,100],'max_depth':[3,6],'criterion':['gini','entropy']}]

#parameter tunning by GridSearchCV
GSRF = GridSearchCV(estimator=RF, param_grid=PRF, scoring='accuracy',cv=2)

#score with cross validation 5
rf_scores = cross_val_score(GSRF,xtrain,ytrain,scoring='accuracy',cv=5)

In [28]:
#mean of scores
np.mean(rf_scores) 

0.8192873078711266

**SVC from support vector machine**

In [29]:
#import packages
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

## model object and Construct a Pipeline from the given estimators
svc = make_pipeline(StandardScaler(),SVC(random_state=1))

#parameter for svc
r=[0.0001,0.001,0.1,1,10,50,100]
PSVM=[{'svc__C':r, 'svc__kernel':['linear']},
      {'svc__C':r, 'svc__gamma':r, 'svc__kernel':['rbf']}]

#parameter tunning by GridSearchCV
GSSVM=GridSearchCV(estimator=svc, param_grid=PSVM, scoring='accuracy', cv=2)

#score with cross validation 5
svm_scores=cross_val_score(GSSVM, xtrain.astype(float), ytrain,scoring='accuracy', cv=5)

In [30]:
#mean of scores
np.mean(svm_scores) 

0.8204865828518253

In [31]:
#survived prediction for submission
model = GSSVM.fit(xtrain, ytrain)
pred = model.predict(xtest)

In [32]:
# for PassengerId
test2 = pd.read_csv("test.csv")

In [33]:
# save submission.csv with PassengerId and Survived
outpu t= pd.DataFrame({'PassengerId':test2['PassengerId'],'Survived':pred})
output.to_csv('submission.csv', index=False)