In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesRegressor

In [3]:
print ('Reading Data...')
train_data = pd.read_csv('../input/train.csv',dtype = {'Age':np.float64},)
test_data = pd.read_csv('../input/test.csv',dtype={'Age':np.float64},)

Reading Data...


In [4]:
print ('Cleaning Data...')

combined2 = pd.concat([train_data,test_data],axis = 0)
combined2.Embarked.fillna('S',inplace = True)

combined2.Fare.fillna(np.median(combined2.Fare[combined2.Fare.notnull()]),inplace = True)

Cleaning Data...


In [5]:
import re
def get_title(name):
    #Use a regular expression to search for a title. Titles always consist of capital and lowercase 
    #letters , and end with a period
    title_search = re.search(' ([A-Za-z]+)\.',name)
    #if the title exsits, extract and return it
    if title_search:
        return title_search.group(1)
    return ""

combined2['Title'] = combined2['Name'].apply(get_title)
title_mapping = {'Mr':1,'Miss':2,'Mrs':3,'Master':4,'Dr':5,'Rev':6,'Major':7,'Col':7,'Mlle':8,
                'Mme':8,'Don':7,'Dona':10,'Lady':10,'Countess':10,'Jonkheer':10,'Sir':7,'Capt':7,'Ms':2}
combined2['TitleCat'] = combined2.loc[:,'Title'].map(title_mapping)

In [6]:
combined2['CabinCat']=pd.Categorical.from_array(combined2.Cabin.fillna('0').apply(lambda x:x[0])).codes
combined2.Cabin.fillna('0',inplace = True)

combined2['EmbarkedCat'] = pd.Categorical.from_array(combined2.Embarked).codes
combined2.drop(['Ticket'],axis = 1,inplace = True)

In [7]:
print ('Consolidating Data...')
full_data = pd.concat([combined2.drop(['Survived'],axis = 1),
                      pd.get_dummies(combined2.Sex,prefix = 'Sex'),
                      combined2.Survived],axis = 1)
#full_data.head()

Consolidating Data...


In [8]:
full_data['FamilySize'] = full_data['SibSp']+full_data['Parch']
full_data['NameLength'] = full_data.Name.apply(lambda x:len(x))

In [9]:

import operator
family_id_mapping = {}
def get_family_id(row):
    last_name = row['Name'].split(',')[0]
    family_id = '{0}{1}'.format(last_name,row['FamilySize'])
    
    if family_id not in family_id_mapping:
        if len(family_id_mapping) == 0:
            current_id = 1
        else:
            current_id = (max(family_id_mapping.items(),key = operator.itemgetter(1))[1]+1)
        family_id_mapping[family_id] = current_id
    return family_id_mapping[family_id]

family_ids = full_data.apply(get_family_id,axis = 1)
#There are a lot of family ids , so we 'll compress all of the families under 3 members into one code

family_ids[full_data['FamilySize']<3] = -1
full_data["FamilyId"] = family_ids         

In [10]:
###Person Label
child_age = 14
def get_person(passenger):
    age,sex = passenger
    if(age<child_age):
        return 'child'
    elif (sex == 'female'):
        return 'female_adult'
    else:
        return 'male_adult'

full_data = pd.concat([full_data,pd.DataFrame(full_data[['Age','Sex']].apply(get_person,axis = 1),columns=['person'])],axis = 1)
dummies = pd.get_dummies(full_data['person'])
full_data = pd.concat([full_data,dummies],axis = 1)

In [11]:
def process_surname(nm):
    return nm.split(',')[0].lower()

full_data['surname'] = full_data['Name'].apply(process_surname)

In [12]:
###Persihing Females
perishing_female_surnames = list(set(full_data[(full_data.female_adult==1.0) & (full_data.Survived == 0.0) & ((full_data.Parch>0) | (full_data.SibSp>0))]['surname'].values))

def perishing_mother_wife(passenger):
    surname,Pclass,person = passenger
    return 1.0 if (surname in perishing_female_surnames) else 0.0

full_data['perishing_mother_wife'] = full_data[['surname','Pclass','person']].apply(perishing_mother_wife,axis = 1)

###Surviving Males
surviving_male_surnames = list(set(full_data[(full_data.male_adult == 1.0) & 
                                             (full_data.Survived == 1.0) & 
                                            ((full_data.Parch > 0) | (full_data.SibSp>0))]['surname'].values))

def surviving_father_husband(passenger):
    surname,Pclass,person = passenger
    return 1.0 if (surname in surviving_male_surnames) else 0.0

full_data['surviving_father_husband'] = full_data[['surname','Pclass','person']].apply(surviving_father_husband,axis = 1)

full_data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,...,FamilySize,NameLength,FamilyId,person,child,female_adult,male_adult,surname,perishing_mother_wife,surviving_father_husband
0,22.0,0,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,...,1,23,-1,male_adult,0.0,0.0,1.0,braund,0.0,0.0
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,...,1,51,-1,female_adult,0.0,1.0,0.0,cumings,0.0,0.0
2,26.0,0,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,...,0,22,-1,female_adult,0.0,1.0,0.0,heikkinen,0.0,0.0
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,...,1,44,-1,female_adult,0.0,1.0,0.0,futrelle,0.0,0.0
4,35.0,0,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,...,0,24,-1,male_adult,0.0,0.0,1.0,allen,0.0,0.0


In [13]:
#Age Impute Location
print ('Imputing Data...')

classers = ['Fare','Parch','Pclass','SibSp','TitleCat','CabinCat','Sex_female','Sex_male','EmbarkedCat','FamilySize','NameLength','FamilyId']
age_et = ExtraTreesRegressor(n_estimators = 200)
X_train = full_data.loc[full_data.Age.notnull(),classers]
Y_train = full_data.loc[full_data.Age.notnull(),['Age']]
X_test = full_data.loc[full_data.Age.isnull(),classers]
age_et.fit(X_train,np.ravel(Y_train))
age_preds = age_et.predict(X_test)

full_data.loc[full_data.Age.isnull(),['Age']] =age_preds

Imputing Data...


In [14]:

###Model Build - Random Forest (Categorical Features)
model_dummys = ['Age','male_adult','female_adult','child','perishing_mother_wife','surviving_father_husband','Fare','Parch','Pclass','SibSp',
                 'TitleCat','CabinCat','Sex_female','Sex_male','EmbarkedCat','FamilySize','NameLength','FamilyId']

X_data = full_data.iloc[:891,:]
X_train_1 = X_data.loc[:,model_dummys]

Y_data = full_data.iloc[:891,:]
Y_train_1 = Y_data.loc[:,['Survived']]

#train_valid_split
#from sklearn.cross_validation import train_test_split
#X_train_s,X_valid,Y_train_s,Y_valid = train_test_split(X_train,Y_train,test_size = 0.2,random_state = 1)


In [15]:
X_train_1.head()

Unnamed: 0,Age,male_adult,female_adult,child,perishing_mother_wife,surviving_father_husband,Fare,Parch,Pclass,SibSp,TitleCat,CabinCat,Sex_female,Sex_male,EmbarkedCat,FamilySize,NameLength,FamilyId
0,22.0,1.0,0.0,0.0,0.0,0.0,7.25,0,3,1,1,0,0.0,1.0,2,1,23,-1
1,38.0,0.0,1.0,0.0,0.0,0.0,71.2833,0,1,1,3,3,1.0,0.0,0,1,51,-1
2,26.0,0.0,1.0,0.0,0.0,0.0,7.925,0,3,0,2,0,1.0,0.0,2,0,22,-1
3,35.0,0.0,1.0,0.0,0.0,0.0,53.1,0,1,1,3,3,1.0,0.0,2,1,44,-1
4,35.0,1.0,0.0,0.0,0.0,0.0,8.05,0,3,0,1,0,0.0,1.0,2,0,24,-1


In [None]:
from sklearn.model_selection import GridSearchCV,cross_val_score
print ('Building Model...')
parameters = {'criterion':('entropy','gini'),'max_features':('sqrt','log2',None)}
model_rf = RandomForestClassifier(n_estimators = 30000,min_samples_leaf = 4,class_weight = {0:0.745,1:0.255})
clf = GridSearchCV(estimator = model_rf,param_grid = parameters,scoring='accuracy',cv = 5,n_jobs = -1)

scores = cross_val_score(clf,X_train_1,np.ravel(Y_train_1),scoring = "accuracy",cv = 5)

print ("CV accuracy : %.3f +/- %.3f" % (np.mean(scores),np.std(scores)))

#dtype(X_train)
#X_train_1.describe()
#Y_train_1.describe()
#print len(Y_train)
#clf = clf.fit(X_train_1,Y_train_1['Survived'])

print (clf.best_score_)
print (clf.best_params_)
#model_rf.get_params()


Building Model...


In [58]:
#test
X_t_data = full_data.iloc[891:,:]
X_test = X_t_data.loc[:,model_dummys]

model_rf = RandomForestClassifier(n_estimators = 30000,min_samples_leaf = 4,class_weight = {0:0.745,1:0.255})
model_rf.fit(X_train,np.ravel(Y_train))

print('Generating Predictions...')

model_results = model_rf.predict(X_test)

print('Processing Submission File...')

model_results = [str(int(x)) for x in model_results]
submission = pd.DataFrame()
submission['PassengerId'] = X_t_data.PassengerId
submission['Survived'] = model_results
submission.set_index(['PassengerId'],inplace = True,drop = True)
submission.head(3)
submission.to_csv('titanic_submission_1.csv')

print ('Done.')

Building Model...
Generating Predictions...
Processing Submission File...
Done.


In [59]:
from sklearn.metrics import accuracy_score

Y_train_predict = model_rf.predict(X_train)
random_forest_score = accuracy_score(Y_train,Y_train_predict)

random_forest_score

0.90011223344556679

In [None]:
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
scalar.fit(X_train)
X_train_std = scalar.transform(X_train)
X_test_std = scalar.transform(X_test)

In [62]:
from sklearn.neural_network import MLPClassifier

mlp_classifier = MLPClassifier(solver = 'lbfgs',alpha = 1e-5,hidden_layer_sizes=(15,),random_state = 1)
mlp_classifier.fit(X_train_std,Y_train)

Y_train_predict = mlp_classifier.predict(X_train_std)
mlp_forest_score = accuracy_score(Y_train,Y_train_predict)

#Y_train_predict
mlp_forest_score

  y = column_or_1d(y, warn=True)


0.94500561167227837