In [20]:
import csv
import numpy as np

filename = "data/train.csv"

with open(filename,'r') as csvfile:
    # Set the right quote character and delimiter
    csv_contents = [{k: v for k, v in row.items()}
        for row in csv.DictReader(csvfile, skipinitialspace=True, quotechar='"', delimiter=',')]

features = []
targets = []

for row in csv_contents:
    targets.append(row.pop('Survived', None))
    features.append(row)
    
filename = "data/test.csv"

with open(filename,'r') as csvfile:
    # Set the right quote character and delimiter
    csv_contents = [{k: v for k, v in row.items()}
        for row in csv.DictReader(csvfile, skipinitialspace=True, quotechar='"', delimiter=',')]

test_features = []

for row in csv_contents:
    test_features.append(row)
    

[{'PassengerId': '1', 'Pclass': '3', 'Sex': 'male', 'Age': '22', 'SibSp': '1', 'Parch': '0', 'Fare': '7.25', 'Cabin': '', 'Embarked': 'S'}, {'PassengerId': '2', 'Pclass': '1', 'Sex': 'female', 'Age': '38', 'SibSp': '1', 'Parch': '0', 'Fare': '71.2833', 'Cabin': 'C85', 'Embarked': 'C'}, {'PassengerId': '3', 'Pclass': '3', 'Sex': 'female', 'Age': '26', 'SibSp': '0', 'Parch': '0', 'Fare': '7.925', 'Cabin': '', 'Embarked': 'S'}, {'PassengerId': '4', 'Pclass': '1', 'Sex': 'female', 'Age': '35', 'SibSp': '1', 'Parch': '0', 'Fare': '53.1', 'Cabin': 'C123', 'Embarked': 'S'}, {'PassengerId': '5', 'Pclass': '3', 'Sex': 'male', 'Age': '35', 'SibSp': '0', 'Parch': '0', 'Fare': '8.05', 'Cabin': '', 'Embarked': 'S'}, {'PassengerId': '6', 'Pclass': '3', 'Sex': 'male', 'Age': '', 'SibSp': '0', 'Parch': '0', 'Fare': '8.4583', 'Cabin': '', 'Embarked': 'Q'}, {'PassengerId': '7', 'Pclass': '1', 'Sex': 'male', 'Age': '54', 'SibSp': '0', 'Parch': '0', 'Fare': '51.8625', 'Cabin': 'E46', 'Embarked': 'S'}, {'P

In [26]:
feature_to_index = dict()
index_to_feature = dict()

target_to_index = dict()
index_to_target = dict()

def map_features(features):
    index = 0
    for item in features:
        for k, v in item.items():
            if v.replace('.','',1).isdigit():
                if not k in feature_to_index:
                    feature_to_index[k] = index
                    index_to_feature[index] = k
                    index += 1
            else:
                unique = "=".join([k,v])
                if not unique in feature_to_index:
                    feature_to_index[unique] = index
                    index_to_feature[index] = unique
                    index += 1
                
def map_targets(targets):
    index = 0
    for item in targets:
        unique = item
        if not unique in target_to_index:
            target_to_index[unique] = index
            index_to_target[index] = unique
            index += 1
                
def transform_features(features):
    size = len(feature_to_index)
    result = []
    for item in features:
        feature = np.zeros(size)
        for k, v in item.items():
            if v.replace('.','',1).isdigit():
                feature[feature_to_index[k]] = v
            else:
                unique = "=".join([k,v])
                feature[feature_to_index[unique]] = 1
        result.append(feature.tolist())
    return result

def transform_targets(targets):
    result = []
    for item in targets:
        unique = item
        result.append(target_to_index[unique])
    return result

map_features(features+test_features)
map_targets(targets)

transformed_features = transform_features(features)
transformed_targets = transform_targets(targets)

In [22]:
from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

parameters = {'max_depth':list(range(1, 12)), 'min_samples_leaf':list(range(1, 12)), 'min_samples_split':list(range(2,12)),'criterion': ('entropy', 'gini')}
tree_classifier = tree.DecisionTreeClassifier()
kf = KFold(n_splits=10)
kf = StratifiedKFold(n_splits=10)
clf = GridSearchCV(tree_classifier, parameters, cv=kf, return_train_score=False)
clf.fit(transformed_features, transformed_targets)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
       error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 'criterion': ('entropy', 'gini')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=0)

In [23]:
print(clf.best_score_)
print(clf.best_params_)

0.8159371492704826
{'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 8}


In [24]:
import graphviz
dot_data = tree.export_graphviz(clf.best_estimator_, out_file=None, 
                         feature_names=index_to_feature,
                   class_names=index_to_target,
                               filled=True)  
graph = graphviz.Source(dot_data) 
graph.render(filename="tree") 

'tree.pdf'

In [25]:
from sklearn import naive_bayes
from sklearn.model_selection import cross_validate

clf_tree = clf.best_estimator_
cv_results_tree = cross_validate(clf_tree, transformed_features, transformed_targets, cv=kf, return_train_score=False)

clf_bayes = naive_bayes.GaussianNB()
cv_results_bayes = cross_validate(clf_bayes, transformed_features, transformed_targets, cv=kf, return_train_score=False)

print(cv_results_tree["test_score"])
print("Average accuracy: {}".format(sum(cv_results_tree["test_score"]) / float(len(cv_results_tree["test_score"]))))
print(cv_results_bayes["test_score"])
print("Average accuracy: {}".format(sum(cv_results_bayes["test_score"]) / float(len(cv_results_bayes["test_score"]))))

[0.73333333 0.77777778 0.73033708 0.85393258 0.85393258 0.83146067
 0.82022472 0.78651685 0.87640449 0.82954545]
Average accuracy: 0.8093465554420611
[0.63333333 0.62222222 0.65168539 0.65168539 0.65168539 0.6741573
 0.64044944 0.69662921 0.73033708 0.68181818]
Average accuracy: 0.6634002950856883


In [28]:

transformed_test_features = transform_features(test_features)

In [32]:
print(clf_tree.predict(transformed_test_features))
print(transformed_test_features)

[0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 1 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0
 1 0 1 0 1 0 1 1 0 0 0 1 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 1 1
 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 1 0
 1 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0
 1 0 1 0 0 1 0 0 1 1 1 1 1 1 1 0 0 1 1 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 1 1 1 1 0 0 1 0 1 0 1 0 1 0 1 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0 0 1 0 0
 1 0 0 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 1 1 0
 1 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 1 1 0 0 1 0 1 1 0 1 0 0 0 1 0
 0 1 1 0 1 1 1 0 1 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 0 1 1 0
 0 1 0 1 1 1 0 1 0 0 1]
[[892.0, 3.0, 1.0, 34.5, 0.0, 0.0, 7.8292, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0