In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from copy import deepcopy
from sklearn import tree
from anytree import Node, RenderTree
import random
import math

In [30]:
titanic = pd.read_csv('titanic/combites.csv')

In [31]:
print(titanic.head())

   PassengerId  Survived  Pclass  \
0            1       0.0       3   
1            2       1.0       1   
2            3       1.0       3   
3            4       1.0       1   
4            5       0.0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked Dataset  
0      0         A/5 21171   7.2500   NaN        S   train  
1      0          PC 17599  71.2833   C85        C   train  
2      0  STON/O2. 3101282   7.9250   NaN        S   train  
3      0            113803  53.1000  C123        S   train  
4      0       

In [32]:
def categorize_company(df):
    
    # 0 if alone, 1 if only with siblings/spouse, 2 if only with parent/children, 3 if with both (number of people is disregarded)
    conditions = [(df.Parch + df.SibSp == 0), (df.Parch == 0) & (df.SibSp > 0), (df.Parch > 0) & (df.SibSp ==0),
                  (df.Parch > 0) & (df.SibSp > 0)]
    categories = [0, 1, 2, 3]
    
    # create company attribute
    df["Company"] = np.select(conditions, categories)
    
    # count companions
    df["Companions"] = df.SibSp + df.Parch
    
    # create alone attribute (1 if alone, 0 if not)
    df["Alone"] = np.where(df.Parch + df.SibSp > 0, 0, 1)
        
categorize_company(titanic)

print(titanic.head())

   PassengerId  Survived  Pclass  \
0            1       0.0       3   
1            2       1.0       1   
2            3       1.0       3   
3            4       1.0       1   
4            5       0.0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked Dataset  Company  \
0      0         A/5 21171   7.2500   NaN        S   train        1   
1      0          PC 17599  71.2833   C85        C   train        1   
2      0  STON/O2. 3101282   7.9250   NaN        S   train        0   
3      0            113803  53.1000 

In [33]:
# subtract title from name

def format_name(df):
    
    # determine last name (for family indication)
    df['Lname'] = df.Name.apply(lambda x: x.split(', ')[0])
    
    # determine prefix
    df['NamePrefix'] = df.Name.apply(lambda x: x.split(', ')[1])
    df['NamePrefix'] = df.NamePrefix.apply(lambda x: x.split(' ')[0].split('.')[0].strip())
    
    # dictionary of all options to categories
    normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royalty",
    "Don":        "Royalty",
    "Sir" :       "Royalty",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the":        "Royalty",
    "Dona":       "Royalty",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royalty"
    }
    
    # assign title to each passenger
    df["NamePrefix"] = titanic.NamePrefix.map(normalized_titles)
    return df   


format_name(titanic)
# print(titanic.head())
print(titanic["NamePrefix"])

0            Mr
1           Mrs
2          Miss
3           Mrs
4            Mr
5            Mr
6            Mr
7        Master
8           Mrs
9           Mrs
10         Miss
11         Miss
12           Mr
13           Mr
14         Miss
15          Mrs
16       Master
17           Mr
18          Mrs
19          Mrs
20           Mr
21           Mr
22         Miss
23           Mr
24         Miss
25          Mrs
26           Mr
27           Mr
28         Miss
29           Mr
         ...   
1279         Mr
1280     Master
1281         Mr
1282        Mrs
1283     Master
1284         Mr
1285         Mr
1286        Mrs
1287         Mr
1288        Mrs
1289         Mr
1290         Mr
1291       Miss
1292         Mr
1293       Miss
1294         Mr
1295         Mr
1296         Mr
1297         Mr
1298         Mr
1299       Miss
1300       Miss
1301       Miss
1302        Mrs
1303       Miss
1304         Mr
1305    Royalty
1306         Mr
1307         Mr
1308     Master
Name: NamePrefix, Length

In [34]:
# fill in missing ages based on age, title and class
def categorize_age(df):
    
    # get values of sex, class and title
    specAge = titanic.groupby(["Sex", "Pclass", "NamePrefix"])
    
    # determine the median age of each sex-class-title combinations
    specAge.Age.median()
    
    # fill in median age of sex-class-title combination when age data is missing
    titanic.Age = specAge.Age.apply(lambda x: x.fillna(x.median()))
    
    # group dataset in age categories
    bins = (0, 4, 12, 18, 25, 35, 60, 100)
    agename = ["0-4", "4-12", "12-18", "18-25", "25-35", "35-60", "60-100"]
    df["Agegroup"] = pd.cut(df.Age, bins, labels = agename)
    
categorize_age(titanic)
print(titanic.head())

   PassengerId  Survived  Pclass  \
0            1       0.0       3   
1            2       1.0       1   
2            3       1.0       3   
3            4       1.0       1   
4            5       0.0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked Dataset  Company  \
0      0         A/5 21171   7.2500   NaN        S   train        1   
1      0          PC 17599  71.2833   C85        C   train        1   
2      0  STON/O2. 3101282   7.9250   NaN        S   train        0   
3      0            113803  53.1000 

In [35]:
# delete unneccesary attributes
def drop_attributes(df):
    return df.drop(['Ticket', 'Cabin'], axis=1)

newtitanic = drop_attributes(titanic)
print(newtitanic)

      PassengerId  Survived  Pclass  \
0               1       0.0       3   
1               2       1.0       1   
2               3       1.0       3   
3               4       1.0       1   
4               5       0.0       3   
5               6       0.0       3   
6               7       0.0       1   
7               8       0.0       3   
8               9       1.0       3   
9              10       1.0       2   
10             11       1.0       3   
11             12       1.0       1   
12             13       0.0       3   
13             14       0.0       3   
14             15       0.0       3   
15             16       1.0       2   
16             17       0.0       3   
17             18       1.0       2   
18             19       0.0       3   
19             20       1.0       3   
20             21       0.0       2   
21             22       1.0       2   
22             23       1.0       3   
23             24       1.0       1   
24             25       0

In [36]:
def split_sets(df):
    # select training part of dataset for rule making
    trainset = df.loc[df['Dataset'] == 'train']

    # select test part of dataset for later
    testset = df.loc[df['Dataset'] == 'test']
    
    return trainset, testset

newtrain, newtest = split_sets(newtitanic)[0], split_sets(newtitanic)[1]

print(newtrain)
# print(newtest)

     PassengerId  Survived  Pclass  \
0              1       0.0       3   
1              2       1.0       1   
2              3       1.0       3   
3              4       1.0       1   
4              5       0.0       3   
5              6       0.0       3   
6              7       0.0       1   
7              8       0.0       3   
8              9       1.0       3   
9             10       1.0       2   
10            11       1.0       3   
11            12       1.0       1   
12            13       0.0       3   
13            14       0.0       3   
14            15       0.0       3   
15            16       1.0       2   
16            17       0.0       3   
17            18       1.0       2   
18            19       0.0       3   
19            20       1.0       3   
20            21       0.0       2   
21            22       1.0       2   
22            23       1.0       3   
23            24       1.0       1   
24            25       0.0       3   
25          

In [44]:
def gender_prediction(df):
    
    # predict that females survive, males don't
    df["Pred"] = np.where(df["Sex"] == "female", 1, 0)

# check if prediction is correct (in training data)
def check_prediction(df):
    df["Correct"] = np.where(df.Pred == df.Survived, 1, 0)
    return df["Correct"].sum()

gender_prediction(titanic)
check_prediction(titanic)

701

In [95]:
pd.options.mode.chained_assignment = None  # default='warn'

def informative_attributes(df, options, test = None):

    all_correct = []
    
    newtrain = split_sets(df)[0]
    
    for attribute in options:
        
        g = newtrain.groupby([attribute])
        rates = round(g.Survived.mean())
        df["Pred"] = df[attribute].map(rates)
        all_correct.append(check_prediction(newtrain))
        
    g = newtrain.groupby([options[all_correct.index(max(all_correct))]])
    rates = round(g.Survived.mean())
    df["Pred"] = df[options[all_correct.index(max(all_correct))]].map(rates)
        
#     titanic.merge(df, how = "left")
        
    for i in df.PassengerId:
        titanic["Pred"][titanic["PassengerId"] == i] = df["Pred"][df["PassengerId"] == i]
        
    return (max(all_correct)/len(newtrain))*100, options[all_correct.index(max(all_correct))]


def next_step_tree(df, attribute):
    
    g = df.groupby([attribute])
    categories = list(g.groups)
    
    datasets = []
    
    for c in categories:
        datasets.append(df[df[attribute] == c])
        
    return categories, datasets

def perform_next_step(datasets, parent):
    
    nodes_gen = {}
    new_datasets = []
    keys = []
    ghostnode = Node("ghost", parent = parent)
    options = get_options(ghostnode)
    tempnodes = []

    for ds in datasets:
        attr = informative_attributes(ds, options)
        key = attr[1] + str(random.randint(0, 100))
        if attr[0] < 100 and math.isnan(attr[0]) == False:
            nodes_gen[key] = Node(attr[1] + "(" + str(attr[0]) +")", parent = parent)
            keys.append(key)
            new_datasets.append(next_step_tree(ds, attr[1])[1])
        
        elif len(ds) > 0 and math.isnan(attr[0]) == False:
            tempnodes.append(Node(attr[1] + "(" + str(attr[0]) +")", parent = parent))
            
            tempnodes.append(Node(str(ds.Pred.iloc[0]), parent = tempnodes[-1]))
        

            
    ghostnode.parent = None
            
    return nodes_gen, new_datasets, keys

def get_options(node):
    
    tempoptions = ["Sex", "Agegroup", "Pclass", "Company", "Alone", "Companions"]

    for n in (node.anchestors):
        toberemoved = ''.join(filter(str.isalpha, n.name))
        if toberemoved in tempoptions:
            tempoptions.remove(toberemoved)   
    return tempoptions

    

options = ["Sex", "Agegroup", "Pclass", "Company", "Alone", "Companions"]

# dataset = titanic

print("Welkom!")
    
first_attribute = informative_attributes(titanic, options)
print("First", first_attribute)
nodes_gen0 =Node(first_attribute[1] + str(int(first_attribute[0])))
options.remove(first_attribute[1])
datasets = next_step_tree(titanic, first_attribute[1])

newnodes, newdata, newkeys = perform_next_step(datasets[1], parent = nodes_gen0)

for i in range(len(newdata)):
    newnodes_1, newdata_1, newkeys_1 = perform_next_step(newdata[i], parent = newnodes[newkeys[i]])
    for j in range(len(newnodes_1)):
        newnodes_2, newdata_2, newkeys_2 = perform_next_step(newdata_1[j], parent = newnodes_1[newkeys_1[j]])
#         for k in range(len(newnodes_2)):
#             newnodes_3, newdata_3, newkeys_3 = perform_next_step(newdata_2[k], parent = newnodes_2[newkeys_2[k]])
#             for l in range(len(newnodes_3)):
#                 perform_next_step(newdata_3[l], parent = newnodes_3[newkeys_3[l]])
            

        
for pre, fill, node in RenderTree(nodes_gen0):
    print("%s%s" % (pre, node.name))
    
print(check_prediction(titanic))


# from anytree.exporter import DotExporter
# # graphviz needs to be installed for the next line!
# DotExporter(nodes_gen0).to_picture("nodes_gen0.png")


Welkom!
First (86.19528619528619, 'Sex')
Sex86
├── Agegroup(74.20382165605095)
│   ├── Pclass(70.58823529411765)
│   │   ├── Company(100.0)
│   │   │   └── 0.0
│   │   ├── Company(100.0)
│   │   │   └── 1.0
│   │   └── Company(66.66666666666666)
│   ├── Pclass(53.333333333333336)
│   │   ├── Company(100.0)
│   │   │   └── 1.0
│   │   └── Company(72.72727272727273)
│   ├── Pclass(66.66666666666666)
│   │   ├── Company(100.0)
│   │   │   └── 1.0
│   │   ├── Company(100.0)
│   │   │   └── 1.0
│   │   └── Company(56.60377358490566)
│   ├── Pclass(76.78571428571429)
│   │   ├── Company(94.11764705882352)
│   │   ├── Company(94.11764705882352)
│   │   └── Company(50.0)
│   ├── Pclass(78.94736842105263)
│   │   ├── Company(100.0)
│   │   │   └── 1.0
│   │   ├── Company(92.0)
│   │   └── Company(54.83870967741935)
│   ├── Pclass(79.48717948717949)
│   │   ├── Company(97.72727272727273)
│   │   ├── Company(85.0)
│   │   └── Company(85.71428571428571)
│   └── Pclass(100.0)
│       └── 1.0
└── Ag

In [96]:
newtrain, newtest = split_sets(titanic)[0], split_sets(titanic)[1]
print(newtest["Pred"])

891     0.0
892     0.0
893     0.0
894     0.0
895     0.0
896     0.0
897     1.0
898     0.0
899     1.0
900     0.0
901     0.0
902     0.0
903     1.0
904     NaN
905     1.0
906     1.0
907     0.0
908     0.0
909     1.0
910     0.0
911     0.0
912     1.0
913     1.0
914     0.0
915     1.0
916     0.0
917     1.0
918     0.0
919     0.0
920     0.0
       ... 
1279    0.0
1280    0.0
1281    0.0
1282    1.0
1283    NaN
1284    0.0
1285    0.0
1286    1.0
1287    0.0
1288    1.0
1289    0.0
1290    0.0
1291    1.0
1292    0.0
1293    1.0
1294    NaN
1295    0.0
1296    0.0
1297    0.0
1298    1.0
1299    1.0
1300    1.0
1301    1.0
1302    1.0
1303    1.0
1304    0.0
1305    1.0
1306    0.0
1307    0.0
1308    0.0
Name: Pred, Length: 418, dtype: float64


In [97]:
test_example = pd.read_csv('titanic/test_example.csv')

print(len(test_example))
# print(len(newtest))

predictions = newtest["Pred"].values
# print(predictions)

test_example["Pred"] = predictions
# print(test_example.head())

print(check_prediction(test_example)/len(test_example))


418
0.8397129186602871


In [93]:
test_gender_ex = pd.read_csv('titanic/test_example.csv')

gender_prediction(newtest)
gender_preds = newtest["Pred"].values

test_gender_ex["Pred"] = gender_preds

print(check_prediction(test_gender_ex)/len(test_gender_ex))

0.8803827751196173


In [63]:
# # save new files
# def save_train(df):
#     df.to_csv('titanic/newtrain.csv')
# def save_test(df):
#     df.to_csv('titanic/newtest.csv')
    
# save_train(newtrain)
# save_test(newtest)