In [36]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from copy import deepcopy

In [37]:
# Load the combined passenger table; the path is relative to the notebook's working directory.
# The `Dataset` column of this frame is what later separates 'train' rows from 'test' rows.
# NOTE(review): the printed head shows Fare values such as 712833.00 and 7925.00 next to 7.25 and 53.10 —
# looks like decimal points were lost when this CSV was produced; verify upstream.
titanic = pd.read_csv('titanic/combi3.csv')

In [38]:
# Preview the first rows as a quick check of the load.
print(titanic.head())

  PassengerId  Pclass   LastName                                         Name  \
0           1     3.0     Braund                              Mr. Owen Harris   
1           2     1.0    Cumings   Mrs. John Bradley (Florence Briggs Thayer)   
2           3     3.0  Heikkinen                                  Miss. Laina   
3           4     1.0   Futrelle           Mrs. Jacques Heath (Lily May Peel)   
4           5     3.0      Allen                            Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket       Fare Cabin Embarked  \
0    male  22.0    1.0    0.0         A/5 21171       7.25   NaN        S   
1  female  38.0    1.0    0.0          PC 17599  712833.00   C85        C   
2  female  26.0    0.0    0.0  STON/O2. 3101282    7925.00   NaN        S   
3  female  35.0    1.0    0.0            113803      53.10  C123        S   
4    male  35.0    0.0    0.0            373450       8.05   NaN        S   

  Dataset  Survived  LeegPp CabinD CabinEO  
0   t

In [39]:
def categorize_company(df):
    """Add travel-company features to ``df`` in place (nothing is returned).

    Columns written, in this order:

    * ``Company``    -- 0 if alone, 1 if only with siblings/spouse, 2 if only with
      parents/children, 3 if with both (the number of people is disregarded).
      Rows matching none of the rules get ``np.select``'s default of 0.
    * ``Companions`` -- ``SibSp + Parch``.
    * ``Alone``      -- 1 if the passenger has no companions, 0 otherwise.
    """
    siblings = df.SibSp
    parents = df.Parch
    party_size = siblings + parents

    # category code -> boolean mask; np.select takes the first matching rule per row
    company_rules = {
        0: party_size == 0,
        1: (parents == 0) & (siblings > 0),
        2: (parents > 0) & (siblings == 0),
        3: (parents > 0) & (siblings > 0),
    }

    df["Company"] = np.select(list(company_rules.values()), list(company_rules.keys()))
    df["Companions"] = party_size
    df["Alone"] = np.where(party_size > 0, 0, 1)
        
# categorize_company assigns its new columns on the frame it is given and returns nothing,
# so `titanic` itself gains Company, Companions and Alone here.
categorize_company(titanic)

# Preview the first rows, now including the new columns.
print(titanic.head())

  PassengerId  Pclass   LastName                                         Name  \
0           1     3.0     Braund                              Mr. Owen Harris   
1           2     1.0    Cumings   Mrs. John Bradley (Florence Briggs Thayer)   
2           3     3.0  Heikkinen                                  Miss. Laina   
3           4     1.0   Futrelle           Mrs. Jacques Heath (Lily May Peel)   
4           5     3.0      Allen                            Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket       Fare Cabin Embarked  \
0    male  22.0    1.0    0.0         A/5 21171       7.25   NaN        S   
1  female  38.0    1.0    0.0          PC 17599  712833.00   C85        C   
2  female  26.0    0.0    0.0  STON/O2. 3101282    7925.00   NaN        S   
3  female  35.0    1.0    0.0            113803      53.10  C123        S   
4    male  35.0    0.0    0.0            373450       8.05   NaN        S   

  Dataset  Survived  LeegPp CabinD CabinEO  Compan

In [40]:
# extract title from name

def format_name(df):
    """Add a normalized title column ``NamePrefix`` derived from ``df.Name``.

    The raw prefix is the text before the first '.' in ``Name`` (whitespace
    stripped). It is then mapped onto a small set of categories
    (Mr, Mrs, Miss, Master, Officer, Royalty). Prefixes that are not keys of
    the mapping become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Name`` column. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``NamePrefix`` added or overwritten.
    """
    # take prefix from remainder of name after last name has been split off
    raw_prefix = df.Name.apply(lambda x: str(x).split('.')[0].strip())

    # dictionary of all options to categories
    # NOTE(review): the key "the" only matches when the text before the first '.'
    # is exactly "the"; a multi-word prefix such as "the Countess" would not
    # match and ends up NaN -- confirm against the data.
    normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royalty",
    "Don":        "Royalty",
    "Sir" :       "Royalty",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the":        "Royalty",
    "Dona":       "Royalty",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royalty"
    }

    # assign title to each passenger; map the frame that was passed in, not the
    # notebook-level `titanic`, so the function works on any DataFrame
    df["NamePrefix"] = raw_prefix.map(normalized_titles)
    return df


# format_name writes NamePrefix into the frame and also returns it; the return value is not needed here.
format_name(titanic)
# print(titanic.head())
# Show the mapped titles; NaN marks a name whose prefix is not a key of the title mapping.
print(titanic["NamePrefix"])

0            Mr
1           Mrs
2          Miss
3           Mrs
4            Mr
5            Mr
6            Mr
7        Master
8           Mrs
9           Mrs
10         Miss
11         Miss
12           Mr
13           Mr
14         Miss
15          Mrs
16       Master
17           Mr
18          Mrs
19          Mrs
20           Mr
21           Mr
22          NaN
23           Mr
24         Miss
25          Mrs
26           Mr
27           Mr
28          NaN
29           Mr
         ...   
1279         Mr
1280     Master
1281         Mr
1282        Mrs
1283     Master
1284         Mr
1285         Mr
1286        Mrs
1287         Mr
1288        Mrs
1289         Mr
1290         Mr
1291       Miss
1292         Mr
1293       Miss
1294         Mr
1295         Mr
1296        NaN
1297         Mr
1298         Mr
1299        NaN
1300       Miss
1301       Miss
1302        Mrs
1303       Miss
1304         Mr
1305    Royalty
1306         Mr
1307         Mr
1308     Master
Name: NamePrefix, Length

In [41]:
# fill in missing ages based on sex, title and class
def categorize_age(df):
    """Fill missing ages with a group median and add an ``Agegroup`` column.

    Missing ``Age`` values are replaced by the median age of the passenger's
    (Sex, Pclass, NamePrefix) group. Known ages are never overwritten. A
    missing age stays NaN when its group has no known age or when one of the
    grouping keys is itself missing.

    ``Agegroup`` bins ``Age`` into right-closed intervals
    (0, 4], (4, 12], (12, 18], (18, 25], (25, 35], (35, 60], (60, 100];
    ages outside that range, or still missing, get a NaN group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Sex``, ``Pclass``, ``NamePrefix`` and ``Age``.
        Modified in place; nothing is returned.
    """
    # median age of each sex-class-title combination, aligned to df's own index;
    # transform keeps the original row index, so no realignment surprises
    group_median = df.groupby(["Sex", "Pclass", "NamePrefix"]).Age.transform("median")

    # fill in median age of sex-class-title combination when age data is missing
    df["Age"] = df.Age.fillna(group_median)

    # group dataset in age categories
    bins = (0, 4, 12, 18, 25, 35, 60, 100)
    agename = ["0-4", "4-12", "12-18", "18-25", "25-35", "35-60", "60-100"]
    df["Agegroup"] = pd.cut(df.Age, bins, labels=agename)
    
# Fill in missing ages and add the Agegroup column, then preview the first rows.
categorize_age(titanic)
print(titanic.head())

  PassengerId  Pclass   LastName                                         Name  \
0           1     3.0     Braund                              Mr. Owen Harris   
1           2     1.0    Cumings   Mrs. John Bradley (Florence Briggs Thayer)   
2           3     3.0  Heikkinen                                  Miss. Laina   
3           4     1.0   Futrelle           Mrs. Jacques Heath (Lily May Peel)   
4           5     3.0      Allen                            Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket       Fare    ...    Dataset  \
0    male  22.0    1.0    0.0         A/5 21171       7.25    ...      train   
1  female  38.0    1.0    0.0          PC 17599  712833.00    ...      train   
2  female  26.0    0.0    0.0  STON/O2. 3101282    7925.00    ...      train   
3  female  35.0    1.0    0.0            113803      53.10    ...      train   
4    male  35.0    0.0    0.0            373450       8.05    ...      train   

  Survived LeegPp  CabinD  Cabin

In [46]:
# delete unnecessary attributes
def drop_attributes(df):
    """Return a new frame without the Ticket, LastName, Name, Cabin and LeegPp columns.

    The input frame is left untouched; a missing column raises ``KeyError``.
    """
    unwanted = ['Ticket', 'LastName', 'Name', 'Cabin', 'LeegPp']
    return df.drop(columns=unwanted)

# Display the frame without the dropped columns.
# NOTE(review): drop_attributes returns a new frame and the result is not assigned, so `titanic`
# itself still carries Ticket, LastName, Name, Cabin and LeegPp after this cell — assign the
# result if the drop is meant to persist for later cells; confirm intent.
drop_attributes(titanic)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Dataset,Survived,CabinD,CabinEO,Company,Companions,Alone,NamePrefix,Agegroup
0,1,3.0,male,22.00,1.0,0.0,7.25,S,train,0.0,U,u,1,1.0,0,Mr,18-25
1,2,1.0,female,38.00,1.0,0.0,712833.00,C,train,1.0,C,o,1,1.0,0,Mrs,35-60
2,3,3.0,female,26.00,0.0,0.0,7925.00,S,train,1.0,U,u,0,0.0,1,Miss,25-35
3,4,1.0,female,35.00,1.0,0.0,53.10,S,train,1.0,C,o,1,1.0,0,Mrs,25-35
4,5,3.0,male,35.00,0.0,0.0,8.05,S,train,0.0,U,u,0,0.0,1,Mr,25-35
5,6,3.0,male,26.00,0.0,0.0,84583.00,Q,train,0.0,U,u,0,0.0,1,Mr,25-35
6,7,1.0,male,54.00,0.0,0.0,518625.00,S,train,0.0,E,e,0,0.0,1,Mr,35-60
7,8,3.0,male,2.00,3.0,1.0,21075.00,S,train,0.0,U,u,3,4.0,0,Master,0-4
8,9,3.0,female,27.00,0.0,2.0,111333.00,S,train,1.0,U,u,2,2.0,0,Mrs,25-35
9,10,2.0,female,14.00,1.0,0.0,300708.00,C,train,1.0,U,u,1,1.0,0,Mrs,12-18


In [52]:
def split_sets(df):
    """Split ``df`` by its ``Dataset`` column into ``(trainset, testset)``.

    Rows labelled 'train' are meant for rule making, rows labelled 'test' are
    kept for later. Rows with any other label appear in neither part.
    """
    origin = df['Dataset']
    is_train = origin == 'train'
    is_test = origin == 'test'
    return df[is_train], df[is_test]

# split_sets returns (trainset, testset); unpack one call instead of splitting the frame twice
newtrain, newtest = split_sets(titanic)

# print(newtrain)
# print(newtest)

     PassengerId  Pclass          LastName  \
891          892     3.0             Kelly   
892          893     3.0            Wilkes   
893          894     2.0             Myles   
894          895     3.0              Wirz   
895          896     3.0          Hirvonen   
896          897     3.0          Svensson   
897          898     3.0          Connolly   
898          899     2.0          Caldwell   
899          900     3.0           Abrahim   
900          901     3.0            Davies   
901          902     3.0            Ilieff   
902          903     1.0             Jones   
903          904     1.0            Snyder   
904          905     2.0            Howard   
905          906     1.0           Chaffee   
906          907     2.0         del Carlo   
907          908     2.0             Keane   
908          909     3.0             Assaf   
909          910     3.0        Ilmakangas   
911          912     1.0        Rothschild   
912          913     3.0          