In [90]:
import pandas as pd

In [91]:
# Lookup table that folds rare honorifics into broader title groups
# (officers, male/female royalty, 'Miss'). Titles that are not listed here
# are left as they are.
# NOTE(review): 'Mme' (Madame) is folded into 'Miss' -- confirm this is
# intended rather than 'Mrs'.
mapping = {
    'Don': 'RoyaltyM',
    'Mme': 'Miss',
    'Ms': 'Miss',
    'Major': 'Officer',
    'Lady': 'RoyaltyF',
    'Sir': 'RoyaltyM',
    'Mlle': 'Miss',
    'Col': 'Officer',
    'Capt': 'Officer',
    'Countess': 'RoyaltyF',
    'Jonkheer': 'RoyaltyM',
    'Dona': 'RoyaltyF',
}

In [92]:
# Load the training and test splits from the local 'titanic' directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('titanic/train.csv', 'titanic/test.csv')
)

In [93]:
def populate_embarked(df):
    """Fill missing ``Embarked`` values with the most frequent port.

    The frame is modified in place and is also returned, so the result can
    be re-assigned or chained.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Embarked`` column. No other column is required.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``. Nulls in ``Embarked`` are replaced by the
        most frequent non-null value; if the column holds no non-null value
        at all, the frame is returned unchanged.
    """
    # Rows per port. groupby drops null keys and sorts the ports, so idxmax
    # resolves ties to the first port in sorted order.
    port_counts = df.groupby('Embarked').size()
    if port_counts.empty:
        # No observed port to learn from: leave the missing values alone
        # instead of failing on an empty lookup.
        return df
    most_common_port = port_counts.idxmax()
    df.loc[df['Embarked'].isnull(), 'Embarked'] = most_common_port
    return df

In [94]:
def set_titles(df, mapping):
    """Add a ``Title`` column derived from the ``Name`` column.

    The title is the first run of ASCII letters that is immediately followed
    by a dot (e.g. ``'Mr'`` in ``'Braund, Mr. Owen Harris'``). Titles found
    as keys in ``mapping`` are replaced by the corresponding value; all other
    titles are kept as extracted. Names without such a pattern get a missing
    title.

    The frame is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Name`` column.
    mapping : dict
        Maps raw titles to the group they should be replaced with.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df`` with the ``Title`` column added or
        overwritten.
    """
    # Raw string: '\.' is not a valid escape sequence in a normal literal and
    # newer Python versions warn about it. expand=False makes the single
    # capture group come back as a Series regardless of the pandas version.
    titles = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
    # Remap only the title values; no other column of the frame is touched.
    df['Title'] = titles.replace(mapping)
    return df

In [95]:
def populate_age(df):
    """Fill missing ``Age`` values with the median age of the row's title.

    The frame is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Title`` and ``Age`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``. A missing age is replaced by the median
        of the known ages sharing the same title. It stays missing when the
        title itself is missing or when no age is known for that title.
    """
    # transform('median') broadcasts each title's median back onto the rows
    # of that title, so the result lines up with df row by row. This avoids
    # Series.iteritems(), which was removed in pandas 2.0.
    title_median_age = df.groupby('Title')['Age'].transform('median')
    df['Age'] = df['Age'].fillna(title_median_age)
    return df

In [96]:
# Preprocess the training set: fill the embarkation port, derive the title,
# then fill ages per title (the age step needs the Title column).
train = (
    train
    .pipe(populate_embarked)
    .pipe(set_titles, mapping)
    .pipe(populate_age)
)

In [97]:
# Apply the same preprocessing steps, in the same order, to the test set.
test = (
    test
    .pipe(populate_embarked)
    .pipe(set_titles, mapping)
    .pipe(populate_age)
)

In [98]:
# Inspect dtypes and non-null counts of the test set to spot columns that
# still contain missing values after the imputation steps.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
Title          418 non-null object
dtypes: float64(2), int64(4), object(6)
memory usage: 39.3+ KB


In [99]:
# Fill missing fares with the median fare of the test set. The assignment
# goes through the frame itself: the chained form test['Fare'][mask] = value
# may write to a temporary copy (it raises SettingWithCopyWarning and does
# nothing at all under pandas Copy-on-Write).
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [100]:
# Re-check the non-null counts to confirm that Fare has no missing values left.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
Title          418 non-null object
dtypes: float64(2), int64(4), object(6)
memory usage: 39.3+ KB


In [101]:
# Drop the columns that are not used as model features.
train = train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,male,22.0,1,0,7.25,S,Mr
1,1,1,female,38.0,1,0,71.2833,C,Mrs
2,1,3,female,26.0,0,0,7.925,S,Miss
3,1,1,female,35.0,1,0,53.1,S,Mrs
4,0,3,male,35.0,0,0,8.05,S,Mr


In [102]:
# Keep the passenger ids in their own one-column frame before they are
# removed, then drop the non-feature columns from the test set.
result = test['PassengerId'].to_frame()
test = test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [103]:
# Preview the remaining feature columns of the test set.
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,3,male,34.5,0,0,7.8292,Q,Mr
1,3,female,47.0,1,0,7.0,S,Mrs
2,2,male,62.0,0,0,9.6875,Q,Mr
3,3,male,27.0,0,0,8.6625,S,Mr
4,3,female,22.0,1,1,12.2875,S,Mrs


In [104]:
# The categorical values must be encoded as numbers;
# scikit-learn's LabelEncoder is used for that.
from sklearn.preprocessing import LabelEncoder

In [105]:
# Per-column store of the label classes, so that the same label-to-integer
# encoding can be applied to every table.
classes = {}
# A single encoder instance, refitted for each column.
le = LabelEncoder()

In [106]:
# Encode the categorical columns of the training set as integers and
# remember the classes learned for each column so they can be reused later.
for column in ('Sex', 'Embarked', 'Title'):
    le.fit(train[column])
    classes[column] = le.classes_
    train[column] = le.transform(train[column])

In [107]:
# Show the label classes stored for each encoded column.
classes

{'Sex': array(['female', 'male'], dtype=object),
 'Embarked': array(['C', 'Q', 'S'], dtype=object),
 'Title': array(['Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Officer', 'Rev', 'RoyaltyF',
        'RoyaltyM'], dtype=object)}

In [108]:
# Sanity check: refit the encoder on the stored 'Sex' classes and display the
# resulting class array to compare it with the one learned from the training set.
le.fit(classes['Sex'])
le.classes_

array(['female', 'male'], dtype=object)

In [109]:
# Encode the test set with the classes stored from the training set, so that
# each label maps to the same integer in both tables. transform() raises if
# the test set contains a label that is not among the stored classes.
for column in ('Sex', 'Embarked', 'Title'):
    le.fit(classes[column])
    classes[column] = le.classes_
    test[column] = le.transform(test[column])

In [110]:
# Verify that every remaining column of the test set is numeric and fully populated.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
Pclass      418 non-null int64
Sex         418 non-null int32
Age         418 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
Embarked    418 non-null int32
Title       418 non-null int32
dtypes: float64(2), int32(3), int64(3)
memory usage: 21.3 KB


Now all columns are of type int or float, so we can apply ML algorithms.

## Random Forest and k-nearest neighbors
### and cross validation to check the result

In [112]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [113]:
# Pull the target column out of the training set, then remove it from the features.
target = train['Survived']
train = train.drop(columns='Survived')

In [114]:
# Spot-check a few random rows of the encoded feature table
# (no random_state is passed, so the rows shown differ between runs).
train.sample(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
580,2,0,25.0,1,1,30.0,2,2
67,3,1,19.0,0,0,8.1583,2,3
887,1,0,19.0,0,0,30.0,2,2
262,1,1,52.0,1,1,79.65,2,3
800,2,1,34.0,0,0,13.0,2,3
