In [44]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [45]:
train = pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')

In [46]:
mapping = {'Don': 'RoyaltyM', 'Mme': 'Miss', 'Ms': 'Miss',
    'Major': 'Officer', 'Lady': 'RoyaltyF', 'Sir': 'RoyaltyM', 
    'Mlle': 'Miss', 'Col': 'Officer', 'Capt': 'Officer',
    'Countess': 'RoyaltyF', 'Jonkheer': 'RoyaltyM', 'Dona': 'RoyaltyM' }

In [47]:
def populate_embarked(df):
    embarked = df.groupby('Embarked').count()['PassengerId']
    embarked_max = embarked[embarked == embarked.max()].index[0]
    df.loc[df['Embarked'].isnull(), 'Embarked'] = embarked_max
    return df

In [48]:
def set_titles(df, mapping):
    df['Title'] = df['Name'].str.extract('([A-Za-z]+)\.')
    df.replace({'Title': mapping}, inplace=True)
    return df

In [49]:
def populate_age(df):
    age_med = df.groupby('Title')['Age'].median()
    for title, age in age_med.iteritems():
        filter_age = df['Age'].isnull()
        filter_title = df['Title'] == title
        df.loc[(filter_age & filter_title, 'Age')] = age
    return df

In [50]:
train = populate_embarked(train)
train = set_titles(train, mapping)
train = populate_age(train)

In [51]:
test = populate_embarked(test)
test = set_titles(test, mapping)
test = populate_age(test)

In [52]:
test['Fare'][test['Fare'].isnull()] = test['Fare'].median()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Fare'][test['Fare'].isnull()] = test['Fare'].median()


In [53]:
train = train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [54]:
result = pd.DataFrame(test['PassengerId'])
test = test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [55]:
le = LabelEncoder()
classes = {}

In [56]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,male,22.0,1,0,7.25,S,Mr
1,1,1,female,38.0,1,0,71.2833,C,Mrs
2,1,3,female,26.0,0,0,7.925,S,Miss
3,1,1,female,35.0,1,0,53.1,S,Mrs
4,0,3,male,35.0,0,0,8.05,S,Mr


In [57]:
le.fit(train['Sex'])
classes['Sex'] = le.classes_
train['Sex'] = le.transform(train['Sex'])

le.fit(train['Embarked'])
classes['Embarked'] = le.classes_
train['Embarked'] = le.transform(train['Embarked'])

le.fit(train['Title'])
classes['Title'] = le.classes_
train['Title'] = le.transform(train['Title'])

In [58]:
classes

{'Sex': array(['female', 'male'], dtype=object),
 'Embarked': array(['C', 'Q', 'S'], dtype=object),
 'Title': array(['Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Officer', 'Rev', 'RoyaltyF',
        'RoyaltyM'], dtype=object)}

In [59]:
le.fit(classes['Sex'])
test['Sex'] = le.transform(test['Sex'])

le.fit(classes['Embarked'])
test['Embarked'] = le.transform(test['Embarked'])

le.fit(classes['Title'])
test['Title'] = le.transform(test['Title'])

In [60]:
target = train['Survived']
train = train.drop(['Survived'], axis=1)

In [96]:
model_rf = RandomForestClassifier(n_estimators=200)
model_kn = KNeighborsClassifier(n_neighbors=20)

In [98]:
scores = cross_val_score(model_rf, train, target, cv=5)
print(scores)
print(scores.mean())

[0.79329609 0.79775281 0.84831461 0.75842697 0.83146067]
0.80585022911305


In [99]:
scores = cross_val_score(model_kn, train, target, cv=5)
print(scores)
print(scores.mean())

[0.60335196 0.73595506 0.71348315 0.73033708 0.73595506]
0.7038164584771828


In [105]:
model_rf = model_rf.fit(train, target)

In [106]:
predictions = model_rf.predict(test)

In [107]:
result.insert(1, 'Survived', predictions)

ValueError: cannot insert Survived, already exists

In [108]:
result.to_csv('titanic\result.csv', index = False)

OSError: [Errno 22] Invalid argument: 'titanic\result.csv'

In [109]:
result.to_csv('titanic/result.csv', index = False)