# Let's Submit a Model to Kaggle! -> Titanic Competition

* Data is from: https://www.kaggle.com/c/titanic
* Recommended notebook: https://github.com/justmarkham/DAT8/blob/master/notebooks/13_advanced_model_evaluation.ipynb

## Load your librarires and read in your data:

In [28]:
import pandas as pd

train = pd.read_csv("train.csv")
train.shape

(891, 12)

The shape of the data is 891 rows or passengers, and 12 variables or features that describe each passenger

In [29]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Check for missing values
Some missing values might not matter, but others might be really important for the model. In this case, we could impute the data (make educated guesses about what those values might be) or remove those rows with missing data.

In [30]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Cabin might not matter for this model, but age can be a determining factor as to whether someone might survive a sinking ship. We could remove all rows where `Age` is null, but if there are nulls in the `Age` column for the test set, the model will fail. Instead, we can impute or guess the ages for these cells using the mean, median or mode.

In [31]:
train[train.Age.notnull()].shape

(714, 12)

In [32]:
print("mean Age: ", train.Age.mean())
print("mose frequent Age: ", train.Age.mode())
print("median Age: ", train.Age.median())

mean Age:  29.69911764705882
mose frequent Age:  0    24.0
dtype: float64
median Age:  28.0


Lets go with the median

In [33]:
train.Age.fillna(train.Age.median(), inplace=True)

In [34]:
train.shape

(891, 12)

## Lets make `Sex` a binary rather than a string (models don't like strings):

In [35]:
train['Sex_Female'] = train.Sex.map({'male':0, 'female':1})

In [36]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_Female
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


## Select columns to include in model

In [37]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Sex_Female'],
      dtype='object')

In [38]:
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_Female' ]

X = train[features]
y = train['Survived']

## Train our model using a [logistic regression](http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression) for `survived` vs `not survived`

In [39]:
# split training data into 80% training and 20% testing sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

# train a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train, y_train)

# make predictions for testing set
y_pred_class = logreg.predict(X_test)

# calculate testing accuracy
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))

0.810055865922


## Train this model using all the data, and apply to `test.csv` to assign our predictions

In [65]:
# Read in test data:
test = pd.read_csv("test.csv", index_col='PassengerId')
test.shape

(418, 10)

In [66]:
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [67]:
test.isnull().sum()

Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64

In [68]:
# Impute median age
test.Age.fillna(train.Age.median(), inplace=True)

In [69]:
# Impute fare:
print(train.Fare.mean())
print(train.Fare.median())

32.2042079685746
14.4542


In [70]:
test.Fare.fillna(train.Fare.median(), inplace=True)

In [72]:
# binarise sex
test['Sex_Female'] = test.Sex.map({'male':0, 'female':1})

# select only columns that we used as features in our training
test = test[features]
test.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_Female
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,3,34.5,0,0,7.8292,0
893,3,47.0,1,0,7.0,1
894,2,62.0,0,0,9.6875,0
895,3,27.0,0,0,8.6625,0
896,3,22.0,1,1,12.2875,1


In [73]:
# Train model on all train data
logreg.fit(X, y)

LogisticRegression(C=1000000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [74]:
# apply model to test data
results = logreg.predict(test)

In [75]:
# 0 = not survived, 1 = survived
results

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0,

In [76]:
# add `results` as a column to our test dataframe
test['Survived'] = results

# remove all other columns - keep new `Survived col`
submission_export = test[['Survived']]

# save as csv. Often you'll see people adding `index=False` but we want to keep the index cos it holds the `PassengerID`
submission_export.to_csv('Titanic_model_submission.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
