In [69]:
import pandas as pd
# Import the linear regression class
from sklearn.linear_model import LinearRegression
# Sklearn also has a helper that makes it easy to do cross validation
from sklearn.cross_validation import KFold
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

In [70]:
pd.set_option('precision', 2)

In [71]:
df_train = pd.read_csv('data/train.csv')
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [72]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.38,2.31,29.7,0.52,0.38,32.2
std,257.35,0.49,0.84,14.53,1.1,0.81,49.69
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.12,0.0,0.0,7.91
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.45
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.33


In [73]:
df_train.Age.fillna(df_train.Age.median(), inplace=True)
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.38,2.31,29.36,0.52,0.38,32.2
std,257.35,0.49,0.84,13.02,1.1,0.81,49.69
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.91
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.45
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.33


In [74]:
# Find all the unique genders -- the column appears to contain only male and female.
print(df_train["Sex"].unique())

# Replace all the occurences of male with the number 0.
df_train.loc[df_train["Sex"] == "male", "Sex"] = 0
df_train.loc[df_train["Sex"] == "female", "Sex"] = 1

['male' 'female']


In [75]:
print(df_train.groupby("Embarked").count()["Age"])
df_train.Embarked.fillna('S', inplace=True)
df_train.loc[df_train.Embarked == 'S', 'Embarked'] = 0
df_train.loc[df_train.Embarked == 'C', 'Embarked'] = 1
df_train.loc[df_train.Embarked == 'Q', 'Embarked'] = 2

Embarked
C    168
Q     77
S    644
Name: Age, dtype: int64


In [76]:
# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initialize our algorithm class
alg = LinearRegression()
# Generate cross validation folds for the titanic dataset.  It return the row indices corresponding to train and test.
# We set random_state to ensure we get the same splits every time we run this.
kf = KFold(df_train.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    # The predictors we're using the train the algorithm.  Note how we only take the rows in the train folds.
    train_predictors = (df_train[predictors].iloc[train,:])
    # The target we're using to train the algorithm.
    train_target = df_train["Survived"].iloc[train]
    # Training the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(df_train[predictors].iloc[test,:])
    predictions.append(test_predictions)

In [77]:
import numpy as np

# The predictions are in three separate numpy arrays.  Concatenate them into one.  
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)

# Map predictions to outcomes (only possible outcomes are 1 and 0)
predictions[predictions > .5] = 1
predictions[predictions <=.5] = 0

accuracy = 1.0 * sum(predictions == df_train["Survived"]) / len(predictions)
print(accuracy)

0.783389450056


In [78]:
# Initialize our algorithm
alg = LogisticRegression(random_state=1)
# Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
scores = cross_validation.cross_val_score(alg, df_train[predictors], df_train["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())

0.787878787879


In [84]:
df_test = pd.read_csv("data/test.csv")

df_test.Age.fillna(df_train.Age.median(), inplace=True)
df_test.loc[df_test.Sex == 'male', 'Sex'] = 0
df_test.loc[df_test.Sex == 'female', 'Sex'] = 1

df_test.Embarked.fillna('S', inplace=True)
df_test.loc[df_test.Embarked == 'S', "Embarked"] = 0
df_test.loc[df_test.Embarked == 'C', "Embarked"] = 1
df_test.loc[df_test.Embarked == 'Q', "Embarked"] = 2

df_test.Fare.fillna(df_test.Fare.median(), inplace=True)

In [87]:
# Initialize the algorithm class
alg = LogisticRegression(random_state=1)
# Train the algorithm using all the training data
alg.fit(df_train[predictors], df_train["Survived"])
# Make predictions using the test set.
predictions = alg.predict(df_test[predictors])
# Create a new dataframe with only the columns Kaggle wants from the dataset.
submission = pd.DataFrame({"PassengerId": df_test["PassengerId"], "Survived": predictions})