In [3]:
%%html
<style>
/* Let output areas size to their content, but never taller than 700px. */
.output_wrapper, .output {
    height:auto !important;
    max-height:700px;
}
/* Remove the box shadow from scrolling output areas.  The vendor-prefixed
   property needs its leading hyphen, otherwise browsers ignore it. */
.output_scroll {
    -webkit-box-shadow:none !important;
    box-shadow:none !important;
}
</style>

### Import modules

In [4]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last.
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer `model_selection` and fall back only on very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import svm



### Dataset

In [5]:
# Load the Titanic passenger table; the path is relative to the notebook's
# working directory.
data = pd.read_csv('data-titanic.csv')

In [6]:
# Preview the first five rows, then list column dtypes and non-null counts
# to see which columns have missing values.
data.head(n=5)
data.info()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
pclass       1309 non-null int64
survived     1309 non-null int64
name         1309 non-null object
sex          1309 non-null object
age          1046 non-null float64
sibsp        1309 non-null int64
parch        1309 non-null int64
ticket       1309 non-null object
fare         1308 non-null float64
cabin        295 non-null object
embarked     1307 non-null object
boat         486 non-null object
body         121 non-null float64
home.dest    745 non-null object
dtypes: float64(3), int64(4), object(7)
memory usage: 107.4+ KB


### Data preparation

In [7]:
# Remove the free-text columns (name, ticket) and the columns with many
# missing values (cabin, body, boat, home.dest).  errors='ignore' keeps this
# cell re-runnable: on a second run the columns are already gone and a plain
# drop would raise.
data = data.drop(['name', 'ticket', 'cabin', 'body', 'boat', 'home.dest'],
                 axis=1, errors='ignore')
# Discard rows that still have a missing value in any remaining column.
data = data.dropna()

# Encode the string columns on a copy so `data` keeps the readable values.
encoded_data = data.copy()
# One encoder per column, stored by column name, so each mapping can be
# inverted later.  Refitting a single encoder would overwrite the 'sex'
# classes when fitting 'embarked'.  After the loop `le` is the 'embarked'
# encoder, as before.
label_encoders = {}
for column in ('sex', 'embarked'):
    le = preprocessing.LabelEncoder()
    encoded_data[column] = le.fit_transform(encoded_data[column])
    label_encoders[column] = le

# Feature matrix = every column except the target; labels = the target column.
features = encoded_data.drop(['survived'], axis=1).values
labels = encoded_data['survived'].values

### Training a model using Logistic Regression

In [8]:
# Logistic regression classifier with the library's default hyper-parameters.
model = LogisticRegression()

In [9]:
# Fit on every available row; no hold-out split has been made at this point.
model.fit(X=features, y=labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Making predictions using Logistic Regression

In [10]:
# Predict for the same rows the model was fitted on.
predictions = model.predict(X=features)

### Classification Accuracy Score

In [11]:
# Fraction of correct predictions.  Because the predictions were made on the
# training rows, this is training accuracy, not an out-of-sample estimate.
metrics.accuracy_score(y_true=labels, y_pred=predictions)

0.79002876318312565


### Training a model using `KNeighborsClassifier`

In [12]:
# k-nearest-neighbours classifier that looks at only the single closest row.
model = KNeighborsClassifier(n_neighbors=1)

In [13]:
# Fit the 1-nearest-neighbour model on the full dataset.
model.fit(X=features, y=labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

### Getting the predictions

In [14]:
# Predict for the rows the 1-nearest-neighbour model was fitted on.
predictions = model.predict(X=features)

### Scoring the predictions

In [15]:
# Training accuracy again.  With k=1 each training row is (one of) its own
# nearest neighbours, so a high score here mostly reflects memorisation.
metrics.accuracy_score(y_true=labels, y_pred=predictions)

0.97507190795781395

### Setting `n_neighbors` to a different value

In [16]:
# Same classifier, now voting over the 10 closest rows instead of 1.
model = KNeighborsClassifier(n_neighbors=10)

In [17]:
# Fit the 10-nearest-neighbour model on the full dataset.
model.fit(X=features, y=labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

### Getting the predictions

In [18]:
# Predict for the rows the 10-nearest-neighbour model was fitted on.
predictions = model.predict(X=features)

### Scoring the predictions

In [19]:
# Training accuracy for k=10, still measured on the rows used for fitting.
metrics.accuracy_score(y_true=labels, y_pred=predictions)

0.72483221476510062

### Split data into training and test sets

In [20]:
# Hold out 40% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    features, labels, test_size=0.4, random_state=0)

In [21]:
# Fresh, unfitted logistic regression, replacing the estimator that was
# fitted on the full dataset.
model = LogisticRegression()

In [22]:
# Fit on the training portion only; the test rows stay unseen.
model.fit(X=features_train, y=labels_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Getting the predictions

In [23]:
# Predict for the held-out rows, which were not used during fitting.
predictions = model.predict(X=features_test)

### Scoring the predictions

In [24]:
# Accuracy on the held-out rows: an out-of-sample estimate, unlike the
# training-set scores computed earlier.
metrics.accuracy_score(y_true=labels_test, y_pred=predictions)

0.80382775119617222

### Estimator Score Method

In [25]:
# Linear-kernel support vector classifier, fitted on the training portion.
# fit() returns the estimator itself; assigning the result (rather than
# calling fit as a bare expression) keeps the cell from echoing the repr.
clf = svm.SVC(kernel='linear', C=1)
clf = clf.fit(features_train, labels_train)

In [26]:
# Classifier's built-in score(): mean accuracy on the held-out rows, computed
# without a separate predict step.
clf.score(X=features_test, y=labels_test)

0.78947368421052633