In [1]:
import os, pandas, numpy
from matplotlib import pyplot
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score

# Shared min-max scaler instance; later cells call fit/fit_transform on it again,
# so its fitted state always reflects the most recent fit.
scaler = MinMaxScaler()

In [2]:
# Load the raw data sets. Only `train` is transformed in this cell; `test` is loaded as-is.
train = pandas.read_csv('train.csv')
test = pandas.read_csv('test.csv')

# Drop the columns that are not used as model inputs.
train = train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# Embarked: encode S -> 1, C -> 0, Q -> 2, then fill gaps with the most frequent code.
# The result is assigned back instead of using a chained `inplace=True`, which does not
# update the parent frame under pandas Copy-on-Write. Values outside the mapping become
# NaN and are therefore imputed together with the missing ones.
train["Embarked"] = train["Embarked"].map({'S': 1, 'C': 0, 'Q': 2})
max_embarked = train['Embarked'].value_counts().idxmax()
train["Embarked"] = train["Embarked"].fillna(max_embarked)

# Sex: encode male -> 1, female -> 0.
train["Sex"] = train["Sex"].map({'male': 1, 'female': 0})

# Age: impute missing values with the median age.
age_mediana = train["Age"].median()
train["Age"] = train["Age"].fillna(age_mediana)

# Fare: bucket into bands of 10 (truncating toward zero, like int(f / 10)); NaN stays NaN
# and is then replaced by the most frequent band.
train["Fare"] = numpy.trunc(train["Fare"] / 10)
max_fare = train['Fare'].value_counts().idxmax()
train["Fare"] = train["Fare"].fillna(max_fare)

# Min-max scale the numeric features to [0, 1]. The scaled array is assigned directly
# (positionally) so the result cannot be misaligned by the frame's index.
# NOTE(review): the scaler is fitted on every row before the train/test split performed
# in a later cell, so hold-out rows influence the scaling range (data leakage) - consider
# fitting on the training split only.
norm_list = ['Pclass', 'Fare', 'Age', 'SibSp', 'Parch', 'Embarked']
train[norm_list] = scaler.fit_transform(train[norm_list])
train.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,1.0,1,0.271174,0.125,0.0,0.0,0.5
1,1,0.0,0,0.472229,0.125,0.0,0.137255,0.0
2,1,1.0,0,0.321438,0.0,0.0,0.0,0.5
3,1,0.0,0,0.434531,0.125,0.0,0.098039,0.5
4,0,1.0,1,0.434531,0.0,0.0,0.0,0.5
5,0,1.0,1,0.346569,0.0,0.0,0.0,1.0
6,0,0.0,1,0.673285,0.0,0.0,0.098039,0.5
7,0,1.0,1,0.019854,0.375,0.166667,0.039216,0.5
8,1,1.0,0,0.334004,0.0,0.333333,0.019608,0.5
9,1,0.5,0,0.170646,0.125,0.0,0.058824,0.0


In [3]:
# Separate the label from the features. The guard keeps the cell re-runnable: once
# 'Survived' has been removed from `train`, a second run leaves `target` and `train`
# untouched instead of raising KeyError.
if 'Survived' in train.columns:
    target = train['Survived']
    train = train.drop('Survived', axis=1)

# Stratified 80/20 hold-out split; the fixed seed makes the split (and every score
# computed from it) reproducible across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(
    train, target, test_size=0.20, stratify=target, random_state=42
)

<p>k-nearest neighbors algorithm:</p>

In [4]:
# Baseline: 5-nearest-neighbours on the features exactly as produced by the split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train_x, train_y)

y_pred = knn.predict(test_x)

# Only the last expression of a cell is displayed, so the baseline accuracy has to be
# printed explicitly; as a bare mid-cell expression its value was silently discarded.
print(accuracy_score(test_y, y_pred))

# Re-fit the shared scaler on the training split only and apply it to both splits.
# The columns in `norm_list` were already min-max scaled on the full data set in the
# preprocessing cell, so this mainly re-maps them to the training split's own range.
scaler.fit(train_x)

train_x_t = scaler.transform(train_x)
test_x_t = scaler.transform(test_x)

# Same model on the re-scaled features; the fitted estimator is the cell's output.
knn_t = KNeighborsClassifier(n_neighbors=5)
knn_t.fit(train_x_t, train_y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [5]:
# Predict the hold-out rows with the model trained on re-scaled features, then
# display the share of correct predictions as the cell output.
y_pred_t = knn_t.predict(X=test_x_t)
accuracy_score(y_true=test_y, y_pred=y_pred_t)

0.7988826815642458

In [6]:
# Sweep the neighbour count k = 5..14 and print, for each k, the hold-out accuracy on the
# features as split and on the re-scaled features.
# `range` replaces `xrange`, which does not exist in Python 3 (and behaves the same in
# Python 2 for a range this small). The per-k models use their own names so the k=5
# `knn` / `knn_t` models fitted earlier are not silently replaced by the k=14 ones.
for i in range(5, 15):
    knn_t_i = KNeighborsClassifier(n_neighbors=i)
    knn_t_i.fit(train_x_t, train_y)
    knn_i = KNeighborsClassifier(n_neighbors=i)
    knn_i.fit(train_x, train_y)
    print(i, accuracy_score(test_y, knn_i.predict(test_x)), accuracy_score(test_y, knn_t_i.predict(test_x_t)))

(5, 0.7988826815642458, 0.7988826815642458)
(6, 0.8044692737430168, 0.8044692737430168)
(7, 0.79329608938547491, 0.79329608938547491)
(8, 0.8044692737430168, 0.8044692737430168)
(9, 0.79329608938547491, 0.79329608938547491)
(10, 0.81005586592178769, 0.81005586592178769)
(11, 0.78770949720670391, 0.78770949720670391)
(12, 0.81005586592178769, 0.81005586592178769)
(13, 0.7988826815642458, 0.7988826815642458)
(14, 0.81005586592178769, 0.81005586592178769)


<p>Classification and regression tree:</p>

In [7]:
from sklearn import tree

# Fresh stratified 80/20 split for the tree experiment; the fixed seed makes the split
# reproducible across runs.
train_x, test_x, train_y, test_y = train_test_split(
    train, target, test_size=0.20, stratify=target, random_state=42
)

# Sweep min_samples_split = 2..9 and print the hold-out accuracy for each value.
# `range` replaces the Python-2-only `xrange`; the tree is seeded because feature
# permutation during split search is otherwise random, and accuracy_score is called
# in its documented (y_true, y_pred) order like everywhere else in the notebook.
for i in range(2, 10):
    clf = tree.DecisionTreeClassifier(min_samples_split=i, random_state=42)
    clf.fit(train_x, train_y)
    pred = clf.predict(test_x)
    print(accuracy_score(test_y, pred))

0.804469273743
0.821229050279
0.821229050279
0.843575418994
0.860335195531
0.877094972067
0.877094972067
0.877094972067


<p>Random forest:</p>

In [8]:
from sklearn.ensemble import RandomForestClassifier
# Fresh stratified 80/20 split for the random-forest experiment; the fixed seed makes
# the split reproducible across runs.
train_x, test_x, train_y, test_y = train_test_split(train, target, test_size=0.20, stratify=target, random_state=42)
# Helper that trains a classifier and evaluates it on a hold-out set
def train_and_estimate(train_x, train_y, test_x, test_y,
                       max_depth=5, n_estimators=10, max_features=2, random_state=42):
    """Fit a random forest on the training data and return its hold-out accuracy.

    Parameters
    ----------
    train_x, train_y : features and labels the forest is fitted on.
    test_x, test_y : features and labels the fitted forest is scored on.
    max_depth, n_estimators, max_features : forwarded unchanged to
        RandomForestClassifier; the defaults are the values that were
        previously hard-coded (5, 10 and 2).
    random_state : seed forwarded to RandomForestClassifier so repeated calls
        give the same score; pass None for an unseeded, run-to-run varying forest.

    Returns
    -------
    The value of accuracy_score(test_y, predictions): the fraction of correctly
    classified hold-out samples.
    """
    # Configure the random forest
    clf = RandomForestClassifier(
        max_depth=max_depth,
        n_estimators=n_estimators,
        max_features=max_features,
        random_state=random_state,
    )
    clf.fit(train_x, train_y)  # Train on the training sample
    return accuracy_score(test_y, clf.predict(test_x))  # Share of correct classifications on the test sample

# Fit the random forest on the training split and display its hold-out accuracy
# as the cell output.
train_and_estimate(train_x, train_y, test_x, test_y)

0.79329608938547491