In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set() # use seaborn's default theme for every plot drawn below
import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
df = pd.read_csv('/kaggle/input/titanic/gender_submission.csv') 
df.head()

In [None]:

test = pd.read_csv('/kaggle/input/titanic/test.csv') 
test.head()

In [None]:

train = pd.read_csv('/kaggle/input/titanic/train.csv')
train.head()

In [None]:
train.head(10)

In [None]:
train.shape

In [None]:
train.describe()

In [None]:
train.describe(include=['O'])

In [None]:
train.info()

In [None]:
train.isnull().sum()

In [None]:
test.shape

In [None]:
test.head()

In [None]:
test.info()

In [None]:
test.isnull().sum()

In [None]:
# Partition the training passengers by outcome and report each group's size and share.
outcome = train['Survived']
survived = train[outcome == 1]
not_survived = train[outcome == 0]
n_passengers = len(train)

print("Survived: %i (%.1f%%)" % (len(survived), len(survived) / n_passengers * 100.0))
print("Not Survived: %i (%.1f%%)" % (len(not_survived), len(not_survived) / n_passengers * 100.0))
print("Total: %i" % n_passengers)

In [None]:
train.Pclass.value_counts()

In [None]:
train.groupby('Pclass').Survived.value_counts()

In [None]:
train[['Pclass', 'Survived']].groupby('Pclass', as_index = False).mean()

In [None]:
sns.barplot(x='Pclass', y='Survived', data=train)

In [None]:
train.Sex.value_counts()

In [None]:
train.groupby('Sex').Survived.value_counts()

In [None]:
train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()

In [None]:
sns.barplot(x='Sex', y='Survived', data=train, errorbar=None)

In [None]:
tab = pd.crosstab(train['Pclass'], train['Sex'])
print (tab)

In [None]:
# Turn each Pclass row of the crosstab into fractions of its row total,
# then draw the Sex shares as one stacked bar per class.
row_totals = tab.sum(axis=1).astype(float)
sex_share_by_class = tab.div(row_totals, axis=0)
sex_share_by_class.plot(kind="bar", stacked=True)
plt.xlabel('Pclass')
plt.ylabel('Percentage')

In [None]:
sns.catplot(x='Pclass', y='Survived', hue='Sex', aspect=2, data=train, kind='point', errorbar=None);

### Pclass, Sex & Embarked vs. Survival

In [None]:
tab = pd.crosstab(train['Embarked'], train['Pclass'])
print (tab)


In [None]:
# Turn each Embarked row of the crosstab into fractions of its row total,
# then draw the Pclass shares as one stacked bar per port.
row_totals = tab.sum(axis=1).astype(float)
class_share_by_port = tab.div(row_totals, axis=0)
class_share_by_port.plot(kind="bar", stacked=True)
plt.xlabel('Embarked')
plt.ylabel('Percentage')

In [None]:
sns.catplot(x='Pclass', y='Survived', hue='Embarked', aspect=2, data=train, kind='point', errorbar=None);

### Embarked vs. Survived

In [None]:
train.Embarked.value_counts()

In [None]:
train[['Embarked', 'Survived']].groupby(['Embarked']).mean()

In [None]:
sns.barplot(x='Embarked', y='Survived', data=train, errorbar=None)

### Parch vs. Survival

In [None]:
train.Parch.value_counts()

In [None]:
train.groupby('Parch').Survived.value_counts()

In [None]:
train[['Parch', 'Survived']].groupby(['Parch'], as_index=False).mean()

In [None]:
sns.barplot(x='Parch', y='Survived', errorbar=None, data=train) # errorbar=None hides the error bars

### SibSp vs. Survival

In [None]:
train.SibSp.value_counts()

In [None]:
train.groupby('SibSp').Survived.value_counts()

In [None]:
train[['SibSp', 'Survived']].groupby(['SibSp'], as_index=False).mean()

In [None]:
sns.barplot(x='SibSp', y='Survived', errorbar=None, data=train) # errorbar=None hides the error bars

### Age vs. Survival

In [None]:
# Age distribution split by survival, one violin panel per categorical feature,
# laid out side by side in a single 1x3 figure.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4))

# Arguments shared by all three panels; only the x feature and target axes differ.
violin_kwargs = dict(y="Age", hue="Survived", data=train, split=True)
sns.violinplot(x="Embarked", ax=ax1, **violin_kwargs)
sns.violinplot(x="Pclass", ax=ax2, **violin_kwargs)
sns.violinplot(x="Sex", ax=ax3, **violin_kwargs)

In [None]:
# Age histograms of survivors vs. non-survivors: all passengers, women only, men only.
# Sex is compared against the strings "male"/"female", so this cell must run
# before the cell that replaces Sex with numeric codes.
total_survived = train[train['Survived']==1]
total_not_survived = train[train['Survived']==0]
male_survived = train[(train['Survived']==1) & (train['Sex']=="male")]
female_survived = train[(train['Survived']==1) & (train['Sex']=="female")]
male_not_survived = train[(train['Survived']==0) & (train['Sex']=="male")]
female_not_survived = train[(train['Survived']==0) & (train['Sex']=="female")]

# Create the figure and subplots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

age_bins = range(0, 81, 1)  # one-year-wide bins with edges 0..80

# (target axes, survivors, non-survivors, x-axis label, panel title)
panels = [
    (axes[0, 0], total_survived, total_not_survived, 'Age', 'All Passengers'),
    (axes[0, 1], female_survived, female_not_survived, 'Female Age', 'Female Passengers'),
    (axes[1, 0], male_survived, male_not_survived, 'Male Age', 'Male Passengers'),
]

for ax, lived, died, xlabel, title in panels:
    # Missing ages are dropped so they do not reach the histogram.
    sns.histplot(lived['Age'].dropna().values, bins=age_bins, kde=False, color='blue', ax=ax, label='Survived')
    sns.histplot(died['Age'].dropna().values, bins=age_bins, kde=False, color='red', ax=ax, label='Not Survived')
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    ax.legend()

# Only three panels are used; hide the fourth instead of rendering an empty frame.
axes[1, 1].set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
train.info()

In [None]:
corr_matrix = train.select_dtypes(include=['int64', 'float64']).corr()
corr_matrix

In [None]:
plt.figure(figsize=(15,6))
sns.heatmap(corr_matrix, vmax=0.6, square=True, annot=True)

In [None]:
test.Name

In [None]:
# Both frames are processed together by the feature-engineering loops below.
train_test_data = [train, test]

# Pull the honorific out of Name: the run of letters that follows a space and
# is terminated by a period (e.g. "Mr" in ", Mr. ").
# - raw string: '\.' in a normal string literal is an invalid escape sequence
# - expand=False: return a Series (one capture group) rather than a one-column DataFrame
for dataset in train_test_data:
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)


In [None]:
train.head()

In [None]:
pd.crosstab(train['Title'], train['Sex'])

In [None]:
# Collapse the infrequent titles into 'Other' and fold alternate spellings
# into 'Miss' / 'Mrs', using one lookup table applied in a single replace.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col',
               'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {rare: 'Other' for rare in rare_titles}
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for dataset in train_test_data:
    dataset['Title'] = dataset['Title'].replace(title_aliases)

# Mean survival per consolidated title (training data only).
train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

In [None]:
# Numeric code for each consolidated title.
title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Other": 5}
for dataset in train_test_data:
    # Titles absent from the mapping come back as NaN from map(); those get code 5.
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(5)

In [None]:
train.head()

In [None]:
# Encode Sex numerically in both frames: female -> 0, male -> 1.
# map() yields NaN for any value that is not a key of the mapping, so running
# this cell again on the already-encoded column would turn it into all NaN.
sex_mapping = {'female': 0, 'male': 1}
for dataset in train_test_data:
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)

In [None]:
train.head()

In [None]:
train.Embarked.unique()

In [None]:
train.Embarked.value_counts()

In [None]:
# Fill missing Embarked values with 'S' in both frames.
# NOTE(review): presumably 'S' is the most frequent port per the value_counts above — confirm.
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

In [None]:
train.head()

In [None]:
train.Embarked.unique()

In [None]:
# Encode the port letters as integers: S -> 0, C -> 1, Q -> 2.
# Any value other than S/C/Q becomes NaN after map(), which makes astype(int) raise.
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

In [None]:
train.head()

In [None]:
# Impute missing ages with random integers drawn from [mean - std, mean + std)
# of each frame's own observed ages, then truncate Age to int.
age_rng = np.random.default_rng(0)  # fixed seed so the imputed ages are reproducible
for dataset in train_test_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_missing = dataset['Age'].isnull()

    if age_missing.any():
        age_null_random_list = age_rng.integers(
            int(age_avg - age_std), int(age_avg + age_std), size=int(age_missing.sum())
        )
        # Assign through .loc so the write lands in the frame itself; the chained
        # form dataset['Age'][mask] = ... may modify a temporary copy instead.
        dataset.loc[age_missing, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

# Five equal-width band edges are derived from the training ages and reused for
# every frame, so each row is banded by its own Age. Ages outside the training
# range would get a NaN band.
_, age_band_edges = pd.cut(train['Age'], 5, retbins=True)
for dataset in train_test_data:
    dataset['AgeBand'] = pd.cut(dataset['Age'], bins=age_band_edges)

# observed=False keeps one row per band, including bands with no passengers.
print (train[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False, observed=False).mean())

In [None]:
train.head()

In [None]:
# Replace raw ages with ordinal codes 0-4 using the cut points 16 / 32 / 48 / 64.
# The passes run in ascending order, so a code written by an earlier pass (<= 4)
# can never satisfy a later "> lower" test and be recoded within one run.
inner_age_ranges = ((16, 32, 1), (32, 48, 2), (48, 64, 3))

for dataset in train_test_data:
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    for lower, upper, code in inner_age_ranges:
        in_range = (dataset['Age'] > lower) & (dataset['Age'] <= upper)
        dataset.loc[in_range, 'Age'] = code
    dataset.loc[dataset['Age'] > 64, 'Age'] = 4

train.head()

In [None]:
# Fill missing fares in both frames with the median fare of the training set.
for dataset in train_test_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())

In [None]:
# Quartile edges are computed once from the training fares and reused for every
# frame, so each row is banded by its own Fare. Assigning pd.qcut(train['Fare'], 4)
# to the test frame would instead copy the training rows' bands by index.
# include_lowest=True mirrors qcut, which keeps the minimum fare inside the first
# band. Fares outside the training range would get a NaN band.
_, fare_band_edges = pd.qcut(train['Fare'], 4, retbins=True)
for dataset in train_test_data:
    dataset['FareBand'] = pd.cut(dataset['Fare'], bins=fare_band_edges, include_lowest=True)

# observed=False keeps one row per band, including bands with no passengers.
print (train[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False, observed=False).mean())

In [None]:
train.head()

Делаем маппинг между *Fare* и *FareBand*

In [None]:
# Replace each raw fare with an ordinal code 0-3 in both frames, then cast the column to int.
# NOTE(review): 7.91 / 14.454 / 31 look like the FareBand quartile edges printed above — confirm.
for dataset in train_test_data:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

train.head()

In [None]:
for dataset in train_test_data:
    dataset['FamilySize'] = dataset['SibSp'] +  dataset['Parch'] + 1

print (train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean())

In [None]:
# IsAlone is 1 for passengers whose FamilySize is exactly 1 and 0 for everyone else.
for dataset in train_test_data:
    travelling_alone = dataset['FamilySize'] == 1
    dataset['IsAlone'] = travelling_alone.astype('int64')

print (train[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean())

In [None]:
train.head(1)

In [None]:
test.head(1)

In [None]:
# Columns that have been replaced by engineered features or are not used for modelling.
features_drop = ['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'FareBand', 'AgeBand']

# PassengerId is removed from the training frame only; the test frame keeps it
# because the submission file is keyed on it.
train = train.drop(columns=features_drop + ['PassengerId'])
test = test.drop(columns=features_drop)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
plt.figure(figsize=(15,6))
sns.heatmap(train.corr(), vmax=0.6, square=True, annot=True)

In [None]:
X_train = train.drop('Survived', axis=1)
y_train = train['Survived']

X_test = test.drop("PassengerId", axis=1).copy()

X_train.shape, y_train.shape, X_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier

In [None]:
# Fit a logistic regression, predict the test set, and report accuracy as a percentage.
# NOTE: the score is computed on the same rows the model was fitted on, so it
# describes fit to the training data rather than performance on unseen passengers.
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred_log_reg = clf.predict(X_test)
acc_log_reg = round( clf.score(X_train, y_train) * 100, 2)
print (str(acc_log_reg) + ' percent')

SVC

In [None]:
clf = SVC()
clf.fit(X_train, y_train)
y_pred_svc = clf.predict(X_test)
acc_svc = round(clf.score(X_train, y_train) * 100, 2)
print (acc_svc)

Linear SVC

In [None]:
clf = LinearSVC()
clf.fit(X_train, y_train)
y_pred_linear_svc = clf.predict(X_test)
acc_linear_svc = round(clf.score(X_train, y_train) * 100, 2)
print (acc_linear_svc)

KNN

In [None]:
clf = KNeighborsClassifier(n_neighbors = 3)
clf.fit(X_train, y_train)
y_pred_knn = clf.predict(X_test)
acc_knn = round(clf.score(X_train, y_train) * 100, 2)
print (acc_knn)

Decision Tree

In [None]:
# Decision tree baseline. random_state fixes the tree's internal randomness so
# the fitted model, its predictions and its score are repeatable across runs.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred_decision_tree = clf.predict(X_test)
# Accuracy (in percent) on the training rows themselves, not a held-out estimate.
acc_decision_tree = round(clf.score(X_train, y_train) * 100, 2)
print (acc_decision_tree)

Random Forest

In [None]:
# Random forest baseline with 100 trees. random_state fixes the bootstrap and
# feature sampling so the predictions and score are repeatable across runs.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
y_pred_random_forest = clf.predict(X_test)
# Accuracy (in percent) on the training rows themselves, not a held-out estimate.
acc_random_forest = round(clf.score(X_train, y_train) * 100, 2)
print (acc_random_forest)

Gaussian NB

In [None]:
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred_gnb = clf.predict(X_test)
acc_gnb = round(clf.score(X_train, y_train) * 100, 2)
print (acc_gnb)

Perceptron

In [None]:
clf = Perceptron(max_iter=5, tol=None)
clf.fit(X_train, y_train)
y_pred_perceptron = clf.predict(X_test)
acc_perceptron = round(clf.score(X_train, y_train) * 100, 2)
print (acc_perceptron)

SGD

In [None]:
# Linear model trained with stochastic gradient descent for exactly 5 epochs
# (tol=None disables early stopping). random_state fixes the per-epoch shuffling
# so the predictions and score are repeatable across runs.
clf = SGDClassifier(max_iter=5, tol=None, random_state=0)
clf.fit(X_train, y_train)
y_pred_sgd = clf.predict(X_test)
# Accuracy (in percent) on the training rows themselves, not a held-out estimate.
acc_sgd = round(clf.score(X_train, y_train) * 100, 2)
print (acc_sgd)

In [None]:
from sklearn.metrics import confusion_matrix
import itertools

# Refit a random forest and inspect its confusion matrix on the training data.
# random_state makes the fit, and therefore the matrix, repeatable across runs.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
y_pred_random_forest_training_set = clf.predict(X_train)
acc_random_forest = round(clf.score(X_train, y_train) * 100, 2)
# %.2f keeps the two decimals that round() preserved; %i would truncate them.
print ("Accuracy: %.2f %% \n"%acc_random_forest)

# Display names follow the label order used for the matrix below:
# 0 (not survived) first, 1 (survived) second.
class_names = ['Not Survived', 'Survived']

# Rows are true classes and columns are predicted classes, both in the order [0, 1].
cnf_matrix = confusion_matrix(y_train, y_pred_random_forest_training_set, labels=[0, 1])
# Divide each row by its total so every cell is the share of that true class.
cnf_matrix_percent = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]

# These label lists are reused by the later confusion-matrix cells.
true_class_names = ['True Not Survived', 'True Survived']
predicted_class_names = ['Predicted Not Survived', 'Predicted Survived']

df_cnf_matrix = pd.DataFrame(cnf_matrix,
                             index = true_class_names,
                             columns = predicted_class_names)

df_cnf_matrix_percent = pd.DataFrame(cnf_matrix_percent,
                                     index = true_class_names,
                                     columns = predicted_class_names)

plt.figure(figsize = (15,5))

# Left panel: raw counts. Right panel: row-normalised fractions.
ax1 = plt.subplot(121)
sns.heatmap(df_cnf_matrix, annot=True, fmt='d')
ax1.title.set_text('Random Forest: values')

ax2 = plt.subplot(122)
sns.heatmap(df_cnf_matrix_percent, annot=True)
ax2.title.set_text('Random Forest: %')

Сравним этот результат с результатом Perceptron

In [None]:
# Refit the perceptron and inspect its confusion matrix on the training data.
clf = Perceptron(max_iter=5, tol=None)
clf.fit(X_train, y_train)
# Stored under its own name so the test-set predictions kept in
# y_pred_perceptron by the earlier Perceptron cell are not overwritten.
y_pred_perceptron_training_set = clf.predict(X_train)

acc_perceptron = round(clf.score(X_train, y_train) * 100, 2)
# %.2f keeps the two decimals that round() preserved; %i would truncate them.
print ("Accuracy: %.2f %% \n"%acc_perceptron)


# Rows are true classes, columns are predicted classes.
cnf_matrix = confusion_matrix(y_train, y_pred_perceptron_training_set)
# Divide each row by its total so every cell is the share of that true class.
cnf_matrix_percent = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]

# true_class_names / predicted_class_names come from the random forest
# confusion-matrix cell, which must have been run first.
df_cnf_matrix = pd.DataFrame(cnf_matrix,
                             index = true_class_names,
                             columns = predicted_class_names)

df_cnf_matrix_percent = pd.DataFrame(cnf_matrix_percent,
                                     index = true_class_names,
                                     columns = predicted_class_names)

plt.figure(figsize = (15,5))

# Left panel: raw counts. Right panel: row-normalised fractions.
ax1 = plt.subplot(121)
sns.heatmap(df_cnf_matrix, annot=True, fmt='d')
ax1.title.set_text('Perceptron: values')

ax2 = plt.subplot(122)
sns.heatmap(df_cnf_matrix_percent, annot=True)
ax2.title.set_text('Perceptron: %')

## Сравнение моделей



In [None]:
# Collect every model's accuracy into one table and rank the models by it.
# All scores were computed on the training rows themselves. acc_random_forest
# and acc_perceptron hold the values from the confusion-matrix cells above,
# which refit those models and overwrite the earlier scores.
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Support Vector Machines', 'Linear SVC',
              'KNN', 'Decision Tree', 'Random Forest', 'Naive Bayes',
              'Perceptron', 'Stochastic Gradient Decent'],

    'Score': [acc_log_reg, acc_svc, acc_linear_svc,
              acc_knn,  acc_decision_tree, acc_random_forest, acc_gnb,
              acc_perceptron, acc_sgd]
    })

models.sort_values(by='Score', ascending=False)

# Ансамбли Моделей


In [None]:
from sklearn.ensemble import VotingClassifier


# Majority-vote ensemble of three base classifiers.
model1 = Perceptron(max_iter=5, tol=None)
# random_state fixes SGD's per-epoch shuffling so the ensemble is repeatable across runs.
model2 = SGDClassifier(max_iter=5, tol=None, random_state=0)
model3 = GaussianNB()

# voting='hard': the predicted class is the one chosen by most of the base models.
model = VotingClassifier(estimators=[('perceptron', model1), ('sdg', model2), ('nb', model3)], voting='hard')
model.fit(X_train,y_train)

y_pred_voter = model.predict(X_train)
acc_voter = round(model.score(X_train, y_train) * 100, 2)
# %.2f keeps the two decimals that round() preserved; %i would truncate them.
print ("Accuracy: %.2f %% \n"%acc_voter)


# Rows are true classes, columns are predicted classes.
cnf_matrix = confusion_matrix(y_train, y_pred_voter)
# Divide each row by its total so every cell is the share of that true class.
cnf_matrix_percent = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]

# true_class_names / predicted_class_names come from the random forest
# confusion-matrix cell, which must have been run first.
df_cnf_matrix = pd.DataFrame(cnf_matrix,
                             index = true_class_names,
                             columns = predicted_class_names)

df_cnf_matrix_percent = pd.DataFrame(cnf_matrix_percent,
                                     index = true_class_names,
                                     columns = predicted_class_names)

plt.figure(figsize = (15,5))

# Left panel: raw counts. Right panel: row-normalised fractions.
ax1 = plt.subplot(121)
sns.heatmap(df_cnf_matrix, annot=True, fmt='d')
ax1.title.set_text('Voting: values')

ax2 = plt.subplot(122)
sns.heatmap(df_cnf_matrix_percent, annot=True)
ax2.title.set_text('Voting: %')

In [None]:
test.head()

In [None]:
# Final model: this fitted forest is the `clf` used to build the submission below.
# random_state fixes the bootstrap and feature sampling so the submitted
# predictions can be regenerated exactly.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
y_pred_random_forest_training_set = clf.predict(X_train)
# Accuracy (in percent) on the training rows themselves, not a held-out estimate.
acc_random_forest = round(clf.score(X_train, y_train) * 100, 2)
# %.2f keeps the two decimals that round() preserved; %i would truncate them.
print ("Accuracy: %.2f %% \n"%acc_random_forest)

In [None]:
# Two-column submission frame: PassengerId from the test frame and the predicted Survived label.
# `clf` is whichever classifier was fitted most recently — the random forest from
# the previous cell when the notebook is run top to bottom.
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": clf.predict(X_test)
    })

In [None]:
submission.to_csv('submission.csv', index=False)