In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the labelled training split and the unlabelled test split.
# NOTE(review): both paths are absolute and machine-specific; adjust them for your environment.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('C:/Users/.../Titanic/train.csv', 'C:/Users/.../Titanic/test.csv')
)

In [None]:
# List the column names available in the training frame.
print(train.columns.to_numpy())

In [None]:
# Preview the first rows of the freshly loaded training data.
train.head()

In [None]:
# Preview the last rows of the freshly loaded training data.
train.tail()

In [None]:
# Print dtype and non-null summaries for train, then test, separated by a line holding one space.
for position, frame in enumerate((train, test)):
    if position:
        print(' ')
    frame.info()

In [None]:
# Default summary statistics for the training frame.
train.describe()

In [None]:
# Summary statistics restricted to object-dtype columns (include=['O']).
train.describe(include=['O'])

In [None]:
# Mean of Survived per passenger class, ordered by class.
train.groupby('Pclass')[['Survived']].mean().sort_index()

In [None]:
# Mean of Survived per sex.
train.groupby('Sex')[['Survived']].mean()

In [None]:
# Mean of Survived per Parch value, highest survival rate first.
train.groupby('Parch')[['Survived']].mean().sort_values('Survived', ascending=False)

In [None]:
# Mean of Survived per SibSp value, highest survival rate first.
train.groupby('SibSp')[['Survived']].mean().sort_values('Survived', ascending=False)

In [None]:
# One 20-bin age histogram per value of Survived, laid out side by side.
ages = sns.FacetGrid(data=train, col='Survived')
ages.map(plt.hist, 'Age', bins=20)

In [None]:
# Age histograms on a Pclass (rows) x Survived (columns) grid.
grid = sns.FacetGrid(data=train, row='Pclass', col='Survived')
grid.map(plt.hist, 'Age', bins=20, color='green')
grid.add_legend()

In [None]:
# Survival rate per passenger class, split by sex, with one row of plots per port of embarkation.
grid = sns.FacetGrid(train, row='Embarked', height=2, aspect=1.6)
# FacetGrid.map calls pointplot once per facet on that facet's subset only. Without an
# explicit order/hue_order each facet infers its own category order, so the same x
# position or colour can mean different things in different rows. Pin both.
grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex',
         order=[1, 2, 3], hue_order=['male', 'female'])
grid.add_legend()

In [None]:
# Mean fare per sex on an Embarked (rows) x Survived (columns) grid.
grid = sns.FacetGrid(train, row='Embarked', col='Survived')
# Each facet is drawn from its own subset, so fix the bar order explicitly; otherwise
# the male/female bars may swap places between facets.
grid.map(sns.barplot, 'Sex', 'Fare', order=['male', 'female'], color='red')
grid.add_legend()

In [None]:
# Remove the Ticket and Cabin columns from both splits.
train, test = (frame.drop(columns=['Ticket', 'Cabin']) for frame in (train, test))

In [None]:
# Keep both frames in one list so later cells can apply the same transformation to each.
# The list holds references to the current train/test objects; it is rebuilt whenever
# either name is rebound.
merge = [train, test]

In [None]:
# Derive a Title column from Name: the run of letters that follows a space and is
# terminated by a period (e.g. " Mr.").
for dataset in merge:
    # Raw string: '\.' in a normal string literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Cross-tabulate the extracted titles against sex for the training split.
pd.crosstab(train['Title'], train['Sex'])

In [None]:
# Collapse the listed titles into a single 'Distinct' bucket and fold the
# Mlle/Ms/Mme variants into Miss/Mrs, using one replacement table.
title_aliases = {
    rare: 'Distinct'
    for rare in ('Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona')
}
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for dataset in merge:
    dataset['Title'] = dataset['Title'].replace(title_aliases)

In [None]:
# Mean of Survived per consolidated title, highest survival rate first.
train.groupby('Title')[['Survived']].mean().sort_values('Survived', ascending=False)

In [None]:
# Integer code for each consolidated title; any title not in the table becomes 0.
title_mapping = {'Mr':1, 'Miss':2, 'Mrs':3, 'Master':4, 'Distinct':5}
for dataset in merge:
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0)

In [None]:
# Preview train after Title has been replaced by its integer code.
train.head()

In [None]:
# Name is dropped from both splits. PassengerId is dropped from train only;
# test keeps it because the output frame built later reads test["PassengerId"].
train = train.drop(columns=['Name', 'PassengerId'])
test = test.drop(columns=['Name'])
# train/test now name new objects, so rebuild the shared list.
merge = [train, test]

In [None]:
# Encode Sex as an integer: female -> 1, male -> 0.
# The cast to int fails if any value falls outside the two mapped strings,
# which is also what happens if this cell is executed twice.
for dataset in merge:
    sex_codes = dataset['Sex'].map({'female':1, 'male':0})
    dataset['Sex'] = sex_codes.astype(int)

In [None]:
# Preview train after Sex has been encoded as 0/1.
train.head()

In [None]:
# Most frequent non-null embarkation port in the training split (first mode if tied).
freq_port = train['Embarked'].dropna().mode().iloc[0]

In [None]:
# Fill missing Embarked values in both splits with the training-set mode.
for dataset in merge:
    dataset['Embarked'] = dataset['Embarked'].fillna(value=freq_port)

# Mean of Survived per port, highest survival rate first.
train.groupby('Embarked')[['Survived']].mean().sort_values('Survived', ascending=False)

In [None]:
# Encode the embarkation port as an integer: S -> 0, C -> 1, Q -> 2.
for dataset in merge:
    port_codes = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    dataset['Embarked'] = port_codes.astype(int)

In [None]:
# Preview train after Embarked has been encoded as an integer.
train.head()

In [None]:
# Impute missing ages with random integers drawn from [mean - std, mean + std) of the
# observed ages of the same split, then cast Age to int.
# A locally seeded generator keeps the imputed values identical on every
# Restart & Run All without touching NumPy's global random state.
age_rng = np.random.default_rng(42)

for dataset in merge:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_missing = dataset['Age'].isnull()
    age_null_count = int(age_missing.sum())
    # Integer bounds are passed explicitly rather than relying on implicit float truncation.
    age_null_random_value = age_rng.integers(
        int(age_avg - age_std), int(age_avg + age_std), size=age_null_count
    )
    # Single .loc assignment instead of dataset['Age'][mask] = ...: chained indexing
    # raises SettingWithCopyWarning and does not write through under Copy-on-Write,
    # which would leave NaNs behind and make the int cast below fail.
    dataset.loc[age_missing, 'Age'] = age_null_random_value
    dataset['Age'] = dataset['Age'].astype(int)

In [None]:
# Split the training ages into five equal-width intervals (exploratory column only).
train['AgeBand'] = pd.cut(train['Age'], bins=5)

In [None]:
# Replace Age with an ordinal band code:
#   <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, >64 -> 4
age_edges = [float('-inf'), 16, 32, 48, 64, float('inf')]

for dataset in merge:
    # Masks are built from a snapshot so that codes already written cannot be re-matched.
    raw_age = dataset['Age'].copy()
    for band, (lower, upper) in enumerate(zip(age_edges[:-1], age_edges[1:])):
        dataset.loc[(raw_age > lower) & (raw_age <= upper), 'Age'] = band

In [None]:
# Preview the first ten training rows after Age has been replaced by its band code.
train.head(10)

In [None]:
# The exploratory AgeBand column is no longer needed; drop it and refresh the
# shared list so it points at the new train object.
train = train.drop(columns=['AgeBand'])
merge = [train, test]

In [None]:
# FamilySize = siblings/spouses + parents/children + the passenger themself.
for dataset in merge:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

# Mean of Survived per family size, highest first. This must sit outside the loop:
# an expression inside a loop body is evaluated and discarded, so the notebook
# showed no table for this cell.
train[['FamilySize', 'Survived']].groupby(['FamilySize']).mean().sort_values(by='Survived', ascending=False)

In [None]:
# IsAlone flags passengers whose FamilySize is exactly 1.
for dataset in merge:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

# Mean of Survived for alone vs. accompanied passengers. Kept outside the loop so it
# is the cell's last expression and is actually displayed.
train[['IsAlone', 'Survived']].groupby(['IsAlone']).mean()

In [None]:
# Parch, SibSp and FamilySize are removed from both splits; IsAlone is kept.
train, test = (
    frame.drop(columns=['Parch', 'SibSp', 'FamilySize']) for frame in (train, test)
)
merge = [train, test]

In [None]:
# Interaction feature: product of the Age band code and Pclass.
for dataset in merge:
    dataset['Age*Class'] = dataset.Age * dataset.Pclass

# Show the new feature next to its inputs. Moved out of the loop body, where the
# expression was evaluated on every iteration and its result thrown away.
train.loc[:, ['Age*Class', 'Age', 'Pclass']].head(10)

In [None]:
# Preview the first ten training rows. A bare last expression gives the rich table
# rendering used by the other preview cells; print() falls back to plain text.
train.head(10)


In [None]:
# Fill missing test fares with the median test fare (median() already skips NaN).
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that chained in-place form emits a FutureWarning on pandas 2.x and does
# not modify `test` at all under Copy-on-Write.
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

In [None]:
# Split training fares into quartiles and show the mean of Survived per quartile,
# ordered from the lowest fare band to the highest.
train['FareBand'] = pd.qcut(train['Fare'], q=4)
train.groupby('FareBand')[['Survived']].mean().sort_index()

In [None]:
# Replace Fare with an ordinal band code in both splits:
#   <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, >31 -> 3
for dataset in merge:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

# Drop the exploratory FareBand column once, after the loop. Inside the loop the drop
# ran on every iteration, and the second pass raised KeyError because FareBand had
# already been removed from train on the first pass.
train = train.drop(['FareBand'], axis=1)
merge = [train, test]

In [None]:
# Preview the first ten training rows after Fare has been replaced by its band code.
train.head(10)

In [None]:
# Column dtypes and non-null counts of the prepared training frame.
train.info()

In [None]:
# Preview the first ten rows of the prepared test frame.
test.head(10)

In [None]:
# Column dtypes and non-null counts of the prepared test frame.
test.info()

In [None]:
# Features are every training column except the Survived label.
y_train = train["Survived"]
x_train = train.drop(columns="Survived")
# PassengerId is an identifier, not a feature; drop it from the test features.
x_test = test.drop(columns="PassengerId").copy()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier

In [None]:
#Logistic regression sample
# Fit on the training features and predict the test set. acc_log is the accuracy on
# the same data the model was fitted on, as a percentage rounded to two decimals.

logreg = LogisticRegression().fit(x_train, y_train)
y_pred = logreg.predict(x_test)
acc_log = round(100 * logreg.score(x_train, y_train), 2)
acc_log

In [None]:
#Naive Bayes Classifier
# Fit on the training features and predict the test set. acc_gaussian is the accuracy
# on the training data itself, as a percentage rounded to two decimals.

gaussian = GaussianNB().fit(x_train, y_train)
y_pred = gaussian.predict(x_test)
acc_gaussian = round(100 * gaussian.score(x_train, y_train), 2)
acc_gaussian

In [None]:
#Support Vector Machine
# Fit on the training features and predict the test set. acc_svc is the accuracy on
# the training data itself, as a percentage rounded to two decimals.

svc = SVC().fit(x_train, y_train)
y_pred = svc.predict(x_test)
acc_svc = round(100 * svc.score(x_train, y_train), 2)
acc_svc

In [None]:
#KNN
# 3-nearest-neighbours classifier. acc_knn is the accuracy on the training data
# itself, as a percentage rounded to two decimals.

knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
y_pred = knn.predict(x_test)
acc_knn = round(100 * knn.score(x_train, y_train), 2)
acc_knn

In [None]:
#Perceptron
# A default SGDClassifier() optimises hinge loss with an L2 penalty, i.e. a linear SVM,
# not a perceptron, so the "Perceptron" score was mislabelled. The settings below are
# the ones scikit-learn documents as equivalent to Perceptron(), using only the class
# this notebook already imports.

perceptron = SGDClassifier(loss='perceptron', penalty=None,
                           learning_rate='constant', eta0=1.0)
perceptron.fit(x_train, y_train)
y_pred = perceptron.predict(x_test)
# Accuracy on the training data itself, as a percentage rounded to two decimals.
acc_perceptron = round(perceptron.score(x_train, y_train) * 100, 2)
acc_perceptron

In [None]:
#Random Forest
# 100-tree random forest fitted on the training features.

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)
# Accuracy on the training data itself, as a percentage rounded to two decimals.
# (A second score() call whose result was discarded has been removed.)
acc_random_forest = round(random_forest.score(x_train, y_train) * 100, 2)
acc_random_forest

In [None]:
# Training-set accuracy of every fitted model, best first.
model_scores = [
    ('Support Vector Machines', acc_svc),
    ('KNN', acc_knn),
    ('Logistic Regression', acc_log),
    ('Random Forest', acc_random_forest),
    ('Naive Bayes', acc_gaussian),
    ('Perceptron', acc_perceptron),
]
models = pd.DataFrame(model_scores, columns=['Model', 'Score'])
models.sort_values(by='Score', ascending=False)

In [None]:
# Raise the row display limit so the whole prediction table prints.
pd.set_option('display.max_rows', 500)
# Take predictions from the random forest explicitly. The previous version used
# y_pred, which holds the output of whichever model cell ran most recently (the random
# forest only when the notebook is run top to bottom).
output = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": random_forest.predict(x_test),
})
print(output)

In [None]:
# Write only the PassengerId and Survived columns. Without index=False, to_csv also
# writes the DataFrame's row index as an unnamed leading column.
output.to_csv('RESULTS.csv', index=False)