In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _subdirs, files in os.walk('/kaggle/input'):
    # Print the full path of every file found under the input directory
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Problem definition
## 1. Load datasets

In [2]:
# Read the Titanic CSVs; `test_ids` is a separate read of the test file
train, test, test_ids = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

# Encode the data

In [3]:
train_test_data = [train, test]  # both frames receive the same Title treatment

# Title -> numeric code: 0 = Mr, 1 = Miss, 2 = Mrs, 3 = every other listed title
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2,
                 "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3, "Countess": 3,
                 "Ms": 3, "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona": 3, "Mme": 3, "Capt": 3, "Sir": 3}

for dataset in train_test_data:
    # Raw string: `\.` must reach the regex engine as an escaped dot rather than
    # being treated as an (invalid) Python string escape sequence.
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Titles missing from the mapping become NaN
    dataset['Title'] = dataset['Title'].map(title_mapping)
    # Name is not needed once Title has been derived from it
    dataset.drop('Name', axis=1, inplace=True)

In [4]:
from sklearn.preprocessing import OrdinalEncoder

# Encode only the non-numeric columns. Numeric columns keep their real values;
# rank-encoding them would make later steps (age bins, family-size sum)
# operate on ranks instead of the actual numbers.
categorical_cols = test.select_dtypes(exclude='number').columns

# Fit on train only and reuse the learned categories for test so that the same
# raw value always maps to the same code in both frames. Values that appear
# only in test are encoded as NaN instead of raising.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
train[categorical_cols] = encoder.fit_transform(train[categorical_cols])
test[categorical_cols] = encoder.transform(test[categorical_cols])

train.head()

# Imputation

In [5]:
# Number of missing values per training column
train.isna().sum()

In [6]:
# Number of missing values per test column
test.isna().sum()

In [7]:
# Fill missing ages with the median age of each title group (Mr, Mrs, Miss, Others).
# The result is assigned back to the column: calling fillna(..., inplace=True) on
# a selected column is a chained assignment that acts on an intermediate object
# and does not update the frame when pandas copy-on-write is active.
train["Age"] = train["Age"].fillna(train.groupby("Title")["Age"].transform("median"))
test["Age"] = test["Age"].fillna(test.groupby("Title")["Age"].transform("median"))

In [8]:
# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly or it is silently discarded.
display(train.head(30))
# Per-row median age of the row's title group
train.groupby("Title")["Age"].transform("median")

In [9]:
# Embarked is a categorical code, so fill gaps with the most frequent value;
# a mean would produce a fractional value that corresponds to no category.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode().iloc[0])

In [10]:
# Replace missing test fares with the mean of the observed test fares
test.loc[test['Fare'].isna(), 'Fare'] = test['Fare'].mean()

## Combine SibSp and Parch into the family size

In [11]:
# Family size = siblings/spouses + parents/children + the passenger themselves
for frame in (train, test):
    frame["FamilySize"] = frame["SibSp"].add(frame["Parch"]).add(1)

## Drop unnecessary data

In [12]:
# SibSp and Parch are now summarised by FamilySize; Cabin and PassengerId are discarded
unused_columns = ['Cabin', 'PassengerId', 'SibSp', 'Parch']
train, test = (frame.drop(columns=unused_columns) for frame in (train, test))

# Summarize data


In [13]:
# Preview the first 10 rows of the training frame
train.head(10)

In [14]:
# Dimensions of the training data as (rows, columns)
print(train.shape)

In [15]:
# Data type of each training column
print(train.dtypes)

   ## 1. Using Descriptive Statistics

In [16]:
# Descriptive statistics for the numerical columns
train.describe()

In [17]:
columns = train.columns

# For every column, print how many rows fall under each distinct value
for col, sizes in ((c, train.groupby(c).size()) for c in columns):
    print("In the column :", col)
    print(sizes)
    print("")

In [18]:
# Correlation between columns
from pandas import set_option

set_option('display.width', 100)
# Use the fully qualified option key: the bare 'precision' pattern also matches
# 'styler.format.precision' in newer pandas and raises OptionError there.
set_option('display.precision', 3)
corr = train.corr(method='pearson')
print(corr)

In [19]:
# Checking the distribution.
# Values that are far from zero are more skewed;
# this will help later in the data preparation.
skewness = train.skew()
print(skewness)

## 2. Data Visualization

In [20]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [21]:
# Histogram of every column
# The figure size must be set before plotting; setting it afterwards only
# affects figures created later, leaving this one at the default size.
plt.rcParams['figure.figsize'] = [20, 20]
train.hist()
plt.show()

In [22]:
# Histogram of every column, restricted to passengers who survived
# Set the figure size before plotting so this cell does not depend on an
# earlier cell having already changed the default.
plt.rcParams['figure.figsize'] = [20, 20]
train[train['Survived'] == 1].hist()
plt.show()

In [23]:
# Bin the age

def bin_age(ages):
    """Map ages to ordinal bins, leaving missing values as NaN.

    Bins (right edge inclusive):
        0: age <= 15
        1: 15 < age <= 25
        2: 25 < age <= 55
        3: 55 < age <= 70
        4: age > 70

    Parameters
    ----------
    ages : pandas.Series of numeric ages.

    Returns
    -------
    pandas.Series of float bin codes aligned with the index of `ages`.
    """
    edges = [-np.inf, 15, 25, 55, 70, np.inf]
    # labels=False returns the integer bin position; cast to float so the
    # column dtype is the same whether or not NaN is present.
    return pd.cut(ages, bins=edges, labels=False).astype(float)


# One helper for both frames replaces the duplicated .loc chains, whose stray
# trailing commas assigned 1-tuples instead of scalars.
test['Age'] = bin_age(test['Age'])
train['Age'] = bin_age(train['Age'])

In [24]:
# Density plot - checking the distribution
# Set the figure size before plotting so that it applies to this figure
plt.rcParams['figure.figsize'] = [20, 20]
train.plot(kind='density', subplots=True, sharex=False)
plt.show()

In [25]:
# Box and whiskers - check for outliers
# Dots are outliers
# Set the figure size before plotting so that it applies to this figure
plt.rcParams['figure.figsize'] = [20, 20]
train.plot(kind='box', subplots=True, sharex=False, sharey=False)
plt.show()

## Remove outliers

### bar graph

In [26]:
def bar_chart(feature):
    """Draw a stacked bar chart of `feature` value counts split by survival.

    Reads the module-level `train` frame. One bar is drawn for passengers with
    Survived == 1 and one for Survived == 0; each bar is stacked by the
    distinct values of `feature`.
    """
    outcome_counts = [
        train.loc[train['Survived'] == flag, feature].value_counts()
        for flag in (1, 0)
    ]
    counts_by_outcome = pd.DataFrame(outcome_counts)
    counts_by_outcome.index = ['Survived', 'Dead']
    counts_by_outcome.plot(kind='bar', stacked=True, figsize=(10, 5))

In [27]:
# Survived vs. dead counts, stacked by Sex
bar_chart('Sex')

In [28]:
# Survived vs. dead counts, stacked by Age
bar_chart('Age')

In [29]:
# Survived vs. dead counts, stacked by Pclass
bar_chart('Pclass')

In [30]:
# Survived vs. dead counts, stacked by FamilySize
bar_chart('FamilySize')

In [31]:
# Survived vs. dead counts, stacked by Embarked
bar_chart('Embarked')

In [32]:
# Scatter matrix, one point per passenger
from pandas.plotting import scatter_matrix

# Point colour encodes the outcome: red for Survived == 0, green for Survived == 1
colors = {0:'red', 1:'green'}
point_colors = train['Survived'].map(colors)
scatter_matrix(train, color=point_colors)
plt.show()

In [33]:
# Correlation matrix
import seaborn as sn

# Set the figure size before the figure is created so that it takes effect
plt.rcParams['figure.figsize'] = [30, 30]

correlations = train.corr()
fig, ax = plt.subplots()
# A single annotated heatmap pinned to the full correlation range [-1, 1].
# Tick labels come from the correlation frame itself, so they stay aligned with
# the cells and need no hard-coded column count. The earlier matshow call was
# drawn on the same axes and then covered by the heatmap, so it is dropped.
sn.heatmap(correlations, annot=True, vmin=-1, vmax=1, ax=ax)
plt.show()

# Data Prep

## Split the data into input and output

In [34]:
# Target column on one side, every remaining column on the other
output_data = train['Survived']
input_data = train.drop(columns='Survived')

## Scaling

In [35]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0,1))
# Learn each column's min/max from the training inputs only ...
scaled_input = scaler.fit_transform(input_data)
# ... and apply the same scaling to test. Re-fitting on test would map the same
# raw value to different scaled values in the two frames.
scaled_test = scaler.transform(test)

In [36]:
rows, cols = scaled_input.shape

# Copy each scaled column back under its original name. The index runs over
# columns; bounding the loop by the row count only worked because zip()
# stops at the shorter sequence.
for i, column in zip(range(cols), input_data.columns):
    input_data[column] = scaled_input[:, i]

In [37]:
rows, cols = scaled_test.shape

# Copy each scaled column back under its original name. The index runs over
# columns; bounding the loop by the row count only worked because zip()
# stops at the shorter sequence.
for i, column in zip(range(cols), test.columns):
    test[column] = scaled_test[:, i]

# Feature Selection

In [38]:
# Remove the columns that are not used as model features
features_drop = ['Ticket']
input_data, test = (frame.drop(columns=features_drop) for frame in (input_data, test))

# Model

In [39]:
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier

x = input_data.values
y = output_data.values
kfold = KFold(n_splits=10)

# Fixed seed for the randomised estimators so that a fresh
# "Restart & Run All" reproduces the same scores and predictions.
SEED = 42

# Spot-check algorithms
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=SEED)),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('RF', RandomForestClassifier(200, random_state=SEED)),
]

# Combine all spot-checked models into one voting ensemble and cross-validate it
ensemble = VotingClassifier(models)
result = cross_val_score(ensemble, x, y, cv=kfold)
result.mean()

In [40]:
# Cross-validate every individual model and report mean accuracy (std in brackets)
scoring = 'accuracy'
kfold = KFold(n_splits=10)
results, names = [], []
for name, model in models:
    cv_results = cross_val_score(model, x, y, cv=kfold, scoring=scoring)
    names.append(name)
    results.append(cv_results)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [41]:
ensemble.fit(x, y)
# Predict on a plain array with exactly the training feature columns:
# - matches how the ensemble was fitted (on `x`, an ndarray without feature names)
# - keeps this cell re-runnable after a 'Survived' column is added to `test`
predictions = ensemble.predict(test[input_data.columns].values)

In [42]:
# Store the predicted outcome next to the test features
test = test.assign(Survived=predictions)

In [43]:
# Pair the PassengerId values from `test_ids` with the predictions and save them
submission_columns = {'PassengerId': test_ids['PassengerId'], 'Survived': predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

In [44]:
# Display the test frame, which now includes the predicted 'Survived' column
test