In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![Estonia-ferry-1280x720.jpg](attachment:Estonia-ferry-1280x720.jpg)

# **A brief story**

**MS Estonia Disaster:**

The MS Estonia sank in September 1994 while crossing the Baltic Sea from Tallinn, the Estonian capital, to her intended destination of Stockholm. At the time of the disaster nearly 1,000 people were aboard, passengers and crew included, and only a small proportion of them were successfully rescued from the wreck of the unfortunate vessel.

The voyage coincided with turbulent weather over the Baltic Sea. Heavy gales lashed the vessel, making steering difficult while she tried to maintain her speed. Although accounts agree that the weather played a major role in destabilising the ship, sources vary in their description of how the vessel started to heel.

* I know that Profiling (pandas-profiling) is an excellent tool, but I prefer to dissect the data manually.

# **Importing the libraries**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the Estonia passenger list from the Kaggle input directory into a DataFrame.
passenger_list_csv = '../input/passenger-list-for-the-estonia-ferry-disaster/estonia-passenger-list.csv'
df = pd.read_csv(passenger_list_csv)

In [None]:
# Preview the first rows to see which columns are available and what the values look like.
df.head()

In [None]:
# Summary statistics of the frame's columns (as produced by DataFrame.describe with its defaults).
df.describe()

# **Let's check whether there is any null data**

In [None]:
# Visual null check: every cell of the frame is drawn as null / not-null,
# so a missing value would show up as a differently coloured stripe.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    df.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
    ax=ax,
)

# Let's check whether there really is no null data:

In [None]:
# Number of missing values per column (isna is the same check as isnull).
df.isna().sum()

# None, cool, let's go on.

In [None]:
# Encode the two categorical columns as 0/1 integers so they can be used in the
# correlation matrix and as model features:
#   Sex:      'M' -> 1, 'F' -> 0
#   Category: 'P' -> 1, 'C' -> 0
# The result is assigned back to the frame instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series
# obtained by chained indexing, which triggers a FutureWarning in pandas 2.2 and
# does not update `df` at all once Copy-on-Write is active.
# `astype(int)` pins a numeric dtype regardless of how `replace` infers the
# result, and fails loudly if a value outside the mapping is still present.
# Re-running this cell is safe: already-encoded 0/1 values are left unchanged.
df['Sex'] = df['Sex'].replace({'M': 1, 'F': 0}).astype(int)
df['Category'] = df['Category'].replace({'P': 1, 'C': 0}).astype(int)

# As you can see above, I replaced the 'Sex' and 'Category' values with binary numbers. Now, let's check the correlations.

In [None]:
# Pairwise correlation of the numeric columns only.
# numeric_only=True is required on pandas >= 2.0, where the default no longer
# silently drops text columns (names, country) and corr() raises on them instead.
df.corr(numeric_only=True)

In [None]:
# Annotated heatmap of the correlation matrix.
# Restrict the correlation to numeric columns: on pandas >= 2.0, corr() raises
# on the text columns (names, country) unless numeric_only=True is passed.
plt.figure(figsize=(10,6))
sns.heatmap(df.corr(numeric_only=True),annot=True)

# Hmm, the most strongly correlated pair is Age and Category. Let's keep digging...

In [None]:
# Bar chart of Age for each survival outcome (bar height is seaborn's default estimate per group).
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='Survived', y='Age', ax=ax)

# * Unfortunately, the people who died were about 40 years old on average, while the few who survived were about 30 years old on average.

In [None]:
# Bar chart of Age per Category, with separate bars for each survival outcome.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='Category', y='Age', hue='Survived', ax=ax)

# That's the answer. The average age is almost the same across passengers and crew, with the exception of the passengers who survived.

In [None]:
# Distinct values of the Country column, in order of first appearance.
pd.unique(df['Country'])

# * There's a great variety of people from many countries

In [None]:
# Age distribution for each survival outcome, split by Sex.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Survived', y='Age', hue='Sex', ax=ax)

# * Above we can see more clearly that the average age of those who died, both male and female, is over 40 years. The survivors, on the other hand, are under 40 years old.


# How many passengers died according to nationality?

# The top five countries

In [None]:
# Five most frequent countries among rows with Category == 1 and Survived == 0.
did_not_survive = df['Survived'] == 0
category_is_one = df['Category'] == 1
df.loc[did_not_survive & category_is_one, 'Country'].value_counts().head(5)

# Of these, how many were adults?

In [None]:
# Same ranking, restricted to adults (Age >= 18).
(
    df.query("Survived == 0 and Category == 1 and Age >= 18")['Country']
    .value_counts()
    .head(5)
)

# How many were children?

In [None]:
# Same ranking, restricted to children (Age < 18).
did_not_survive = df['Survived'] == 0
category_is_one = df['Category'] == 1
is_child = df['Age'] < 18
df.loc[did_not_survive & category_is_one & is_child, 'Country'].value_counts().head(5)

# How many people survived according to nationality?

In [None]:
# Five most frequent countries among rows with Category == 1 and Survived == 1.
survived = df['Survived'] == 1
category_is_one = df['Category'] == 1
df.loc[survived & category_is_one, 'Country'].value_counts().head(5)

# How many people were adults?

In [None]:
# Survivors with Category == 1 who were adults (Age >= 18), top five countries.
(
    df.query("Survived == 1 and Category == 1 and Age >= 18")['Country']
    .value_counts()
    .head(5)
)

# How many children survived?

In [None]:
# Survivors with Category == 1 who were children (Age < 18), top five countries.
survived = df['Survived'] == 1
category_is_one = df['Category'] == 1
is_child = df['Age'] < 18
df.loc[survived & category_is_one & is_child, 'Country'].value_counts().head(5)

# How many crew members perished?

In [None]:
# Per-country count of rows with Category == 0 and Survived == 0 (all countries, not just the top five).
category_is_zero = df['Category'] == 0
did_not_survive = df['Survived'] == 0
df.loc[category_is_zero & did_not_survive, 'Country'].value_counts()

# Well, let's check out our chances of survival. Cross your fingers.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix (three columns) and the binary target used by every model below.
feature_columns = ['Sex', 'Age', 'Category']
X = df[feature_columns]
y = df['Survived']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree with default hyper-parameters.
# A fixed random_state makes the fitted tree (and therefore the reported scores)
# reproducible across runs; an unseeded tree may break ties between equally good
# splits differently each time. 101 matches the seed used for the train/test split.
dtree = DecisionTreeClassifier(random_state=101)

In [None]:
# Train the decision tree on the training split.
dtree.fit(X=X_train, y=y_train)

In [None]:
# Predicted survival labels for the held-out rows.
predictions_dtree = dtree.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

In [None]:
# Decision tree evaluation on the held-out split: confusion matrix,
# per-class precision/recall/F1, then overall accuracy as a percentage.
print(confusion_matrix(y_test,predictions_dtree))
print(classification_report(y_test,predictions_dtree))
print('\n')
# Scale to percent BEFORE rounding: rounding the fraction first and then
# multiplying by 100 can print float artifacts such as 85.39999999999999.
print('Acurácia:',np.round(accuracy_score(y_test,predictions_dtree)*100,1),'%')

# * Well, it's quite certain we're all goin' to die. But don't worry, maybe the Random Forest could save us.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 200 trees.
# A fixed random_state makes the bootstrap samples and feature draws, and hence
# the reported scores, reproducible across runs. 101 matches the seed used for
# the train/test split.
rfc = RandomForestClassifier(n_estimators=200, random_state=101)

In [None]:
# Train the random forest on the training split.
rfc.fit(X=X_train, y=y_train)

In [None]:
# Predicted survival labels for the held-out rows.
rfc_predict = rfc.predict(X=X_test)

In [None]:
# Random forest evaluation on the held-out split: confusion matrix,
# per-class precision/recall/F1, then overall accuracy as a percentage.
print(confusion_matrix(y_test,rfc_predict))
print(classification_report(y_test,rfc_predict))
print('\n')
# Scale to percent BEFORE rounding: rounding the fraction first and then
# multiplying by 100 can print float artifacts such as 85.39999999999999.
print('Acurácia:',np.round(accuracy_score(y_test,rfc_predict)*100,1),'%')

# * Holy sh..., this is almost the same score as the Decision Tree. I think paradise could be a good place; after all, I am already 40 years old.


# * Let's test one more predictor

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier with scikit-learn's default settings.
logmodel = LogisticRegression()

In [None]:
# Train the logistic regression on the training split.
logmodel.fit(X=X_train, y=y_train)

In [None]:
# Predicted survival labels for the held-out rows.
predict_LR = logmodel.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [None]:
# Logistic regression evaluation on the held-out split: per-class
# precision/recall/F1, confusion matrix, then overall accuracy as a percentage.
print(classification_report(y_test,predict_LR))
print(confusion_matrix(y_test,predict_LR))
print('\n')
# Scale to percent BEFORE rounding: rounding the fraction first and then
# multiplying by 100 can print float artifacts such as 85.39999999999999.
print('Acurácia:',np.round(accuracy_score(y_test,predict_LR)*100,1),'%')

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with scikit-learn's default settings.
model = SVC()

In [None]:
# Train the support vector classifier on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Predicted survival labels for the held-out rows.
predict_svc = model.predict(X=X_test)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
# SVC evaluation on the held-out split: per-class precision/recall/F1,
# confusion matrix, then overall accuracy as a percentage.
print(classification_report(y_test,predict_svc))
print(confusion_matrix(y_test,predict_svc))
print('\n')
# Scale to percent BEFORE rounding: rounding the fraction first and then
# multiplying by 100 can print float artifacts such as 85.39999999999999.
print('Acurácia:',np.round(accuracy_score(y_test,predict_svc)*100,1),'%')

In [None]:
# Test-set accuracy of each model as a percentage with two decimals.
# The fraction is scaled to percent BEFORE rounding: rounding first and then
# multiplying by 100 can leave float artifacts (e.g. 85.39999999999999) in the
# stored value and in anything that prints it.
accuracy_dtree = np.round(accuracy_score(y_test,predictions_dtree)*100,2)
accuracy_rfc = np.round(accuracy_score(y_test,rfc_predict)*100,2)
accuracy_LR = np.round(accuracy_score(y_test,predict_LR)*100,2)
accuracy_svc = np.round(accuracy_score(y_test,predict_svc)*100,2)


In [None]:
# Side-by-side summary of the four accuracy percentages computed above.
accuracy_summary = (
    ('Accuracy_Decision_tree: ', accuracy_dtree),
    ('Accuracy_Random_Forest: ', accuracy_rfc),
    ('Accuracy_LR: ', accuracy_LR),
    ('Accuracy_SVC: ', accuracy_svc),
)
for summary_label, summary_value in accuracy_summary:
    print(summary_label, summary_value, '%')

# Our predictors performed very similarly and, as we could see earlier and below, the number of people who survived was very small.
# The accuracy score differed between the predictors, with SVC ranked best. Unfortunately, I think we will not survive unless we are of a nationality other than Scottish or Lithuanian.
# That's it for today, guys. Don't forget to upvote if you liked it; if you didn't, please leave a comment. Thank you!