# Supervivencia en el Titanic

## 1.1 Cargando los datos

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("titanic.csv")

## 1.2 Estudiando los datos

In [3]:
df.head(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [4]:
df.describe()

Unnamed: 0,Survived,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
count,887.0,887.0,887.0,887.0,887.0,887.0
mean,0.385569,2.305524,29.471443,0.525366,0.383315,32.30542
std,0.487004,0.836662,14.121908,1.104669,0.807466,49.78204
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.25,0.0,0.0,7.925
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.1375
max,1.0,3.0,80.0,8.0,6.0,512.3292


## 1.3 Preprocessing

Primero haremos one hot encoding para "Sex"

In [5]:
def f(x):
    if x == 'male':
        return 1
    else:
        return 0

In [6]:
print(f('male'))

1


In [7]:
print(f('female'))

0


In [8]:
df['Sex'] = df['Sex'].apply(f)

In [9]:
df.head(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,1,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,0,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,0,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,0,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,1,35.0,0,0,8.05


Eliminar los outliers en Fare (Valores Atípicos)

In [10]:
df = df[df['Fare'] < 200]

In [11]:
df = df.reset_index(drop = True)

In [12]:
df.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
count,867.0,867.0,867.0,867.0,867.0,867.0,867.0
mean,0.378316,2.33564,0.651672,29.395813,0.520185,0.367935,26.60754
std,0.485247,0.822131,0.476715,14.141065,1.103426,0.793761,29.395841
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,0.0,20.0,0.0,0.0,7.8958
50%,0.0,3.0,1.0,28.0,0.0,0.0,14.0
75%,1.0,3.0,1.0,38.0,1.0,0.0,30.0
max,1.0,3.0,1.0,80.0,8.0,6.0,164.8667


In [13]:
X = df[['Sex', 'Age', 'Fare']].values
Y = df['Survived'].values

In [14]:
X

array([[ 1.    , 22.    ,  7.25  ],
       [ 0.    , 38.    , 71.2833],
       [ 0.    , 26.    ,  7.925 ],
       ...,
       [ 0.    ,  7.    , 23.45  ],
       [ 1.    , 26.    , 30.    ],
       [ 1.    , 32.    ,  7.75  ]])

In [15]:
Y

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,

## 1.4 Training and Testing

In [16]:
from sklearn.model_selection import train_test_split


X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)

In [17]:
X_train

array([[ 1.    , 29.    ,  7.8958],
       [ 0.    , 45.    , 26.25  ],
       [ 0.    , 18.    ,  7.75  ],
       ...,
       [ 1.    , 26.    ,  8.6625],
       [ 1.    , 37.    , 42.4   ],
       [ 0.    , 21.    ,  7.75  ]])

## 1.5 Entrenamos el modelo

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
model = LogisticRegression()
model.fit(X_train, Y_train)

In [20]:
print(model.predict([[0, 5, 100]]))

[1]


In [21]:
print(model.predict_proba([[0, 5, 100]]))

[[0.1026366 0.8973634]]


In [22]:
print(model.predict_proba([[1, 45, 1]]))

[[0.87831056 0.12168944]]


## 1.6 Evaluando el Modelo

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
Y_predict = model.predict(X_test)

In [25]:
print(accuracy_score(Y_predict, Y_test))

0.8045977011494253


In [26]:
from sklearn.metrics import confusion_matrix

In [27]:
print(confusion_matrix(Y_predict, Y_test))

[[129  31]
 [ 20  81]]


## 1.7 Guardar el Modelo

In [28]:
import pickle

In [29]:
pickle.dump(model, open('mymodel.sav', 'wb'))

In [30]:
modelo_importado = pickle.load(open('mymodel.sav', 'rb'))