In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

%matplotlib inline

# LIMPIEZA DE DATOS

Creacion de un diccionario donde se recogerán todos los cambios que se le harán al dataset inicial

In [2]:
df = pd.read_csv(r'.\archivos\titanic\train.csv')
hist_dataset = {'initial' : df}

In [3]:
def p_NaN_dict(hist_dict, key):
       
    data_return =  hist_dict[key].isna().sum(axis=0) / len(hist_dict[key])
    columns = hist_dataset[key].columns

    print(
        'Tamaño del dataset', 
        '\n------------------', 
        f'\n{hist_dataset[key].shape}'
    )
    print(
        '\nPorcentaje de valores nulos'
        '\n---------------------------'
    )
    i = 0
    for column in columns:
        if data_return[column] != .0:
            print(f'{column:18} {data_return[column]:.6f}')
            i = 1
    if i == 0: print('- No hay elementos nulos -')


In [4]:
p_NaN_dict(hist_dataset, 'initial')

Tamaño del dataset 
------------------ 
(891, 12)

Porcentaje de valores nulos
---------------------------
Age                0.198653
Cabin              0.771044
Embarked           0.002245


## Comprobación de la calidad de las variables

Se procede a la búsqueda de valores perdidos y a la eliminación de las variables (columnas) con un porcentage de valores NaN mayor al porporcionado por el usuario.

In [5]:
p_NaN_dict(hist_dataset, 'initial')

Tamaño del dataset 
------------------ 
(891, 12)

Porcentaje de valores nulos
---------------------------
Age                0.198653
Cabin              0.771044
Embarked           0.002245


In [6]:
def clean_CNaN(
    data,
    columns = None, 
    p = 0.7, 
    hist_dataset = None):
    
    if columns == None: columns = data.columns
        
    df = data[columns]

    serie = df.isna().sum(axis=0) / len(df)
    columns_to_drop = [ column for column in serie.index if serie[column] > p ]
    
    data_return = data.drop(columns_to_drop, axis = 1)

    if hist_dataset != None: hist_dataset[ 'clean_CNaN' ] = data_return

    return data_return
    

In [7]:
df = hist_dataset['initial']
columns = df.columns
df = clean_CNaN(df, p = 0.6, hist_dataset = hist_dataset)
p_NaN_dict(hist_dataset, 'clean_CNaN')

Tamaño del dataset 
------------------ 
(891, 11)

Porcentaje de valores nulos
---------------------------
Age                0.198653
Embarked           0.002245


Ahora se procede a la eliminación de las observaciones que contengan valores NaN en las colunmas que sean relevantes para posibles calculos o predicciones

In [8]:
def clean_NaN(
    data, 
    columns = None,
    hist_dataset = None):
    
    if columns == None: columns = data.columns

    data_return = data.dropna(subset = columns)
    #for column in columns:
    #    data_return[column] = df[df.column.notna()]
    
    if hist_dataset != None: hist_dataset[ 'clean_NaN' ] = data_return

    return data_return


In [9]:
df = hist_dataset['clean_CNaN']
df = clean_NaN(df, columns = ['Age'], hist_dataset = hist_dataset)
p_NaN_dict(hist_dataset, 'clean_NaN')

Tamaño del dataset 
------------------ 
(714, 11)

Porcentaje de valores nulos
---------------------------
Embarked           0.002801


## Outliers

Eliminar los valores atímpicos, se entablecerá intervalo donde los datos se considerarán válidos. Se aplicará para todas las variables no categóricas.

In [10]:
def __RI__(serie):
    Q1 = np.percentile(serie, q = 25)
    Q3 = np.percentile(serie, q = 75)
    R = Q3 - Q1
    lim_inf = Q1 - 1.5*R
    lim_sup = Q3 + 1.5*R

    if serie.dtype == 'float64':
        lim_inf = round(np.float64(lim_inf), 4)
        lim_sup = round(np.float64(lim_sup), 4)
    if serie.dtype == 'int64':
        lim_inf = np.int64(round(lim_inf, 0))
        lim_sup = np.int64(round(lim_sup, 0))

    return lim_inf, lim_sup

def clean_Outliers(
    data,
    columns = None,
    hist_dataset = None):

    if columns == None: columns = data.columns
    df = data

    for column in columns:
        lim_inf, lim_sup = __RI__(df[column])

        print('Intervalo de', column, ':', lim_inf, lim_sup)

    print('')
    
    for column in columns:
        for i in df[column].index:
            if df[column][i] <= lim_inf or df[column][i] >= lim_sup:
                df = df.drop([i], axis = 0)

    data_return = df
    if hist_dataset != None: hist_dataset['clean_Outliers'] = data_return
    return data_return

In [11]:
columns = ['Age', 'SibSp', 'Parch', 'Fare']
df = hist_dataset['clean_NaN']
df = clean_Outliers(df, columns = columns, hist_dataset = hist_dataset)
p_NaN_dict(hist_dataset, 'clean_Outliers')


Intervalo de Age : -6.6875 64.8125
Intervalo de SibSp : -2 2
Intervalo de Parch : -2 2
Intervalo de Fare : -29.9375 71.3625

Tamaño del dataset 
------------------ 
(618, 11)

Porcentaje de valores nulos
---------------------------
- No hay elementos nulos -


In [12]:
hist_dataset.keys()

dict_keys(['initial', 'clean_CNaN', 'clean_NaN', 'clean_Outliers'])

## Variables **One Hot** o **Dummies**

Transformación de variables categóricas en variables dummies, con el objetivo de tener un mayor número de variables con las que entrenar nuestro modelo de predicción.

Primero inspeccionamos los tipos de variables que tenemos y desechamos las que no nos servirán en nuestro modelo.

In [13]:
hist_dataset['clean_Outliers'].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


Eliminamos las variables que no dan valor a nuestro análisis y modelos. PassengerID, Name y Ticket son datos específicos a cada pasajero que no dan un valor estadístico.

In [14]:
v_no_stats = ["PassengerId", "Name", "Ticket"]
hist_dataset['statistical-data'] = hist_dataset['clean_Outliers'].drop(v_no_stats, axis=1)

In [15]:
hist_dataset['statistical-data'].head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


Realizamos un conteo de los valores de cada columna

In [16]:
df = hist_dataset['statistical-data']
vars_disc = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch','Embarked']
for i in vars_disc:
    print(df[i].value_counts())

0    397
1    221
Name: Survived, dtype: int64
3    354
2    168
1     96
Name: Pclass, dtype: int64
male      413
female    205
Name: Sex, dtype: int64
0    425
1    143
2     18
4     18
3      9
5      5
Name: SibSp, dtype: int64
0    463
1     91
2     50
3      5
5      5
4      3
6      1
Name: Parch, dtype: int64
S    505
C     87
Q     26
Name: Embarked, dtype: int64


In [17]:
def cat_to_dummy(
    data,
    columns = None,
    hist_dataset = None):

    if columns == None: columns = data.columns
    df = data 

    for column in columns:
        categorias = df[column].value_counts().index

        for categoria in categorias:
            name = str(column) + '-' + str(categoria)
            df[name] = np.where(df[column] == categoria, 1, 0)

        df = df.drop([column], axis = 1)
    
    data_return = df
    if hist_dataset != None: hist_dataset['cat_to_dummy'] = data_return
    return data_return

In [18]:
df = hist_dataset['statistical-data']
vars_cat = ['Pclass', 'Sex', 'Embarked']
df = cat_to_dummy(df, columns=vars_cat, hist_dataset=hist_dataset)

In [19]:
hist_dataset['cat_to_dummy'].head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Pclass-3,Pclass-2,Pclass-1,Sex-male,Sex-female,Embarked-S,Embarked-C,Embarked-Q
0,0,22.0,1,0,7.25,1,0,0,1,0,1,0,0
1,1,38.0,1,0,71.2833,0,0,1,0,1,0,1,0
2,1,26.0,0,0,7.925,1,0,0,0,1,1,0,0
3,1,35.0,1,0,53.1,0,0,1,0,1,1,0,0
4,0,35.0,0,0,8.05,1,0,0,1,0,1,0,0


# Entrenamiento y predicción de supervivientes

Para evitar poroblemas de multicolinialidad eliminamos ciertas variables dummies. Además, fusionamos las variables  'SubSp' y 'Parch' en una variable llamada 'family_size'

In [20]:
df = hist_dataset['cat_to_dummy']
df['family_size'] = df['SibSp'] + df['Parch']
df = df.drop(['Pclass-3', 'Sex-male', 'Embarked-C'], axis = 1)

hist_dataset['Clean_Analysis'] = df

In [21]:
df = hist_dataset['Clean_Analysis']

In [22]:
y = df['Survived']
x = df[['Age', 'SibSp', 'Parch', 'Fare', 'Pclass-1', 'Pclass-2', 
        'Sex-female','Embarked-S', 'Embarked-Q', 'family_size']]

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=1)

In [24]:
clf = [
    (LogisticRegression(), 'reg-log'),
    (DecisionTreeClassifier(), 'arbol-class')
]

In [25]:
for model, name in clf:
    
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    out = (
    f'''    {name}
    Resultado de la prueba del modelo: {round(accuracy*100, 2)}%
    '''
    )
    print(out)

    reg-log
    Resultado de la prueba del modelo: 78.49%
    
    arbol-class
    Resultado de la prueba del modelo: 75.27%
    
