<a href="https://colab.research.google.com/github/SandrAlzate/CV/blob/master/Titanic_Cleaning_Steps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🚢 Limpieza de Datos con Pandas (Dataset Titanic)

En este notebook trabajaremos paso a paso en la limpieza del dataset Titanic.

## Paso 1: Cargar librerías y dataset

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Load seaborn's "titanic" example dataset into a DataFrame.
# NOTE(review): load_dataset presumably fetches the file from seaborn's online data
# repository (cached locally afterwards), so a first run needs network access - confirm.
df_titanic = sns.load_dataset('titanic')
# Bare last expression: the notebook renders the frame as a rich (truncated) table.
df_titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [None]:
# (rows, columns) of the dataset as loaded, before any cleaning.
df_titanic.shape

(891, 15)

In [None]:
# Per-column dtype and non-null count: a first look at which columns have gaps.
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


## Paso 2: Revisar valores nulos

In [None]:
# Number of missing values in each column (isna is the canonical name of isnull).
df_titanic.isna().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,177
sibsp,0
parch,0
fare,0
embarked,2
class,0
who,0


In [None]:
# Share of missing values per column, expressed as a percentage of all rows.
total_filas = len(df_titanic)
nulos_por_columna = df_titanic.isnull().sum()
porcentaje_null = nulos_por_columna / total_filas * 100
porcentaje_null

Unnamed: 0,0
survived,0.0
pclass,0.0
sex,0.0
age,13.520408
sibsp,0.0
parch,0.0
fare,0.0
embarked,0.255102
class,0.0
who,0.0


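## Paso 6: Eliminar duplicados

> **Nota:** aunque está numerado como Paso 6, este paso se ejecuta aquí, antes del Paso 3; por eso los pasos siguientes trabajan sobre el dataset ya sin filas duplicadas.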

In [None]:
# Flag every row that exactly repeats an earlier row, then count the flags.
filas_repetidas = df_titanic.duplicated()
duplicados = filas_repetidas.sum()
duplicados

np.int64(4)

In [None]:
# Drop rows that are exact copies of an earlier row and rebind the name to the result.
# The surviving rows keep their original index labels, so the index has gaps afterwards.
df_titanic = df_titanic.drop_duplicates()

In [None]:
# Check how many rows remain once duplicates are removed.
df_titanic.shape

(784, 15)

## Paso 3: Eliminar columna con demasiados nulos (deck)

In [None]:
# Recompute the percentage of missing values per column on the current frame,
# to identify columns that are too sparse to keep.
total_filas = len(df_titanic)
nulos_por_columna = df_titanic.isnull().sum()
porcentaje_null = nulos_por_columna / total_filas * 100
porcentaje_null

Unnamed: 0,0
survived,0.0
pclass,0.0
sex,0.0
age,13.520408
sibsp,0.0
parch,0.0
fare,0.0
embarked,0.255102
class,0.0
who,0.0


In [None]:
# Remove the `deck` column, the one this step singles out as having too many nulls
# (203 non-null values out of 891 rows in the info() summary).
# errors='ignore' keeps the cell idempotent: re-running it after `deck` is already
# gone is a no-op instead of raising KeyError.
df_titanic = df_titanic.drop(columns=['deck'], errors='ignore')

In [None]:
# Confirm the column count went down by one while the row count is unchanged.
df_titanic.shape

(784, 14)

## Paso 4: Imputar valores faltantes en columna numérica (age)

In [None]:
# Replace missing ages with the mean of the ages that are present
# (mean() skips NaN by default, so the gaps do not affect the average).
edad_media = df_titanic['age'].mean()
df_titanic['age'] = df_titanic['age'].fillna(edad_media)

In [None]:
# Re-count missing values per column to verify that `age` no longer has gaps.
df_titanic.isna().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,0
sibsp,0
parch,0
fare,0
embarked,2
class,0
who,0


## Paso 5: Imputar valores faltantes en columna categórica (embarked)

In [None]:
# Frequency of each embarkation code before imputation. value_counts() leaves NaN
# out by default, so the missing entries are not part of these totals.
df_titanic['embarked'].value_counts()

Unnamed: 0_level_0,count
embarked,Unnamed: 1_level_1
S,568
C,155
Q,59


In [None]:
# Fill missing embarkation data with the most frequent value (the mode).
# `embark_town` is the spelled-out counterpart of `embarked` and shows the same
# non-null count in the info() summary (889 of 891), so it is imputed the same way;
# filling only `embarked` would leave nulls behind in the cleaned dataset.
for columna in ('embarked', 'embark_town'):
    # mode() returns a Series (several values on a tie); [0] takes its first entry.
    valor_mas_frecuente = df_titanic[columna].mode()[0]
    df_titanic[columna] = df_titanic[columna].fillna(valor_mas_frecuente)
df_titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.000000,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.000000,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.000000,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.000000,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.000000,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.000000,0,5,29.1250,Q,Third,woman,False,Queenstown,no,False
887,1,1,female,19.000000,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,29.869351,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.000000,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


In [None]:
# Frequency of each embarkation code after imputation, to see which category
# absorbed the previously missing entries.
df_titanic['embarked'].value_counts()

Unnamed: 0_level_0,count
embarked,Unnamed: 1_level_1
S,570
C,155
Q,59


In [None]:
# Per-column count of missing values after the imputation steps.
df_titanic.isna().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,0
sibsp,0
parch,0
fare,0
embarked,0
class,0
who,0


## Paso 7: Normalizar valores de texto en 'sex'

## Paso 8: Crear nueva variable 'familia'

## Paso 9: Dataset limpio final