In [1]:
!pip install pandas matplotlib seaborn openpyxl




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Limpieza inicial del dataset Titanic
Vamos a explorar y preparar los datos antes de hacer cualquier análisis.

In [3]:
import pandas as pd

df = pd.read_csv("Titanic_dataset.csv")  # Ajusta si es .csv o .xlsx
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# 1. Ver las primeras filas
df.head()

# 2. Información general de columnas y tipos de datos
df.info()

# 3. Estadísticas descriptivas de las columnas numéricas
df.describe()

# 4. Cantidad de valores nulos por columna
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# --- Paso 1: Renombrar columnas a minúsculas para consistencia ---
df.columns = df.columns.str.lower()
df.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [6]:
# --- Paso 2: Eliminar columnas irrelevantes o con demasiados nulos ---
df = df.drop(['name', 'ticket', 'cabin'], axis=1)
df.head()

Unnamed: 0,passengerid,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [7]:
# --- Paso 3: Rellenar valores nulos ---
# Rellenar Age con la mediana
df['age'].fillna(df['age'].median(), inplace=True)

# Rellenar Embarked con el valor más frecuente
df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)

# Verificar que ya no haya nulos
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)


passengerid    0
survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
dtype: int64

In [8]:
# Convertir 'sex' en variable binaria: male=0, female=1
df['sex'] = df['sex'].map({'male': 0, 'female': 1})

# Crear variables dummies para 'embarked'
df = pd.get_dummies(df, columns=['embarked'], drop_first=True)

# Verificar el resultado
df.head()

Unnamed: 0,passengerid,survived,pclass,sex,age,sibsp,parch,fare,embarked_Q,embarked_S
0,1,0,3,0,22.0,1,0,7.25,False,True
1,2,1,1,1,38.0,1,0,71.2833,False,False
2,3,1,3,1,26.0,0,0,7.925,False,True
3,4,1,1,1,35.0,1,0,53.1,False,True
4,5,0,3,0,35.0,0,0,8.05,False,True


In [11]:
from sklearn.preprocessing import StandardScaler

# 1️⃣ Variable objetivo
y = df['survived']   # lo que queremos predecir

# 2️⃣ Features (eliminamos 'survived' para que no se mezcle)
X = df.drop(columns=['survived'])

# 3️⃣ Escalar SOLO las columnas numéricas
scaler = StandardScaler()
X_scaled = X.copy()  # para no sobreescribir por si quieres comparar
num_cols = ['pclass', 'age', 'sibsp', 'parch', 'fare']
X_scaled[num_cols] = scaler.fit_transform(X_scaled[num_cols])

# Revisar las primeras filas para comprobar los cambios
X_scaled.head()

Unnamed: 0,passengerid,pclass,sex,age,sibsp,parch,fare,embarked_Q,embarked_S
0,1,0.827377,0,-0.565736,0.432793,-0.473674,-0.502445,False,True
1,2,-1.566107,1,0.663861,0.432793,-0.473674,0.786845,False,False
2,3,0.827377,1,-0.258337,-0.474545,-0.473674,-0.488854,False,True
3,4,-1.566107,1,0.433312,0.432793,-0.473674,0.42073,False,True
4,5,0.827377,0,0.433312,-0.474545,-0.473674,-0.486337,False,True


In [12]:
from sklearn.model_selection import train_test_split

# Dividimos los datos: 80% entrenamiento, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Revisamos los tamaños de los datasets
print(f"Entrenamiento: {X_train.shape}, Test: {X_test.shape}")


Entrenamiento: (712, 9), Test: (179, 9)


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1️⃣ Crear el modelo
model = RandomForestClassifier(random_state=42)

# 2️⃣ Entrenar el modelo con los datos de entrenamiento
model.fit(X_train, y_train)

# 3️⃣ Predecir con los datos de test
y_pred = model.predict(X_test)

# 4️⃣ Evaluar el modelo
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8379888268156425

Confusion Matrix:
 [[94 11]
 [18 56]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.90      0.87       105
           1       0.84      0.76      0.79        74

    accuracy                           0.84       179
   macro avg       0.84      0.83      0.83       179
weighted avg       0.84      0.84      0.84       179



# --- Nota sobre resultados del Random Forest ---
# Accuracy: 0.838 → El modelo acierta aproximadamente un 83,8% de las veces.
# 
# Confusion Matrix:
# [[94 11]
#  [18 56]]
# Filas = valores reales, Columnas = predicciones
# - La mayoría de los errores vienen de sobrevivientes predichos como no sobrevivientes.
#
# Classification Report:
# - Precision: porcentaje de predicciones correctas por clase.
# - Recall: porcentaje de verdaderos positivos identificados.
# - F1-score: balance entre precision y recall.
# En nuestro caso:
# - Clase 0 (No sobrevivió): precision 0.84, recall 0.90 → el modelo identifica bien a los que no sobrevivieron.
# - Clase 1 (Sobrevivió): precision 0.84, recall 0.76 → algunos sobrevivientes se clasifican incorrectamente.

In [14]:
# Importancia de cada variable
importances = model.feature_importances_

# Asociamos los nombres de las columnas
feature_importances = pd.Series(importances, index=X_train.columns)

# Ordenamos de mayor a menor importancia
feature_importances.sort_values(ascending=False)


sex            0.264303
fare           0.196071
passengerid    0.190691
age            0.167558
pclass         0.080570
sibsp          0.041881
parch          0.028616
embarked_S     0.021430
embarked_Q     0.008880
dtype: float64