In [1]:
# --- 1. Importación de librerías ---
import pandas as pd
import os

# --- 2. Path configuration ---
# The presence of /kaggle/working is used as the signal that the notebook is
# running on Kaggle; otherwise the repository-relative local layout is used.
# The lookup is keyed by that boolean and yields (input CSV, cleaned output CSV).
RAW_PATH, PROCESSED_PATH = {
    True: ("/kaggle/input/titanic/test.csv", "/kaggle/working/test_clean.csv"),
    False: ("../data/raw/test.csv", "../data/processed/test_clean.csv"),
}[os.path.exists("/kaggle/working")]

# --- 3. Load dataset ---
# Read the raw CSV selected by the path configuration and preview it.
df = pd.read_csv(RAW_PATH)
print("✅ Dataset cargado correctamente\n")
print("Dimensiones iniciales:", df.shape)
display(df.head(n=5))  # first five rows via the notebook's rich display

# --- 4. Initial inspection ---
# Column dtypes, followed by the per-column count of missing values
# before any cleaning is applied. Each label and its value are printed on
# separate lines by using a newline separator.
print("\nTipos de datos:", df.dtypes, sep="\n")
print("\nValores nulos iniciales:", df.isna().sum(), sep="\n")

# --- 5. Data cleaning ---
# Impute missing values column by column in a single fillna call:
#   - Age and Fare receive their own column median,
#   - Embarked receives its most frequent value (first entry of mode()).
# All fill values are computed from the frame as it is before any filling.
df = df.fillna(
    {
        "Age": df["Age"].median(),
        "Fare": df["Fare"].median(),
        "Embarked": df["Embarked"].mode()[0],
    }
)

# Drop Cabin (too many missing values); errors="ignore" makes this a no-op
# when the column is not present.
df = df.drop(columns=["Cabin"], errors="ignore")

# --- 6. Final verification ---
# Per-column count of missing values after cleaning, then the final shape.
print("\nValores nulos restantes:", df.isna().sum(), sep="\n")

print("\nDimensiones finales:", df.shape)

# --- 7. Save cleaned dataset ---
# Create the destination directory if needed (no error when it already
# exists), then write the frame as CSV without the index column.
os.makedirs(os.path.split(PROCESSED_PATH)[0], exist_ok=True)
df.to_csv(PROCESSED_PATH, index=False)
print(f"\n✅ Archivo limpio guardado en {PROCESSED_PATH}")

✅ Dataset cargado correctamente

Dimensiones iniciales: (418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S



Tipos de datos:
PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

Valores nulos iniciales:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

Valores nulos restantes:
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

Dimensiones finales: (418, 10)

✅ Archivo limpio guardado en /kaggle/working/test_clean.csv
