# Titanic - Solución Rápida y Efectiva

In [1]:
pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

## Cargar datos

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(f"Train: {train.shape}, Test: {test.shape}")
print(f"\nTasa de supervivencia: {train['Survived'].mean():.2%}")
train.head()

Train: (891, 12), Test: (418, 11)

Tasa de supervivencia: 38.38%


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Exploración rápida

In [4]:
print("Valores faltantes:")
print(train.isnull().sum())
print("\nSupervivencia por género:")
print(train.groupby('Sex')['Survived'].mean())
print("\nSupervivencia por clase:")
print(train.groupby('Pclass')['Survived'].mean())

Valores faltantes:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Supervivencia por género:
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

Supervivencia por clase:
Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64


## Preparación de datos

In [5]:
def preparar_datos(df):
    df = df.copy()
    
    # Rellenar Age con la mediana
    df['Age'].fillna(df['Age'].median(), inplace=True)
    
    # Rellenar Embarked con la moda
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    
    # Rellenar Fare con la mediana
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    
    # Crear nuevas características
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    
    # Convertir Sex a numérico
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    
    # Convertir Embarked a numérico
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    
    return df

train_prep = preparar_datos(train)
test_prep = preparar_datos(test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we

## Seleccionar características

In [6]:
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'IsAlone']

X = train_prep[features]
y = train_prep['Survived']
X_test = test_prep[features]

print(f"Características seleccionadas: {features}")

Características seleccionadas: ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'IsAlone']


## Entrenar modelo

In [7]:
modelo = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

# Validación cruzada
scores = cross_val_score(modelo, X, y, cv=5)
print(f"Accuracy promedio (CV): {scores.mean():.4f} (+/- {scores.std():.4f})")

# Entrenar con todos los datos
modelo.fit(X, y)
print("\nModelo entrenado exitosamente!")

Accuracy promedio (CV): 0.8115 (+/- 0.0223)

Modelo entrenado exitosamente!


## Generar predicciones

In [None]:
predicciones = modelo.predict(X_test)

# Crear archivo de submission
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predicciones
})

submission.to_csv('submission.csv', index=False)

print("Archivo 'submission.csv' generado exitosamente!")
print(f"\nPredicciones de supervivencia: {predicciones.sum()} de {len(predicciones)}")
submission.head(10)

Archivo 'submission.csv' generado exitosamente!

Predicciones de supervivencia: 140 de 418


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


## RESULTADO DE ESTE PRIMER INTETO ##
Mi resultado fue de un public score de 0.784, es decir, predije el resultado de supervivencia correctamente para el 78% de las personas.

Quedé en el puesto 2480.

Probamos ahora algo nuevo. Importamos algunos métodos de sci-kit learn y hacemos un análisis de característica más profundo (por ejemplo, miramos al título acompañando el nombre, por ejemplo, Mrs VS Miss).

In [15]:
# Preparación ultra simple - solo lo esencial
train_simple = train.copy()
test_simple = test.copy()

# Rellenar valores faltantes de forma simple
train_simple['Age'].fillna(train_simple['Age'].median(), inplace=True)
test_simple['Age'].fillna(test_simple['Age'].median(), inplace=True)
train_simple['Embarked'].fillna('S', inplace=True)
test_simple['Fare'].fillna(test_simple['Fare'].median(), inplace=True)

# Solo convertir Sex
train_simple['Sex'] = train_simple['Sex'].map({'male': 0, 'female': 1})
test_simple['Sex'] = test_simple['Sex'].map({'male': 0, 'female': 1})

# Solo las 4 features más importantes
features_simple = ['Pclass', 'Sex', 'Age', 'Fare']

X_simple = train_simple[features_simple]
y_simple = train_simple['Survived']
X_test_simple = test_simple[features_simple]

# RandomForest básico pero robusto
from sklearn.ensemble import RandomForestClassifier

modelo_simple = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_split=10, random_state=42)

scores_simple = cross_val_score(modelo_simple, X_simple, y_simple, cv=5)
print(f"CV Score: {scores_simple.mean():.4f}")

modelo_simple.fit(X_simple, y_simple)
pred = modelo_simple.predict(X_test_simple)




# Crear archivo de submission
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': pred
})

submission.to_csv('submission2.csv', index=False)

print("Archivo 'submission2.csv' generado exitosamente!")
print(f"\nPredicciones de supervivencia: {pred.sum()} de {len(pred)}")
submission.head(10)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_simple['Age'].fillna(train_simple['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_simple['Age'].fillna(test_simple['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the inte

CV Score: 0.8182
Archivo 'submission2.csv' generado exitosamente!

Predicciones de supervivencia: 136 de 418


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


## Resultado de este segundo intento ##

El modelo empeoró, bajo el puntaje a 0.77, probaremos de nuevo.

## TERCER INTENTO ##
No fui cuidadoso de normalizar los títulos antes de usarlos en el intento 2. Probé varios métodos que encontré online, como usar un grid de forest, comparando sus puntakes en 5-folds, pero aún así el puntaje no hizo más que bajar, por lo que volvimos al modelo inicial, aún así, el puntaje final obtenido es menor a 0,78.

In [26]:
# --- MODULES ---
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


# --- DATA WRANGLING ---
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
passengerId = test.PassengerId

titanic = pd.concat([train, test], ignore_index=True)
train_idx = len(train)
test_idx = len(titanic) - len(test)

# --- FEATURE ENGINEERING ---
titanic['Title'] = titanic.Name.apply(lambda name: name.split(',')[1].split('.')[0].strip())

standardized_titles = {
    "Capt": "Officer", "Col": "Officer", "Major": "Officer",
    "Jonkheer": "Royalty", "Don": "Royalty", "Sir": "Royalty",
    "Dr": "Officer", "Rev": "Officer", "the Countess": "Royalty",
    "Dona": "Royalty", "Mme": "Mrs", "Mlle": "Miss", "Ms": "Mrs",
    "Mr": "Mr", "Mrs": "Mrs", "Miss": "Miss", "Master": "Master", "Lady": "Royalty"
}
titanic['Title'] = titanic['Title'].map(standardized_titles)

grouped = titanic.groupby(['Sex', 'Pclass', 'Title'])
titanic['Age'] = grouped['Age'].transform(lambda x: x.fillna(x.median()))


titanic['Cabin'] = titanic['Cabin'].fillna('U')
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].value_counts().index[0])
titanic['Fare'] = titanic['Fare'].fillna(titanic['Fare'].median())

titanic['FamilySize'] = titanic['Parch'] + titanic['SibSp'] + 1
titanic['Cabin'] = titanic['Cabin'].map(lambda x: x[0])

# --- DUMMIES AND CLEANUP ---
titanic['Sex'] = titanic['Sex'].map({"male": 0, "female": 1})
pclass_dummies = pd.get_dummies(titanic['Pclass'], prefix="Pclass")
title_dummies = pd.get_dummies(titanic['Title'], prefix="Title")
cabin_dummies = pd.get_dummies(titanic['Cabin'], prefix="Cabin")
embarked_dummies = pd.get_dummies(titanic['Embarked'], prefix="Embarked")

titanic_dummies = pd.concat(
    [titanic, pclass_dummies, title_dummies, cabin_dummies, embarked_dummies], axis=1
)
titanic_dummies.drop(['Pclass', 'Title', 'Cabin', 'Embarked', 'Name', 'Ticket'], axis=1, inplace=True)

# --- TRAIN / TEST SPLIT ---
train = titanic_dummies[:train_idx]
test = titanic_dummies[test_idx:]

train['Survived'] = train['Survived'].astype(int)
X = train.drop('Survived', axis=1).values
y = train['Survived'].values
X_test = test.drop('Survived', axis=1).values

# --- MODELING ---
modelo = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
modelo.fit(X, y)


# --- PREDICTION ---
forrest_pred = modelo.predict(X_test)

# --- Guardar submission ---
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': forrest_pred})
submission.to_csv('submission3.csv', index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Survived'] = train['Survived'].astype(int)


## RESULTADO TERCER INTENTO ##
El puntaje bajo a 0,77. Aún así quiero probar a usar menos variables y un modelo más sencillo, aunque sea solo para comprar.

In [27]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

# cargar datos
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
passengerId = test.PassengerId

# variables simples
train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})
test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})

X = train[['Sex', 'Pclass']]
y = train['Survived']
X_test = test[['Sex', 'Pclass']]

# modelo simple
modelo = LogisticRegression()
modelo.fit(X, y)

# predicción
pred = modelo.predict(X_test)

# guardar resultado
pd.DataFrame({'PassengerId': passengerId, 'Survived': pred}).to_csv('simple_pred.csv', index=False)


## Resultados Cuarto Intento ##
El puntaje bajó a 0.76, con lo que se me acabaron las submissions y no puedo seguir probando puntuaciones.

Moraleja: A veces un modelo inicial de ClassificationTree puede ser el más adecuado, antes un modelo complicado o una regresión lineal.