In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import pickle, pathlib

# Load the raw training data.
df = pd.read_csv("data/train.csv")

# Keep only the target plus the columns used to build features. The explicit
# .copy() makes df an independent frame, so the column assignments below do
# not trigger pandas' SettingWithCopyWarning.
columns = ["Survived", "Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", "Embarked", "Name"]
df = df[columns].copy()

# Extract the text between the first comma and the following period of "Name"
# (the honorific). The capture group may pick up trailing whitespace, so strip
# it before the values are compared against the mapping below.
df["Title"] = df["Name"].str.extract(r",\s*([^\.]*)\s*\.", expand=False).str.strip()

# Collapse spelling variants and infrequent titles into a few groups.
# The regex captures everything between the comma and the period, so a name
# written as "..., the Countess. ..." yields "the Countess" rather than
# "Countess"; both spellings are mapped so the row lands in "Royalty".
df["Title"] = df["Title"].replace({
    "Mlle": "Miss",
    "Ms": "Miss",
    "Mme": "Mrs",
    "Dr": "Rare", "Rev": "Rare", "Col": "Rare", "Major": "Rare", "Capt": "Rare", "Jonkheer": "Rare",
    "Sir": "Royalty", "Lady": "Royalty", "Countess": "Royalty", "the Countess": "Royalty",
    "Don": "Royalty", "Dona": "Royalty",
})

# Fill missing numeric values with the column median.
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Fare"] = df["Fare"].fillna(df["Fare"].median())

# Fill missing Embarked with "S" (noted by the author as the common value).
df["Embarked"] = df["Embarked"].fillna("S")

# Target: what we want to predict.
y = df["Survived"]

# Features: everything except the target and the raw name (already reduced
# to "Title" above).
X = df.drop(columns=["Survived", "Name"])

# Column groups: numeric columns are standardised, categorical columns are
# one-hot encoded.
numeric_features = ["Age", "Fare", "SibSp", "Parch"]
categorical_features = ["Pclass", "Sex", "Embarked", "Title"]

# (name, transformer, columns) triples consumed by the ColumnTransformer.
# handle_unknown="ignore" keeps the encoder from raising on categories that
# were not present when it was fitted.
transformer_specs = [
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
]
preprocessor = ColumnTransformer(transformer_specs)

# Full model: preprocessing followed by a random forest with a fixed seed.
pipeline_steps = [
    ("preprocessing", preprocessor),
    ("model", RandomForestClassifier(random_state=42)),
]
pipeline = Pipeline(pipeline_steps)

# Fit the whole pipeline (preprocessing + classifier) on the prepared data.
pipeline.fit(X, y)

# Serialise the fitted pipeline and write it to model_advanced.pkl in the
# current working directory.
model_path = pathlib.Path("./") / "model_advanced.pkl"
serialized_model = pickle.dumps(pipeline)
model_path.write_bytes(serialized_model)





2972717