In [4]:
import polars as pl
from sklearn.decomposition import PCA
from joblib import load
import os

# Read the Titanic test CSV into a polars DataFrame (path is relative, not absolute).
test_csv_path = "../../Data/Titanic/test.csv"
df = pl.read_csv(test_csv_path)

In [5]:
def standardize(column):
    """Return `column` shifted by its mean and divided by its standard deviation.

    `column` can be anything that supports subtraction and division and
    exposes `.mean()` and `.std()`; in this notebook it is called with polars
    column expressions. The spread is whatever `column.std()` returns with its
    default arguments, and no guard is applied when that value is zero.
    """
    centered = column - column.mean()
    spread = column.std()
    return centered / spread

# Feature engineering, written as four stages applied in order. Each stage
# reads the frame produced by the stage before it.

# Stage 1: family size, median imputation for Fare/Age, and two binary flags.
# NOTE(review): IsChild sits in the same with_columns call as the Age
# imputation; as far as I understand polars, sibling expressions read the
# input frame, so a missing Age ends up in the otherwise(0) branch - confirm.
impute_and_flag = [
    (pl.col("Parch") + pl.col("SibSp") + 1).alias("FamilySize"),
    pl.col("Fare").fill_null(pl.col("Fare").median()).alias("Fare"),
    pl.col("Age").fill_null(pl.col("Age").median()).alias("Age"),
    pl.when(pl.col("Age") < 16).then(1).otherwise(0).alias("IsChild"),
    pl.when(pl.col("Cabin").is_not_null())
    .then(1)
    .otherwise(0)
    .alias("HasCabinRegistered"),
]

# Stage 2: log1p of the imputed Fare, plus a flag for family sizes 2 through 4.
family_is_small = (pl.col("FamilySize") >= 2) & (pl.col("FamilySize") <= 4)
transform_and_bucket = [
    pl.col("Fare").log1p().alias("Fare"),
    pl.when(family_is_small).then(1).otherwise(0).alias("HasSmallFamily"),
]

# Stage 3: columns removed after one-hot encoding. strict=False lets the drop
# pass even if one of these names is absent from the frame.
dropped_columns = [
    "Sex_male",
    "Parch",
    "SibSp",
    "Name",
    "Ticket",
    "Cabin",
    "Embarked_S",
    "Embarked_Q",
    "Embarked_null",
    "FamilySize",
]

# Stage 4: z-score Fare and Age using statistics computed from this frame.
rescale = [
    standardize(pl.col("Fare")).alias("Fare"),
    standardize(pl.col("Age")).alias("Age"),
]

df = (
    df.with_columns(impute_and_flag)
    .with_columns(transform_and_bucket)
    .to_dummies(["Sex", "Embarked"])
    .drop(dropped_columns, strict=False)
    .with_columns(rescale)
)

# Replace Pclass / HasCabinRegistered / Fare with their three principal
# component scores.
# NOTE(review): the PCA is fitted right here on `df`, which was read from
# test.csv, rather than restored from a fit made at training time - confirm
# that the model loaded below expects components derived this way.
pca_inputs = ["Pclass", "HasCabinRegistered", "Fare"]
pca = PCA().fit_transform(df.select(pca_inputs))

# One new column per component, named in component order.
component_columns = [
    pl.lit(pca[:, position]).alias(label)
    for position, label in enumerate(("PC1", "PC2", "PC3"))
]
df = df.with_columns(component_columns).drop(pca_inputs)

# Preview the final feature frame.
df.head()

PassengerId,Sex_female,Age,Embarked_C,IsChild,HasSmallFamily,PC1,PC2,PC3
i64,u8,f64,u8,i32,i32,f64,f64,f64
892,0,0.385769,0,0,0,-1.154067,0.018429,0.067418
893,1,1.369729,0,0,1,-1.231309,-0.047905,0.073466
894,0,2.550481,0,0,0,-0.397537,-0.587521,-0.248034
895,0,-0.204607,0,0,0,-1.083431,0.07909,0.061887
896,1,-0.598191,0,0,1,-0.833925,0.293363,0.042349


In [6]:
%store -r filename
model = load(f'../../Models/Titanic/{filename}')

index, X = df["PassengerId"], df.drop("PassengerId")
predictions = model.predict(X)

solution = pl.DataFrame({
    "PassengerId" : index,
    "Survived" : predictions
})

solution.write_csv("../../Data/Titanic/solution.csv")
print(filename[0:-7])

RandomForestClassifier__Sex_female__Age__Embarked_C__IsChild__HasSmallFamily__PC1__PC2__PC3
