In [1]:
import polars as pl
from sklearn.decomposition import PCA

# Read the raw Titanic training CSV into a polars DataFrame; the cells below
# transform this frame.
train_csv_path = "../../Data/Titanic/train.csv"
df = pl.read_csv(train_csv_path)

In [2]:
def standardize(column):
    """Center ``column`` on its mean and divide by its standard deviation.

    Works with any object exposing ``.mean()``, ``.std()`` and the ``-`` and
    ``/`` operators; in this notebook it is called with polars column
    expressions. The kind of standard deviation (sample vs. population) is
    whatever ``column.std()`` returns by default. No guard is applied for a
    zero standard deviation.
    """
    center = column.mean()
    spread = column.std()
    deviations = column - center
    return deviations / spread

def _binary_flag(condition):
    """Map a boolean polars expression to an integer 1/0 expression."""
    return pl.when(condition).then(1).otherwise(0)


# Feature engineering as one flat chain; each stage is described inline.
df = (
    df
    # Stage 1: median-impute Fare and Age and derive first-level features.
    # Expressions inside a single with_columns call all read the incoming
    # frame, so IsChild is computed from the Age column as it was before
    # imputation.
    .with_columns(
        FamilySize=pl.col("Parch") + pl.col("SibSp") + 1,
        Fare=pl.col("Fare").fill_null(pl.col("Fare").median()),
        Age=pl.col("Age").fill_null(pl.col("Age").median()),
        IsChild=_binary_flag(pl.col("Age") < 16),
        HasCabinRegistered=_binary_flag(pl.col("Cabin").is_not_null()),
    )
    # Stage 2: needs a second call because it reads FamilySize and the
    # imputed Fare produced by stage 1.
    .with_columns(
        Fare=pl.col("Fare").log1p(),
        HasSmallFamily=_binary_flag(pl.col("FamilySize").is_between(2, 4)),
    )
    # Stage 3: one-hot encode the categorical columns.
    .to_dummies(["Sex", "Embarked"])
    # Stage 4: drop raw inputs and the dummy columns that are not kept.
    # strict=False tolerates names that are absent from the frame (for
    # example Embarked_null when Embarked has no missing values).
    .drop(
        [
            "Sex_male",
            "Parch",
            "SibSp",
            "Name",
            "Ticket",
            "Cabin",
            "Embarked_S",
            "Embarked_Q",
            "Embarked_null",
            "FamilySize",
        ],
        strict=False,
    )
    # Stage 5: z-score the two continuous features.
    .with_columns(
        Fare=standardize(pl.col("Fare")),
        Age=standardize(pl.col("Age")),
    )
)

# Swap Pclass, HasCabinRegistered and Fare for their principal-component
# scores. `pca` holds the transformed score array, not the fitted estimator.
# NOTE(review): of these inputs only Fare is standardized earlier in this
# cell; Pclass and HasCabinRegistered enter PCA on their raw scale — confirm
# that this is intended.
pca_input_columns = ["Pclass", "HasCabinRegistered", "Fare"]
pca = PCA().fit_transform(df.select(pca_input_columns))

df = df.with_columns(
    PC1=pl.lit(pca[:, 0]),
    PC2=pl.lit(pca[:, 1]),
    PC3=pl.lit(pca[:, 2]),
).drop(pca_input_columns)

In [3]:
# Display summary statistics for every column of the transformed frame.
df.describe()

statistic,PassengerId,Survived,Sex_female,Age,Embarked_C,IsChild,HasSmallFamily,PC1,PC2,PC3
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",446.0,0.383838,0.352413,2.5519e-16,0.188552,0.093154,0.327722,3.1899e-16,3.5089e-16,1.3557e-16
"""std""",257.353842,0.486592,0.47799,1.0,0.391372,0.290811,0.469646,1.228698,0.541612,0.269641
"""min""",1.0,0.0,0.0,-2.222908,0.0,0.0,0.0,-2.800219,-3.117542,-0.753444
"""25%""",224.0,0.0,0.0,-0.565419,0.0,0.0,0.0,-1.078463,-0.223943,0.018771
"""50%""",446.0,0.0,0.0,-0.104579,0.0,0.0,0.0,-0.281312,0.033323,0.052358
"""75%""",669.0,1.0,1.0,0.433068,0.0,0.0,1.0,0.699211,0.167498,0.054718
"""max""",891.0,1.0,1.0,3.88937,1.0,1.0,1.0,3.55271,1.418979,0.985163


In [4]:
# Write the engineered frame to CSV in the same directory as the raw input,
# then display its first rows.
output_csv_path = "../../Data/Titanic/advancedTrain.csv"
df.write_csv(output_csv_path)

df.head()

PassengerId,Survived,Sex_female,Age,Embarked_C,IsChild,HasSmallFamily,PC1,PC2,PC3
i64,i64,u8,f64,u8,i32,i32,f64,f64,f64
1,0,0,-0.565419,0,0,1,-1.140324,-0.013866,0.054224
2,1,1,0.663488,1,0,1,2.010718,-0.259612,0.217963
3,1,1,-0.258192,0,0,0,-1.078463,0.03864,0.052638
4,1,1,0.433068,0,0,1,1.782793,-0.453069,0.223803
5,0,0,0.433068,0,0,0,-1.067523,0.047926,0.052358
