In [21]:
import polars as pl
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [22]:
def clean_horsepower_column(ldf: pl.LazyFrame) -> pl.LazyFrame:
    """Return ``ldf`` with a numeric, mean-imputed ``horsepower`` column.

    The steps applied to ``horsepower`` are, in order:

    1. replace the literal ``'?'`` placeholder with null,
    2. cast the column to ``Float64``,
    3. fill the remaining nulls with the column mean.

    The expression keeps the name ``horsepower``, so the cleaned values
    take the place of the original column in the returned lazy frame.
    """
    raw_horsepower = pl.col('horsepower')
    placeholder_as_null = raw_horsepower.replace('?', None)
    numeric_horsepower = placeholder_as_null.cast(pl.Float64)
    mean_imputed = numeric_horsepower.fill_null(strategy='mean')
    return ldf.with_columns(mean_imputed)

In [23]:
def build_columns(ldf: pl.LazyFrame, col: str) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Materialise the feature column ``col`` and the ``mpg`` target from ``ldf``.

    Args:
        ldf: Lazy query that yields at least the columns ``col`` and ``mpg``.
        col: Name of the single feature column to extract.

    Returns:
        A tuple ``(X, y)`` of one-column DataFrames. ``X`` holds ``col`` with
        NaN values turned into nulls and all nulls filled with the column
        mean; ``y`` holds ``mpg`` with nulls filled with the column mean.
    """
    # dict.fromkeys de-duplicates while preserving order, so col == 'mpg'
    # does not request the same column twice.
    needed = list(dict.fromkeys((col, 'mpg')))
    # Collect exactly once, and only the columns that are used, so the lazy
    # plan (including the CSV scan) is not executed a second time for the target.
    df = ldf.select(needed).collect()
    X = df.select(pl.col(col).fill_nan(None).fill_null(strategy='mean'))
    y = df.select(pl.col('mpg').fill_null(strategy='mean'))
    return X, y

In [24]:
def build_and_fit_model(X: pl.DataFrame, y: pl.DataFrame, cat: str) -> None:
    """Fit a linear regression of ``y`` on ``X`` and print its R2 score.

    Args:
        X: Feature frame passed to ``LinearRegression.fit``.
        y: Target frame passed to ``LinearRegression.fit``.
        cat: Label of the feature being evaluated; used only in the printed message.

    Note:
        The score is computed on the same data the model was fitted on, so it
        is a training-set R2, not an estimate of out-of-sample performance.
    """
    model = LinearRegression()
    model.fit(X, y)
    # score() runs the prediction itself, so no separate predict() pass is needed.
    print(f'R2 score for category {cat}: {model.score(X, y)}')

In [25]:
# Candidate single-feature predictors of mpg, each evaluated on its own.
cats = ["horsepower", "weight", "acceleration", "displacement", "cylinders", 'origin']
ldf = pl.scan_csv('data/cars.csv', infer_schema_length=200)
# The cleaning step does not depend on the feature being modelled, so the
# cleaned lazy query is built once rather than on every loop iteration.
cleaned_ldf = clean_horsepower_column(ldf)
for cat in cats:
    X, y = build_columns(cleaned_ldf, col=cat)
    build_and_fit_model(X, y, cat)

R2 score for category horsepower: 0.595115253260946
R2 score for category weight: 0.6917929800341573
R2 score for category acceleration: 0.17664276963558911
R2 score for category displacement: 0.6467421834257859
R2 score for category cylinders: 0.6012393994439937
R2 score for category origin: 0.3174763079292734
