# Homework 02


In [1]:
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd
import polars as pl

from ml_zoomcamp.regression import (
    prepare_X,
    rmse,
    train_linear_regression,
    train_val_test_split,
)
from ml_zoomcamp.utils import clean_column_names, load_data

# Project root: the parent of the current working directory.
ROOT_DIR = Path.cwd().parent
# Data directory under the project root; handed to `load_data` together with the CSV URL.
DATA_DIR = ROOT_DIR / "data"

In [2]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    NOTE(review): this definition shadows the `rmse` imported from
    `ml_zoomcamp.regression` in the first cell; every later cell therefore
    calls this local version.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, broadcastable against `y`.

    Returns
    -------
    numpy.float64
        Square root of the mean of the squared element-wise differences.
    """
    # Coerce to float arrays so plain Python sequences work as well; the
    # subtraction below would otherwise fail for lists or tuples.
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y - y_pred) ** 2))


### Getting the data


In [3]:
# Load the laptops CSV (the helper receives the URL and the local data
# directory) and pass the result straight through `clean_column_names`.
csv_uri = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv"
df = clean_column_names(load_data(csv_uri, DATA_DIR))

#### Cleanup Columns


In [4]:
# Columns kept for the homework; "final_price" is the column name later
# passed to `train_val_test_split`.
base = ["ram", "storage", "screen", "final_price"]

In [5]:
# Restrict the frame to the columns listed in `base`.
df = df.select(base)

## EDA


In [6]:
# Shape statistics of the target column. Single quotes are used inside the
# replacement fields so the f-strings also parse on Python versions before
# 3.12, where reusing the enclosing quote type is a syntax error.
print(f"Skewness: {df.get_column('final_price').skew()}")
print(f"Kurtosis: {df.get_column('final_price').kurtosis()}")

Skewness: 1.6484419982102287
Kurtosis: 3.4306207518398484


In [7]:
# Histogram of the raw target, binned into at most 50 bins.
alt.Chart(df).mark_bar().encode(
    x=alt.X("final_price:Q", bin=alt.BinParams(maxbins=50)),
    y=alt.Y("count()"),
)

In [8]:
# Same histogram after a log1p transform of the target. The axis title is set
# explicitly because the transformed column keeps the name "final_price",
# which would otherwise label log-scale values as raw prices.
alt.Chart(df.with_columns(pl.col("final_price").log1p())).mark_bar().encode(
    alt.X(
        "final_price:Q",
        bin=alt.BinParams(maxbins=50),
        title="log1p(final_price) (binned)",
    ),
    alt.Y("count()"),
)

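`final_price` has positive skewness (≈ 1.65) and a positive excess kurtosis (≈ 3.43; polars reports Fisher kurtosis by default, for which a normal distribution scores 0), so its distribution has a long right tail.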


### 1. Column with missing values


In [9]:
# One row per column with its null count, keeping only columns that have nulls.
(
    df.null_count()
    .transpose(include_header=True, column_names=["null_count"])
    .filter(pl.col("null_count").gt(0))
)

column,null_count
str,u32
"""screen""",4


### 2. Median for variable `ram`


In [10]:
# Median of the `ram` column.
df["ram"].median()

16.0

## Setting Up Validation Framework


In [11]:
# Split into train / validation / test frames plus their matching target
# arrays. No third argument is given here, so the helper's default is used
# (the seed experiment further down passes a seed in that position).
df_train, df_val, df_test, y_train, y_val, y_test = train_val_test_split(
    df, "final_price"
)

## 3. Dealing with missing values


In [12]:
# Mean of `screen` taken from the training split only; used below as the
# second argument to `prepare_X` for the mean-imputation experiment.
screen_mean = df_train["screen"].mean()
screen_mean

15.163534416086618

Imputing with 0


In [13]:
# Experiment A: pass 0 as the second argument of `prepare_X`
# (presumably the fill value for missing entries, per the "Imputing with 0" note).
X_train = prepare_X(df_train, 0)
w0, w = train_linear_regression(X_train, y_train)

# Build validation features with the same second argument, then predict as
# bias + X · weights.
X_val = prepare_X(df_val, 0)
y_pred = w0 + X_val @ w

# Validation RMSE rounded to two decimals.
round(rmse(y_val, y_pred), 2)

np.float64(597.36)

Imputing with the mean of screen


In [14]:
# Experiment B: pass the training-split mean of `screen` as the second
# argument of `prepare_X` (presumably the fill value for missing entries).
X_train = prepare_X(df_train, screen_mean)
w0, w = train_linear_regression(X_train, y_train)

# The validation features reuse the training mean, not a mean computed on
# the validation split.
X_val = prepare_X(df_val, screen_mean)
y_pred = w0 + X_val @ w

# Validation RMSE rounded to two decimals.
round(rmse(y_val, y_pred), 2)

np.float64(600.27)

Imputing with 0 resulted in a slightly lower validation RMSE (597.36) than imputing with the mean of `screen` (600.27).


## 4. Regularized linear regression


In [15]:
# Regularisation strengths to compare, using features built with 0 as the
# second argument of `prepare_X`.
list_r = [0, 0.01, 0.1, 1, 5, 10, 100]

# The feature matrices do not depend on `r`, so they are built once here
# instead of being rebuilt on every loop iteration.
X_train = prepare_X(df_train, 0)
X_val = prepare_X(df_val, 0)

for r in list_r:
    # Fit with the current regularisation strength and score on validation.
    w0, w = train_linear_regression(X_train, y_train, r=r)
    y_pred = w0 + X_val @ w

    score = round(rmse(y_val, y_pred), 2)

    print(f"{r:<4}: {score}")


0   : 597.36
0.01: 597.36
0.1 : 597.35
1   : 597.21
5   : 597.01
10  : 597.06
100 : 597.9


## 5. Testing different seed values


In [16]:
# Seeds 0 through 9, each producing a different train/val/test split.
seeds = list(range(10))

# Validation RMSE collected per seed (unrounded).
rmses = []
for seed in seeds:
    # Re-split with this seed; only the train and validation parts are used here.
    df_train, df_val, df_test, y_train, y_val, y_test = train_val_test_split(
        df, "final_price", seed
    )

    # Fit on the training part, with 0 as the second argument of `prepare_X`.
    X_train = prepare_X(df_train, 0)
    w0, w = train_linear_regression(X_train, y_train)

    # Predict on the validation part.
    X_val = prepare_X(df_val, 0)
    y_pred = w0 + X_val @ w

    score = rmse(y_val, y_pred)
    rmses.append(score)

# Spread of the validation scores across seeds.
score_std = np.std(rmses)
round(score_std, 3)

np.float64(29.176)

## 6. Evaluating the final model on the test set


In [17]:
# Re-create the split with seed 9 so the test part is the one being scored.
df_train, df_val, df_test, y_train, y_val, y_test = train_val_test_split(
    df, "final_price", 9
)

# Merge train and validation into one training set. `np.concatenate` is used
# rather than `np.concat` because the latter only exists in NumPy 2.0 and later.
df_full_train = df_train.vstack(df_val)
y_full_train = np.concatenate([y_train, y_val])

# Fit on the merged data with a small regularisation term.
X_full_train = prepare_X(df_full_train, 0)
w0, w = train_linear_regression(X_full_train, y_full_train, r=0.001)

# Score on the held-out test part.
X_test = prepare_X(df_test, 0)
y_pred = w0 + X_test @ w

rmse(y_test, y_pred)

np.float64(608.609982204956)