In [2]:
import polars as pl
from polars import col as c

# Load the marketing data set (ad spend per channel plus sales) from the
# project-relative CSV into a polars DataFrame.
pl_mkt = pl.read_csv(source='data/marketing.csv')

# Plain-text preview of the loaded frame.
print(pl_mkt)

shape: (200, 4)
┌─────────┬──────────┬───────────┬───────┐
│ youtube ┆ facebook ┆ newspaper ┆ sales │
│ ---     ┆ ---      ┆ ---       ┆ ---   │
│ f64     ┆ f64      ┆ f64       ┆ f64   │
╞═════════╪══════════╪═══════════╪═══════╡
│ 276.12  ┆ 45.36    ┆ 83.04     ┆ 26.52 │
│ 53.4    ┆ 47.16    ┆ 54.12     ┆ 12.48 │
│ 20.64   ┆ 55.08    ┆ 83.16     ┆ 11.16 │
│ 181.8   ┆ 49.56    ┆ 70.2      ┆ 22.2  │
│ 216.96  ┆ 12.96    ┆ 70.08     ┆ 15.48 │
│ …       ┆ …        ┆ …         ┆ …     │
│ 45.84   ┆ 4.44     ┆ 16.56     ┆ 9.12  │
│ 113.04  ┆ 5.88     ┆ 9.72      ┆ 11.64 │
│ 212.4   ┆ 11.16    ┆ 7.68      ┆ 15.36 │
│ 340.32  ┆ 50.4     ┆ 79.44     ┆ 30.6  │
│ 278.52  ┆ 10.32    ┆ 10.44     ┆ 16.08 │
└─────────┴──────────┴───────────┴───────┘


# <span style="color:brown;">Feature Selection</span>

In [3]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# Score every predictor column against `sales` with f_regression.
# k='all' means no column is discarded -- the selector is used here only
# to obtain the per-feature scores.
k_best = SelectKBest(score_func=f_regression, k='all')
k_best.fit(X=pl_mkt.drop('sales'), y=pl_mkt['sales'])

# Tabulate the scores, highest first, and add the running share of the
# total score so the relative weight of each feature is visible.
pl_k_best_scores = (
    pl.DataFrame({
        'Feature_Name': pl_mkt.drop('sales').columns,
        'Score': k_best.scores_,
    })
    .sort('Score', descending=True)
    .with_columns(
        (c('Score').cum_sum() / c('Score').sum()).alias('Cumulative_Percentage')
    )
)

print(pl_k_best_scores)

shape: (3, 3)
┌──────────────┬────────────┬───────────────────────┐
│ Feature_Name ┆ Score      ┆ Cumulative_Percentage │
│ ---          ┆ ---        ┆ ---                   │
│ str          ┆ f64        ┆ f64                   │
╞══════════════╪════════════╪═══════════════════════╡
│ youtube      ┆ 312.144994 ┆ 0.740639              │
│ facebook     ┆ 98.421588  ┆ 0.974167              │
│ newspaper    ┆ 10.887299  ┆ 1.0                   │
└──────────────┴────────────┴───────────────────────┘


# <span style="color:brown;">Simple Linear Regression</span>

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out 20% of the rows for evaluation; random_state pins the shuffle
# so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    pl_mkt.select('youtube'), pl_mkt['sales'], test_size=0.2, random_state=1
)

# Fit a linear model of sales on the single `youtube` predictor,
# using the training rows only.
lm_yt_sales = LinearRegression()
lm_yt_sales.fit(x_train, y_train)

# Report the model's score on the held-out rows.
print(lm_yt_sales.score(x_test, y_test))

0.41535307148347855


# <span style="color:brown;">Multiple Linear Regression</span>

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out 20% of the rows for evaluation; random_state pins the shuffle
# so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    pl_mkt.select('youtube', 'facebook'), pl_mkt['sales'], test_size=0.2, random_state=1
)

# Fit a linear model of sales on the `youtube` and `facebook` predictors,
# using the training rows only.
lm_yt_fb_sales = LinearRegression()
lm_yt_fb_sales.fit(x_train, y_train)

# Report the model's score on the held-out rows.
print(lm_yt_fb_sales.score(x_test, y_test))

0.8947344950027067


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out 20% of the rows for evaluation; random_state pins the shuffle
# so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    pl_mkt.drop('sales'), pl_mkt['sales'], test_size=0.2, random_state=1
)

# Fit a linear model of sales on every non-target column,
# using the training rows only.
lm_yt_fb_np_sales = LinearRegression()
lm_yt_fb_np_sales.fit(x_train, y_train)

# Report the model's score on the held-out rows.
print(lm_yt_fb_np_sales.score(x_test, y_test))

0.8927605914615384
