In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler



## Loading dataset

In [None]:
# Location of the input data, held as a Path object.
csv_path = Path("./data")

# Read the full dataset into a polars DataFrame.
df_total = pl.read_csv(csv_path)

In [None]:
# Column roles: explanatory (input) variables and the objective (target) variable.
col_exp_vars = ["orientation", "density", "diameter"]
col_obj_vars = ["conductivity"]

# Split the full table into an explanatory frame and an objective frame.
df_exp, df_obj = df_total.select(col_exp_vars), df_total.select(col_obj_vars)

# Preview the first rows of the complete dataset.
df_total.head()

In [None]:
# Pairwise correlation of every column in the dataset (computed via pandas).
corr_matrix = df_total.to_pandas().corr()

# Annotated heatmap of the correlation matrix, two decimals per cell.
plt.figure(figsize=(12, 9))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="cividis",
    linewidths=0.5,
)

## Data preprocessing

In [None]:
# Step 1: Exclusion of outliers
# Keep only rows in which every explanatory variable lies strictly below its
# own 95th percentile. `quantile` returns a one-row DataFrame, so the scalar
# threshold is extracted explicitly before it is compared with the column.
quantiles = df_exp.quantile(0.95)

df_prsd = df_total.filter(
    [pl.col(col) < quantiles[col][0] for col in col_exp_vars]
)

# Step 2: Convert DataFrame to array
# X holds exactly the columns listed in `col_exp_vars`, in that order, so the
# fitted coefficients line up with the labels used when they are reported.
X = df_prsd.select(col_exp_vars).to_numpy()
# y is selected as a one-column frame, so it stays 2-D: (n_samples, 1).
y = df_prsd.select(col_obj_vars).to_numpy()

# Step 3: Separate into training data and test data
# 80 % / 20 % split with a fixed seed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 4: Perform standardization
# The scaler is fitted on the training split only; the test split must be
# transformed with this same fitted scaler.
scaler = StandardScaler()
train_x_scaled = scaler.fit_transform(train_x)

## Model definition, training, and value prediction

In [None]:
# Define an ordinary least-squares linear model and fit it on the
# standardized training features.
model = LinearRegression().fit(train_x_scaled, train_y)

# Predict on the same training features the model was fitted on.
y_pred = model.predict(train_x_scaled)

## Checking Partial Regression Coefficients
The following values are the partial regression coefficients of each explanatory variable.

In [None]:
# One line per explanatory variable: name padded to 7 characters, followed by
# the coefficient with three decimals.
for name, coef in zip(col_exp_vars, model.coef_[0]):
    print("{0:7}: {1:6.3f}".format(name, coef))

Visualization of the partial regression coefficients as a bar chart.

In [None]:
# Bar chart of the partial regression coefficients, one bar per explanatory variable.
fig, ax = plt.subplots()
ax.bar(
    col_exp_vars,
    model.coef_[0],
    facecolor="None",
    edgecolor="black",
    hatch=".....",
)

# Axis labelling and fixed y-range.
ax.set_ylabel("Partial regression coefficient")
ax.set_xticks(range(len(col_exp_vars)), col_exp_vars, rotation=45)
ax.set_ylim(-1, 1)

# Cosmetics: hide the right/top frame lines, point y ticks inward, and draw
# horizontal grid lines behind the bars.
for side in ("right", "top"):
    ax.spines[side].set_visible(False)
ax.tick_params(axis="y", direction="in")
ax.grid(which="both", axis="y")
ax.set_axisbelow(True)

## Validation of the model
The model is evaluated using the coefficient of determination ($R^2$) and the mean squared error (MSE).

If $R^2$ exceeds 0.5, it can be said that some trend has been obtained.

If the MSE on the test data is much larger than the MSE on the training data, overfitting has occurred and the generalization performance is low.

In [None]:
# Predictions for the held-out split, scaled with the scaler fitted on the
# training split.
y_pred_test = model.predict(scaler.transform(test_x))

# R^2 is evaluated on the training split; MSE on both splits for comparison.
r2 = model.score(train_x_scaled, train_y)
mse_train = mean_squared_error(train_y, y_pred)
mse_test = mean_squared_error(test_y, y_pred_test)

# Report the metrics.
print("Coefficient of determination: ", r2)
print(f"Mean squared error of training data: {mse_train: 0.4f}")
print(f"Mean squared error of test data: {mse_test: 0.4f}")