# Лабораторная работа 6

**Тестирование качества работы моделей машинного обучения**

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import os

os.makedirs("lab5", exist_ok=True)

iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df["target"] = iris.target

# Создание трех "качественных" датасетов с зависимостями
df1 = df[df["target"] == 0].copy()
df1["y"] = df1["sepal length (cm)"] * 2.5 + np.random.normal(0, 0.1, size=len(df1))

df2 = df[df["target"] == 1].copy()
df2["y"] = df2["sepal width (cm)"] * -1.5 + 5 + np.random.normal(0, 0.1, size=len(df2))

df3 = df[df["target"] == 2].copy()
df3["y"] = df3["petal length (cm)"] * 1.2 + np.random.normal(0, 0.1, size=len(df3))

# Создание шумного датасета 
xs = df1["sepal length (cm)"].values
ys = xs + np.random.normal(0, 0.1, size=len(xs))

ys[25:35] *= 2

df_noise = df1.copy()
df_noise["y"] = ys

df1.to_csv("lab5/df1.csv", index=False)
df2.to_csv("lab5/df2.csv", index=False)
df3.to_csv("lab5/df3.csv", index=False)
df_noise.to_csv("lab5/df_noise.csv", index=False)


In [2]:
def evaluate_model(file_path, feature_col):
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score
    import pandas as pd

    df = pd.read_csv(file_path)
    X = df[[feature_col]]
    y = df["y"]

    model = LinearRegression().fit(X, y)
    y_pred = model.predict(X)

    r2 = r2_score(y, y_pred)
    mse = mean_squared_error(y, y_pred)

    print(f"Оценка модели на {file_path}")
    print(f"R²: {r2:.4f}")


In [3]:
evaluate_model("lab5/df1.csv", "sepal length (cm)")
evaluate_model("lab5/df2.csv", "sepal width (cm)")
evaluate_model("lab5/df3.csv", "petal length (cm)")
evaluate_model("lab5/df_noise.csv", "sepal length (cm)")

Оценка модели на lab5/df1.csv
R²: 0.9890
Оценка модели на lab5/df2.csv
R²: 0.9559
Оценка модели на lab5/df3.csv
R²: 0.9797
Оценка модели на lab5/df_noise.csv
R²: 0.0902


In [4]:
%%writefile lab5/test_lab5.py
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.datasets import load_iris

def load_and_test(file_path, feature_col="sepal length (cm)", r2_threshold=0.9):
    df = pd.read_csv(file_path)
    X = df[[feature_col]]
    y = df["y"]
    model = LinearRegression().fit(X, y)
    y_pred = model.predict(X)
    r2 = r2_score(y, y_pred)
    return r2

def test_df1():
    assert load_and_test("lab5/df1.csv") > 0.9

def test_df2():
    assert load_and_test("lab5/df2.csv", feature_col="sepal width (cm)", r2_threshold=0.85) > 0.9

def test_df3():
    assert load_and_test("lab5/df3.csv", feature_col="petal length (cm)", r2_threshold=0.85) > 0.9

def test_df_noise():
    assert load_and_test("lab5/df_noise.csv") > 0.9


Overwriting lab5/test_lab5.py


In [5]:
!pytest lab5/test_lab5.py -v

platform win32 -- Python 3.11.9, pytest-8.3.5, pluggy-1.6.0 -- C:\Users\stepa\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe
cachedir: .pytest_cache
rootdir: c:\Users\stepa\Downloads
plugins: hydra-core-1.3.2
[1mcollecting ... [0mcollected 4 items

lab5/test_lab5.py::test_df1 [32mPASSED[0m[32m                                       [ 25%][0m
lab5/test_lab5.py::test_df2 [32mPASSED[0m[32m                                       [ 50%][0m
lab5/test_lab5.py::test_df3 [32mPASSED[0m[32m                                       [ 75%][0m
lab5/test_lab5.py::test_df_noise [31mFAILED[0m[31m                                  [100%][0m

[31m[1m________________________________ test_df_noise ________________________________[0m

    [0m[94mdef[39;49;00m [92mtest_df_noise[39;49;00m():[90m[39;49;00m
>       [94massert[39;49;00m load_and_test([33m"[39;49;00m[33mlab5/df_noise.csv[39;49;00m[33m"[39;49;00m) > [94m0.9[39;49;00