In [7]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error, r2_score

In [12]:
# Load the breast-cancer dataset used for the classification experiments.
data = load_breast_cancer()
X = data.data
y = data.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: 3-nearest-neighbours on the unscaled features, no hyper-parameter search.
knn_classifier_baseline = KNeighborsClassifier(n_neighbors=3)
knn_classifier_baseline.fit(X_train, y_train)

# Score the baseline on the held-out samples.
y_pred_baseline = knn_classifier_baseline.predict(X_test)
accuracy_baseline, precision_baseline, recall_baseline, f1_baseline = (
    metric(y_test, y_pred_baseline)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    "Classification Metrics:\n"
    f"Accuracy: {accuracy_baseline}\n"
    f"Precision: {precision_baseline}\n"
    f"Recall: {recall_baseline}\n"
    f"F1 Score: {f1_baseline}"
)

# Load the regression dataset. Note: the `*_boston` variable names are kept
# unchanged, but the data comes from fetch_california_housing().
boston = fetch_california_housing()
X_boston = boston.data
y_boston = boston.target

# 80/20 train/test split with a fixed seed.
X_train_boston, X_test_boston, y_train_boston, y_test_boston = train_test_split(
    X_boston, y_boston, test_size=0.2, random_state=42
)

# Standardise the features: the scaler is fitted on the training split only and
# then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train_boston)
X_train_boston = scaler.transform(X_train_boston)
X_test_boston = scaler.transform(X_test_boston)

# Fit a 3-nearest-neighbours regressor on the standardised training data.
knn_regressor = KNeighborsRegressor(n_neighbors=3)
knn_regressor.fit(X_train_boston, y_train_boston)

# Evaluate the regressor on the held-out split.
y_pred_boston = knn_regressor.predict(X_test_boston)
mae, mse, r2 = (
    metric(y_test_boston, y_pred_boston)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(
    "\nRegression Metrics:\n"
    f"MAE: {mae}\n"
    f"MSE: {mse}\n"
    f"R-squared: {r2}"
)

Classification Metrics:
Accuracy: 0.9298245614035088
Precision: 0.9315068493150684
Recall: 0.9577464788732394
F1 Score: 0.9444444444444444

Regression Metrics:
MAE: 0.4599198611111111
MSE: 0.4666634350517549
R-squared: 0.6438795499720962


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures

In [11]:
# Tuned classifier: feature standardisation followed by k-NN, with the k-NN
# hyper-parameters selected by a cross-validated grid search.
# Uses X_train / X_test / y_train / y_test created by the earlier split cell.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier()),
    ]
)

param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2],  # 1 = Manhattan distance, 2 = Euclidean distance
}

# 5-fold cross-validation, ranking parameter combinations by accuracy.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Take the best pipeline found by the search.
best_classifier = grid_search.best_estimator_

# Score the tuned pipeline on the held-out samples.
y_pred_improved = best_classifier.predict(X_test)
accuracy_improved, precision_improved, recall_improved, f1_improved = (
    metric(y_test, y_pred_improved)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    "Improved Classification Metrics:\n"
    f"Accuracy: {accuracy_improved}\n"
    f"Precision: {precision_improved}\n"
    f"Recall: {recall_improved}\n"
    f"F1 Score: {f1_improved}"
)

# Load the regression dataset again. Note: the `*_boston` variable names are
# kept unchanged, but the data comes from fetch_california_housing().
boston = fetch_california_housing()
X_boston = boston.data
y_boston = boston.target

# 80/20 train/test split with a fixed seed.
X_train_boston, X_test_boston, y_train_boston, y_test_boston = train_test_split(
    X_boston, y_boston, test_size=0.2, random_state=42
)

# Regression pipeline: median imputation -> standardisation -> degree-2
# polynomial features (no bias column) -> k-NN regressor.
pipeline_boston = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('polynomial', PolynomialFeatures(degree=2, include_bias=False)),
        ('knn', KNeighborsRegressor()),
    ]
)

# Hyper-parameter grid for the k-NN step.
param_grid_reg = {
    'knn__n_neighbors': [3, 5, 7],
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2],  # 1 = Manhattan distance, 2 = Euclidean distance
}

# 5-fold cross-validation, ranking combinations by negated mean squared error.
grid_search_boston = GridSearchCV(
    estimator=pipeline_boston,
    param_grid=param_grid_reg,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search_boston.fit(X_train_boston, y_train_boston)

# Take the best pipeline found by the search.
best_regressor = grid_search_boston.best_estimator_

# Evaluate the tuned pipeline on the held-out split.
y_pred_boston = best_regressor.predict(X_test_boston)
mae, mse, r2 = (
    metric(y_test_boston, y_pred_boston)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(
    "\nImproved Regression Metrics:\n"
    f"MAE: {mae}\n"
    f"MSE: {mse}\n"
    f"R-squared: {r2}"
)

Improved Classification Metrics:
Accuracy: 0.9649122807017544
Precision: 0.971830985915493
Recall: 0.971830985915493
F1 Score: 0.971830985915493

Improved Regression Metrics:
MAE: 0.41595939485911765
MSE: 0.38251670809373284
R-squared: 0.7080936452318685


In [14]:
from collections import Counter


class CustomKNNClassifier:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Parameters
    ----------
    n_neighbors : int, default=3
        Number of nearest training samples that vote on each prediction. If it
        exceeds the number of training samples, all training samples vote.
    """

    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        """Store the training set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features. Converted to a float array so that list input
            works and unsigned-integer data cannot wrap around when distances
            are computed.
        y : array-like of shape (n_samples,)
            Training labels.

        Returns
        -------
        self : CustomKNNClassifier
            The fitted classifier, to allow call chaining.

        Raises
        ------
        ValueError
            If ``n_neighbors`` is smaller than 1, ``X`` is not a non-empty 2-D
            array, or ``X`` and ``y`` have different numbers of samples.
        """
        if self.n_neighbors < 1:
            raise ValueError(
                f"n_neighbors must be at least 1, got {self.n_neighbors}"
            )

        X_arr = np.asarray(X, dtype=float)
        y_arr = np.asarray(y)
        if X_arr.ndim != 2 or X_arr.shape[0] == 0:
            raise ValueError("X must be a non-empty 2-D array-like")
        if y_arr.ndim == 0 or y_arr.shape[0] != X_arr.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        self.X_train = X_arr
        self.y_train = y_arr
        return self

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)
            Samples to classify.

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
            Predicted labels, in the order of the rows of ``X``.
        """
        X_arr = np.asarray(X, dtype=float)
        predictions = [self._predict(x) for x in X_arr]
        return np.array(predictions)

    def _predict(self, x):
        """Return the majority label among the nearest training samples of ``x``."""
        # One vectorised call instead of a Python loop over the training rows.
        distances = np.linalg.norm(self.X_train - x, axis=1)
        # A stable sort makes tie-breaking deterministic: among equidistant
        # points the one that appears first in the training set wins.
        k_indices = np.argsort(distances, kind="stable")[:self.n_neighbors]
        k_nearest_labels = self.y_train[k_indices]
        # On a vote tie, Counter returns the label that was seen first, i.e.
        # the label of the closest neighbour among the tied classes.
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

In [15]:
# Reload the breast-cancer data and recreate the 80/20 split (seed 42).
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the hand-written k-NN classifier and predict the held-out samples.
knn = CustomKNNClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Evaluate the predictions.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    "Custom KNN Classification Metrics:\n"
    f"Accuracy: {accuracy}\n"
    f"Precision: {precision}\n"
    f"Recall: {recall}\n"
    f"F1 Score: {f1}"
)

Custom KNN Classification Metrics:
Accuracy: 0.9298245614035088
Precision: 0.9315068493150684
Recall: 0.9577464788732394
F1 Score: 0.9444444444444444


In [16]:
class CustomLinearRegression:
    """Ordinary least-squares linear regression with an intercept term.

    Attributes are ``None`` until ``fit`` has been called:

    * ``coef_`` - weights of the features.
    * ``intercept_`` - constant term.
    """

    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X, y):
        """Estimate the intercept and coefficients by least squares.

        The system is solved with ``np.linalg.lstsq`` rather than by inverting
        ``X_b.T @ X_b``, which fails on collinear features and is numerically
        unstable. For rank-deficient data the minimum-norm solution is used.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or (n_samples,)
            Training features; a 1-D input is treated as a single feature.
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values.

        Returns
        -------
        self : CustomLinearRegression
            The fitted model, to allow call chaining.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` have different numbers of
            samples.
        """
        X_arr = np.asarray(X, dtype=float)
        y_arr = np.asarray(y, dtype=float)
        if X_arr.ndim == 0 or X_arr.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if y_arr.ndim == 0 or y_arr.shape[0] != X_arr.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        # Prepend a column of ones so the first parameter is the intercept.
        X_b = np.c_[np.ones((X_arr.shape[0], 1)), X_arr]
        theta_best, *_ = np.linalg.lstsq(X_b, y_arr, rcond=None)
        self.intercept_ = theta_best[0]
        self.coef_ = theta_best[1:]
        return self

    def predict(self, X):
        """Predict targets for ``X`` with the fitted linear model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or (n_samples,)
            Samples to predict; a 1-D input is treated as a single feature.

        Returns
        -------
        numpy.ndarray
            Predicted values, one per row of ``X``.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if self.coef_ is None or self.intercept_ is None:
            raise ValueError("fit must be called before predict")
        X_arr = np.asarray(X, dtype=float)
        if X_arr.ndim == 1:
            # Same convention as fit: a 1-D input is a single feature column.
            X_arr = X_arr.reshape(-1, 1)
        # Adding the intercept separately also works for multi-target fits,
        # where coef_ is 2-D and intercept_ is a vector.
        return X_arr.dot(self.coef_) + self.intercept_

In [17]:
# Load the data via fetch_california_housing() and make an 80/20 split (seed 42).
data_reg = fetch_california_housing()
X_reg = data_reg.data
y_reg = data_reg.target
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Train the hand-written linear regression and predict the held-out samples.
lin_reg = CustomLinearRegression()
lin_reg.fit(X_train_reg, y_train_reg)
y_pred_reg = lin_reg.predict(X_test_reg)

# Evaluate the predictions.
mae, mse, r2 = (
    metric(y_test_reg, y_pred_reg)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(
    "Custom Linear Regression Metrics:\n"
    f"MAE: {mae}\n"
    f"MSE: {mse}\n"
    f"R-squared: {r2}"
)

Custom Linear Regression Metrics:
MAE: 0.5332001304966606
MSE: 0.5558915986946272
R-squared: 0.5757877060329217


In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

class ImprovedCustomKNNClassifier(CustomKNNClassifier):
    """CustomKNNClassifier that standardises the features before searching neighbours."""

    def fit(self, X, y):
        """Fit a StandardScaler on X, then fit the parent classifier on the scaled X."""
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        # Keep the fitted scaler so predict() applies the same transformation.
        self.scaler = scaler
        super().fit(X_scaled, y)

    def predict(self, X):
        """Scale X with the scaler fitted in fit() and delegate to the parent."""
        return super().predict(self.scaler.transform(X))

# Train the standardising k-NN variant on the existing classification split
# (X_train / X_test / y_train / y_test come from an earlier cell).
improved_knn = ImprovedCustomKNNClassifier(n_neighbors=3)
improved_knn.fit(X_train, y_train)
y_pred_improved = improved_knn.predict(X_test)

# Evaluate the predictions.
accuracy_improved, precision_improved, recall_improved, f1_improved = (
    metric(y_test, y_pred_improved)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    "Improved Custom KNN Classification Metrics:\n"
    f"Accuracy: {accuracy_improved}\n"
    f"Precision: {precision_improved}\n"
    f"Recall: {recall_improved}\n"
    f"F1 Score: {f1_improved}"
)

Improved Custom KNN Classification Metrics:
Accuracy: 0.9473684210526315
Precision: 0.9577464788732394
Recall: 0.9577464788732394
F1 Score: 0.9577464788732394


In [19]:
class ImprovedCustomLinearRegression(CustomLinearRegression):
    """CustomLinearRegression that standardises the features before fitting.

    Note: ordinary least squares with an intercept is invariant to per-feature
    affine rescaling, so predictions are expected to match the unscaled parent
    model up to floating-point error.
    """

    def fit(self, X, y):
        """Fit a StandardScaler on X, then fit the parent model on the scaled X."""
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        # Keep the fitted scaler so predict() applies the same transformation.
        self.scaler = scaler
        super().fit(X_scaled, y)

    def predict(self, X):
        """Scale X with the scaler fitted in fit() and delegate to the parent."""
        return super().predict(self.scaler.transform(X))

# Train the standardising linear regression on the existing regression split
# (X_train_reg / X_test_reg / y_train_reg / y_test_reg come from an earlier cell).
improved_lin_reg = ImprovedCustomLinearRegression()
improved_lin_reg.fit(X_train_reg, y_train_reg)
y_pred_reg_improved = improved_lin_reg.predict(X_test_reg)

# Evaluate the predictions.
mae_improved, mse_improved, r2_improved = (
    metric(y_test_reg, y_pred_reg_improved)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(
    "Improved Custom Linear Regression Metrics:\n"
    f"MAE: {mae_improved}\n"
    f"MSE: {mse_improved}\n"
    f"R-squared: {r2_improved}"
)

Improved Custom Linear Regression Metrics:
MAE: 0.5332001304956562
MSE: 0.5558915986952448
R-squared: 0.5757877060324503
