In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.spatial import KDTree

KNN классификатор

In [17]:
class KNNClassifier:
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)
        self.tree = KDTree(self.X_train)
        return self

    def predict(self, X):
        X = np.asarray(X)
        dists, idxs = self.tree.query(X, k=self.n_neighbors)

        preds = []
        for idx in idxs:
            labels = self.y_train[idx]
            vals, cnts = np.unique(labels, return_counts=True)
            preds.append(vals[np.argmax(cnts)])

        return np.array(preds)

KNN регрессор

In [18]:
class KNNRegressor:
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)
        self.tree = KDTree(self.X_train)
        return self

    def predict(self, X):
        X = np.asarray(X)
        dists, idxs = self.tree.query(X, k=self.n_neighbors)
        return np.mean(self.y_train[idxs], axis=1)

Загрузка датасета классификации

In [19]:
df_fraud = pd.read_csv('creditcard.csv')

Xc = df_fraud.drop('Class', axis=1)
yc = df_fraud['Class']

Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc, yc, test_size=0.2, random_state=42, stratify=yc
)

Запуск классификации sklearn

In [20]:
k = 5
sk_clf = KNeighborsClassifier(n_neighbors=k)
sk_clf.fit(Xc_train, yc_train)
yc_pred_sk = sk_clf.predict(Xc_test)

Запуск собственной классификации

In [21]:
my_clf = KNNClassifier(n_neighbors=k)
my_clf.fit(Xc_train, yc_train)
yc_pred_my = my_clf.predict(Xc_test)

Метрики

In [22]:

print(f"Sklearn accuracy={accuracy_score(yc_test, yc_pred_sk):.4f}, f1={f1_score(yc_test, yc_pred_sk, average='weighted'):.4f}, recall={recall_score(yc_test, yc_pred_sk, pos_label=1)}")
print(f"Custom  accuracy={accuracy_score(yc_test, yc_pred_my):.4f}, f1={f1_score(yc_test, yc_pred_my, average='weighted'):.4f}, recall={recall_score(yc_test, yc_pred_my, pos_label=1)}")

Sklearn accuracy=0.9983, f1=0.9975, recall=0.030612244897959183
Custom  accuracy=0.9983, f1=0.9975, recall=0.030612244897959183


Регрессия

Загрузка датасета

In [23]:
df_crop = pd.read_csv('crop_yield.csv')

Xr = df_crop.drop('Yield_tons_per_hectare', axis=1)
yr = df_crop['Yield_tons_per_hectare']

Xr = pd.get_dummies(Xr, drop_first=True)

Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr, yr, test_size=0.2, random_state=42
)


Запуск регресии sklearn

In [24]:
sk_reg = KNeighborsRegressor(n_neighbors=k)
sk_reg.fit(Xr_train, yr_train)
yr_pred_sk = sk_reg.predict(Xr_test)

Запуск собственной регресии

In [25]:
my_reg = KNNRegressor(n_neighbors=k)
my_reg.fit(Xr_train, yr_train)
yr_pred_my = my_reg.predict(Xr_test)

Метрики

In [26]:

print(f"SkLearn RMSE={np.sqrt(mean_squared_error(yr_test, yr_pred_sk)):.4f}, R2={r2_score(yr_test, yr_pred_sk):.4f}")
print(f"Custom  RMSE={np.sqrt(mean_squared_error(yr_test, yr_pred_my)):.4f}, R2={r2_score(yr_test, yr_pred_my):.4f}")

SkLearn RMSE=0.9842, R2=0.6640
Custom  RMSE=0.9842, R2=0.6640


Теперь применим улучшения полученные на этапе анализа данных

Классификация

Скейлинг

In [27]:
scaler = StandardScaler()
Xc_train_scaled = scaler.fit_transform(Xc_train)
Xc_test_scaled = scaler.transform(Xc_test)

PCA для снижения размерности

In [28]:
pca = PCA(n_components=0.95)
Xc_train_pca = pca.fit_transform(Xc_train_scaled)
Xc_test_pca = pca.transform(Xc_test_scaled)

sklearn

In [29]:
k = 5
sk_clf = KNeighborsClassifier(n_neighbors=k)
sk_clf.fit(Xc_train_pca, yc_train)
yc_pred_sk = sk_clf.predict(Xc_test_pca)

Оценка качества sklearn

In [30]:

print(f"accuracy={accuracy_score(yc_test, yc_pred_sk):.4f}")
print(f"f1={f1_score(yc_test, yc_pred_sk, average='weighted'):.4f}")
print(f"recall={recall_score(yc_test, yc_pred_sk, pos_label=1)}")


accuracy=0.9995
f1=0.9995
recall=0.7857142857142857


Собственная реализация

In [33]:
k = 5
my_clf = KNNClassifier(n_neighbors=k)
my_clf.fit(Xc_train_pca, yc_train)
yc_pred_my = my_clf.predict(Xc_test_pca)

Оценка качества собственная реализация

In [34]:
print(f"accuracy={accuracy_score(yc_test, yc_pred_my):.4f}")
print(f"f1={f1_score(yc_test, yc_pred_my, average='weighted'):.4f}")
print(f"recall={recall_score(yc_test, yc_pred_sk, pos_label=1)}")

accuracy=0.9995
f1=0.9995
recall=0.7857142857142857


Регрессия

In [35]:
Xr = df_crop.drop('Yield_tons_per_hectare', axis=1)
yr = df_crop['Yield_tons_per_hectare']

Разделим признаки по типам

In [36]:
num_cols = ['Rainfall_mm', 'Temperature_Celsius', 'Days_to_Harvest']
cat_cols = ['Region', 'Soil_Type', 'Crop',
            'Fertilizer_Used', 'Irrigation_Used', 'Weather_Condition']

Скейлинг и one hot

In [37]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first'), cat_cols)
    ]
)
Xr_processed = preprocessor.fit_transform(Xr)

Разделение на выборки

In [38]:
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr_processed, yr, test_size=0.2, random_state=42
)

PCA

In [39]:
pca = PCA(n_components=0.95)
Xr_train_pca = pca.fit_transform(Xr_train)
Xr_test_pca = pca.transform(Xr_test)

skLearn

In [40]:
k = 5
sk_r = KNeighborsRegressor(n_neighbors=k)
sk_r.fit(Xr_train_pca, yr_train)
yc_pred_r = sk_r.predict(Xr_test_pca)

Оценка результата

In [41]:

print(f"RMSE={np.sqrt(mean_squared_error(yr_test, yc_pred_r)):.4f}")
print(f"R2={r2_score(yr_test, yc_pred_r):.4f}")

RMSE=0.5727
R2=0.8862


Собственная реализация

In [42]:
k = 5
my_r = KNNRegressor(n_neighbors=k)
my_r.fit(Xr_train_pca, yr_train)
yr_pred_my = my_r.predict(Xr_test_pca)

Оценка результата

In [43]:
print(f"RMSE={np.sqrt(mean_squared_error(yr_test, yr_pred_my)):.4f}")
print(f"R2={r2_score(yr_test, yr_pred_my):.4f}")

RMSE=0.5727
R2=0.8862
