In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from imblearn.over_sampling import SMOTE

Линейная регрессия

In [3]:
class MyLinearRegression:
    """Linear regression trained with full-batch gradient descent.

    The intercept is learned together with the feature weights: a column of
    ones is prepended to the design matrix in both ``fit`` and ``predict``.

    Parameters
    ----------
    lr : float, default=0.01
        Step size applied to the gradient on every iteration.
    n_iter : int, default=1000
        Number of full-batch gradient steps performed by ``fit``.

    Attributes
    ----------
    w : ndarray of shape (n_features + 1,)
        Set by ``fit``. ``w[0]`` is the intercept, ``w[1:]`` are the
        per-feature weights.
    """

    def __init__(self, lr=0.01, n_iter=1000):
        self.lr = lr
        self.n_iter = n_iter

    @staticmethod
    def _with_bias(X):
        """Return ``X`` with a leading column of ones (the intercept feature)."""
        return np.hstack([np.ones((X.shape[0], 1)), X])

    def fit(self, X, y):
        """Fit the weights by gradient descent starting from zeros.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix; converted to ``float``.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Targets; converted to ``float`` and flattened to 1-D.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, the dataset is empty, or ``X`` and ``y``
            disagree on the number of samples.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D, got array with shape {X.shape}")

        # A column-vector target of shape (n, 1) would broadcast against the
        # 1-D prediction into an (n, n) residual, so flatten it up front.
        y = np.asarray(y, dtype=float).reshape(-1)

        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("cannot fit on an empty dataset")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} values"
            )

        Xb = self._with_bias(X)
        self.w = np.zeros(Xb.shape[1], dtype=float)

        for _ in range(self.n_iter):
            residual = Xb @ self.w - y
            # Gradient of the halved mean squared error; dividing the small
            # gradient vector avoids rescaling the whole matrix every step.
            self.w -= self.lr * (Xb.T @ residual) / n_samples

        return self

    def predict(self, X):
        """Return predictions ``[1, X] @ w`` for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
        """
        X = np.asarray(X, dtype=float)
        return self._with_bias(X) @ self.w


Softmax регрессия

In [4]:
class SoftmaxRegression:
    """Multinomial logistic (softmax) regression trained by full-batch gradient descent.

    A column of ones is prepended to the features, so the first row of
    ``W`` holds the per-class intercepts.

    Parameters
    ----------
    lr : float, default=0.1
        Step size applied to the gradient on every iteration.
    n_iter : int, default=1000
        Number of full-batch gradient steps performed by ``fit``.

    Attributes
    ----------
    classes_ : ndarray
        Sorted unique labels seen in ``fit``.
    W : ndarray of shape (n_features + 1, n_classes)
        Weight matrix set by ``fit``; one column per entry of ``classes_``.
    """

    def __init__(self, lr=0.1, n_iter=1000):
        self.lr = lr
        self.n_iter = n_iter

    @staticmethod
    def _with_bias(X):
        """Return ``X`` with a leading column of ones (the intercept feature)."""
        return np.hstack([np.ones((X.shape[0], 1)), X])

    def _softmax(self, z):
        """Row-wise softmax of ``z``."""
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large scores.
        shifted = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(shifted)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def _build_label_mapping(self, y):
        """Store the sorted unique labels and a label -> column-index dict."""
        self.classes_ = np.unique(y)
        self._label_to_index = {lab: i for i, lab in enumerate(self.classes_)}

    def _one_hot(self, y):
        """Encode labels as a ``(n_samples, n_classes)`` 0/1 matrix.

        Raises
        ------
        KeyError
            If ``y`` contains a label that is not in ``classes_``.
        """
        y = np.asarray(y).reshape(-1)
        n_classes = len(self.classes_)

        # classes_ comes from np.unique and is therefore sorted, so a binary
        # search yields each label's column without a per-row Python loop.
        cols = np.minimum(np.searchsorted(self.classes_, y), n_classes - 1)
        unknown = self.classes_[cols] != y
        if np.any(unknown):
            raise KeyError(y[unknown][0])

        y_oh = np.zeros((y.shape[0], n_classes))
        y_oh[np.arange(y.shape[0]), cols] = 1
        return y_oh

    def fit(self, X, y):
        """Fit the weight matrix by gradient descent starting from zeros.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix; converted to ``float`` so that object or boolean
            input is handled the same way as in ``MyLinearRegression``.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Class labels; flattened to 1-D.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, the dataset is empty, or ``X`` and ``y``
            disagree on the number of samples.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D, got array with shape {X.shape}")
        y = np.asarray(y).reshape(-1)

        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("cannot fit on an empty dataset")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} labels"
            )

        self._build_label_mapping(y)
        y_onehot = self._one_hot(y)

        Xb = self._with_bias(X)
        self.W = np.zeros((Xb.shape[1], len(self.classes_)))

        for _ in range(self.n_iter):
            probs = self._softmax(Xb @ self.W)
            # Gradient of the mean cross-entropy with respect to W.
            self.W -= self.lr * (Xb.T @ (probs - y_onehot)) / n_samples

        return self

    def predict_proba(self, X):
        """Return class probabilities, one row per sample, columns ordered as ``classes_``."""
        X = np.asarray(X, dtype=float)
        return self._softmax(self._with_bias(X) @ self.W)

    def predict(self, X):
        """Return the most probable label from ``classes_`` for every row of ``X``."""
        best = np.argmax(self.predict_proba(X), axis=1)
        return self.classes_[best]

Загрузка датасета классификации

In [5]:
# Read creditcard.csv and keep a reproducible random subsample of 10,000 rows.
df_fraud = pd.read_csv('creditcard.csv').sample(n=10000, random_state=42)

# 'Class' is the target; every remaining column is used as a feature.
yc = df_fraud['Class']
Xc = df_fraud.drop(columns='Class')

# 80/20 split, stratified on the target so both parts keep the class ratio.
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc,
    yc,
    test_size=0.2,
    stratify=yc,
    random_state=42,
)

Запуск классификации sklearn

In [6]:
# Baseline: scikit-learn logistic regression on the raw, unscaled features.
sk_log = LogisticRegression(max_iter=10000).fit(Xc_train, yc_train)
yc_pred_sk = sk_log.predict(Xc_test)

Запуск собственной классификации

In [7]:
# Baseline: the hand-written softmax classifier on the same raw features.
# fit() returns the estimator itself, so construction and training can be chained.
my_log = SoftmaxRegression(lr=0.05, n_iter=10000).fit(Xc_train, yc_train)
yc_pred_my = my_log.predict(Xc_test)

Метрики

In [8]:

# Baseline test-set metrics for both classifiers, one line per model.
print(
    f"Sklearn accuracy={accuracy_score(yc_test, yc_pred_sk):.4f}, "
    f"f1={f1_score(yc_test, yc_pred_sk, average='weighted'):.4f}, "
    f"recall={recall_score(yc_test, yc_pred_sk, pos_label=1)}"
)
print(
    f"Custom  accuracy={accuracy_score(yc_test, yc_pred_my):.4f}, "
    f"f1={f1_score(yc_test, yc_pred_my, average='weighted'):.4f}, "
    f"recall={recall_score(yc_test, yc_pred_my, pos_label=1)}"
)

Sklearn accuracy=0.9980, f1=0.9980, recall=0.3333333333333333
Custom  accuracy=0.9985, f1=0.9978, recall=0.0


Регрессия

Загрузка датасета

In [9]:
# Read crop_yield.csv and keep a reproducible random subsample of 10,000 rows.
df_crop = pd.read_csv('crop_yield.csv').sample(n=10000, random_state=42)

# Target column; everything else is a feature.
yr = df_crop['Yield_tons_per_hectare']

# One-hot encode the features; drop_first=True omits one level per encoded column.
Xr = pd.get_dummies(
    df_crop.drop(columns='Yield_tons_per_hectare'),
    drop_first=True,
)

# 80/20 train/test split with a fixed seed.
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr,
    yr,
    test_size=0.2,
    random_state=42,
)


Запуск регрессии sklearn

In [10]:
# Baseline: scikit-learn linear regression on the unscaled features.
sk_lin = LinearRegression().fit(Xr_train, yr_train)
yr_pred_sk = sk_lin.predict(Xr_test)

Запуск собственной регрессии

In [11]:
# Baseline: hand-written gradient-descent regression on the unscaled features.
# fit() returns the estimator itself, so construction and training can be chained.
my_lin = MyLinearRegression(lr=1e-6, n_iter=2000).fit(Xr_train, yr_train)
yr_pred_my = my_lin.predict(Xr_test)

Метрики

In [12]:

# Baseline test-set RMSE and R2 for both regressors, one line per model.
print(
    f"SkLearn RMSE={np.sqrt(mean_squared_error(yr_test, yr_pred_sk)):.4f}, "
    f"R2={r2_score(yr_test, yr_pred_sk):.4f}"
)
print(
    f"Custom  RMSE={np.sqrt(mean_squared_error(yr_test, yr_pred_my)):.4f}, "
    f"R2={r2_score(yr_test, yr_pred_my):.4f}"
)

SkLearn RMSE=0.4922, R2=0.9160
Custom  RMSE=1.1651, R2=0.5293


Теперь применим улучшения, полученные на этапе анализа данных

Классификация

Уменьшение дисбаланса классов

In [13]:
# Oversample only the training split; the test split is left untouched.
# NOTE(review): resampling runs on the unscaled features here and scaling
# happens in the next cell - confirm this order is intended.
sm = SMOTE(random_state=42)
Xc_train_bal, yc_train_bal = sm.fit_resample(
    X=Xc_train,
    y=yc_train,
)

Скейлинг

In [14]:
# Learn the scaling statistics from the balanced training data only, then
# apply the same transformation to both the training and the test features.
scaler = StandardScaler().fit(Xc_train_bal)
Xc_train_scaled = scaler.transform(Xc_train_bal)
Xc_test_scaled = scaler.transform(Xc_test)

sklearn

In [15]:
# scikit-learn logistic regression on the balanced, standardized training data.
sk_log = LogisticRegression(max_iter=10000).fit(Xc_train_scaled, yc_train_bal)
yc_pred_sk = sk_log.predict(Xc_test_scaled)

Оценка качества sklearn

In [16]:
# Test-set metrics for the scikit-learn classifier, one per output line.
print(
    f"accuracy={accuracy_score(yc_test, yc_pred_sk):.4f}",
    f"f1={f1_score(yc_test, yc_pred_sk, average='weighted'):.4f}",
    f"recall={recall_score(yc_test, yc_pred_sk, pos_label=1)}",
    sep="\n",
)


accuracy=0.9885
f1=0.9929
recall=0.6666666666666666


Собственная реализация

In [17]:
# Hand-written softmax classifier on the balanced, standardized training data.
# Bound to `my_log` (the name used for the baseline softmax model) instead of
# `my_lin`, so a classifier is not stored under the linear-regression name.
my_log = SoftmaxRegression(lr=0.0001, n_iter=2000)
my_log.fit(Xc_train_scaled, yc_train_bal)
yc_pred_my = my_log.predict(Xc_test_scaled)

Оценка качества собственной реализации

In [18]:
# Test-set metrics for the hand-written classifier, one per output line.
print(
    f"accuracy={accuracy_score(yc_test, yc_pred_my):.4f}",
    f"f1={f1_score(yc_test, yc_pred_my, average='weighted'):.4f}",
    f"recall={recall_score(yc_test, yc_pred_my, pos_label=1)}",
    sep="\n",
)

accuracy=0.9975
f1=0.9981
recall=1.0


Регрессия

Скейлинг

In [19]:
# Learn the scaling statistics from the training features only, then apply
# the same transformation to both the training and the test features.
scaler = StandardScaler().fit(Xr_train)
Xr_train_scaled = scaler.transform(Xr_train)
Xr_test_scaled = scaler.transform(Xr_test)

Расширение данных полиномами

In [20]:
# Degree-2 polynomial expansion of the standardized features. No constant
# column is generated (include_bias=False).
poly = PolynomialFeatures(
    degree=2,
    interaction_only=False,
    include_bias=False,
).fit(Xr_train_scaled)
Xr_train_poly = poly.transform(Xr_train_scaled)
Xr_test_poly = poly.transform(Xr_test_scaled)

sklearn

In [21]:
# scikit-learn linear regression on the polynomial features.
sk_lin = LinearRegression().fit(Xr_train_poly, yr_train)
yr_pred_sk = sk_lin.predict(Xr_test_poly)

Оценка качества sklearn

In [22]:
# Test-set RMSE and R2 for the scikit-learn regressor, one per output line.
print(
    f"RMSE={np.sqrt(mean_squared_error(yr_test, yr_pred_sk)):.4f}",
    f"R2={r2_score(yr_test, yr_pred_sk):.4f}",
    sep="\n",
)

RMSE=0.4985
R2=0.9138


Собственная реализация

In [23]:
# Hand-written gradient-descent regression on the polynomial features.
# fit() returns the estimator itself, so construction and training can be chained.
my_lin = MyLinearRegression(lr=1e-3, n_iter=2000).fit(Xr_train_poly, yr_train)
yr_pred_my = my_lin.predict(Xr_test_poly)

Оценка результата

In [24]:

# Test-set RMSE and R2 for the hand-written regressor, one per output line.
print(
    f"RMSE={np.sqrt(mean_squared_error(yr_test, yr_pred_my)):.4f}",
    f"R2={r2_score(yr_test, yr_pred_my):.4f}",
    sep="\n",
)

RMSE=0.6515
R2=0.8528
