In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
import category_encoders as ce
from sklearn.feature_selection import VarianceThreshold

Структура ноды

In [2]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

Классификатор

In [3]:
class DecisionTreeClassifier:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def _gini(self, y):
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return 1.0 - np.sum(probs ** 2)

    def _best_split(self, X, y):
        n_samples, n_features = X.shape
        best_feature, best_thresh = None, None
        best_gain = 0
        parent_impurity = self._gini(y)

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for t in thresholds:
                left_mask = X[:, feature] <= t
                right_mask = X[:, feature] > t

                if np.sum(left_mask) < self.min_samples_split or np.sum(right_mask) < self.min_samples_split:
                    continue

                left_y, right_y = y[left_mask], y[right_mask]
                impurity = (len(left_y) * self._gini(left_y) + len(right_y) * self._gini(right_y)) / n_samples
                gain = parent_impurity - impurity

                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_thresh = t

        return best_feature, best_thresh

    def _build_tree(self, X, y, depth=0):
        if len(np.unique(y)) == 1 or (self.max_depth and depth >= self.max_depth):
            values, counts = np.unique(y, return_counts=True)
            return Node(value=values[np.argmax(counts)])

        feature, threshold = self._best_split(X, y)
        if feature is None:
            values, counts = np.unique(y, return_counts=True)
            return Node(value=values[np.argmax(counts)])

        left_mask = X[:, feature] <= threshold
        right_mask = X[:, feature] > threshold
        left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right = self._build_tree(X[right_mask], y[right_mask], depth + 1)
        return Node(feature, threshold, left, right)

    def fit(self, X, y):
        self.tree_ = self._build_tree(np.array(X), np.array(y))
        return self

    def _predict_one(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self._predict_one(x, node.left)
        else:
            return self._predict_one(x, node.right)

    def predict(self, X):
        return np.array([self._predict_one(x, self.tree_) for x in np.array(X)])

Регрессор

In [4]:
class DecisionTreeRegressor:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def _mse(self, y):
        return np.mean((y - np.mean(y)) ** 2)

    def _best_split(self, X, y):
        n_samples, n_features = X.shape
        best_feature, best_thresh = None, None
        best_mse = np.inf

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for t in thresholds:
                left_mask = X[:, feature] <= t
                right_mask = X[:, feature] > t

                if np.sum(left_mask) < self.min_samples_split or np.sum(right_mask) < self.min_samples_split:
                    continue

                left_y, right_y = y[left_mask], y[right_mask]
                mse = (len(left_y) * self._mse(left_y) + len(right_y) * self._mse(right_y)) / n_samples

                if mse < best_mse:
                    best_mse = mse
                    best_feature = feature
                    best_thresh = t

        return best_feature, best_thresh

    def _build_tree(self, X, y, depth=0):
        if (self.max_depth and depth >= self.max_depth) or len(y) < self.min_samples_split:
            return Node(value=np.mean(y))

        feature, threshold = self._best_split(X, y)
        if feature is None:
            return Node(value=np.mean(y))

        left_mask = X[:, feature] <= threshold
        right_mask = X[:, feature] > threshold
        left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right = self._build_tree(X[right_mask], y[right_mask], depth + 1)
        return Node(feature, threshold, left, right)

    def fit(self, X, y):
        self.tree_ = self._build_tree(np.array(X), np.array(y))
        return self

    def _predict_one(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self._predict_one(x, node.left)
        else:
            return self._predict_one(x, node.right)

    def predict(self, X):
        return np.array([self._predict_one(x, self.tree_) for x in np.array(X)])

Классификатор случайный лес

In [5]:
class RandomForestClassifierCustom:
    def __init__(self, n_estimators=10, max_depth=None, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, n_samples, replace=True)
        return X[idxs], y[idxs]

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_estimators):
            X_sample, y_sample = self._bootstrap_sample(np.array(X), np.array(y))
            tree = DecisionTreeClassifier(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)
        return self

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        y_pred = []
        for i in range(X.shape[0]):
            counts = Counter(tree_preds[:, i])
            y_pred.append(counts.most_common(1)[0][0])
        return np.array(y_pred)

Регрессор случайный лес

In [6]:
class RandomForestRegressorCustom:
    def __init__(self, n_estimators=10, max_depth=None, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, n_samples, replace=True)
        return X[idxs], y[idxs]

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_estimators):
            X_sample, y_sample = self._bootstrap_sample(np.array(X), np.array(y))
            tree = DecisionTreeRegressor(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)
        return self

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(tree_preds, axis=0)

Датасет классификации

In [7]:
df_fraud = pd.read_csv('creditcard.csv')

df_fraud = df_fraud.sample(n=1000, random_state=2352)

Xc = df_fraud.drop('Class', axis=1)
yc = df_fraud['Class']

Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc, yc, test_size=0.2, random_state=42, stratify=yc
)

sklearn

In [8]:
sk_forest_c = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)
sk_forest_c.fit(Xc_train, yc_train)
yc_pred_sk = sk_forest_c.predict(Xc_test)

Собственная реализация

In [9]:
my_forest_c = RandomForestClassifierCustom(n_estimators=10, max_depth=3)
my_forest_c.fit(Xc_train, yc_train)
yc_pred_my = my_forest_c.predict(Xc_test)

Метрики

In [None]:

print(f"Sklearn accuracy={accuracy_score(yc_test, yc_pred_sk):.4f}, f1={f1_score(yc_test, yc_pred_sk, average='weighted'):.4f}, recall={recall_score(yc_test, yc_pred_sk, pos_label=1)}")
print(f"Custom  accuracy={accuracy_score(yc_test, yc_pred_my):.4f}, f1={f1_score(yc_test, yc_pred_my, average='weighted'):.4f}, recall={recall_score(yc_test, yc_pred_my, pos_label=1)}")

Датасет регресии

In [11]:
df_crop = pd.read_csv('crop_yield.csv')

df_crop = df_crop.sample(n = 10000,random_state=42)

Xr = df_crop.drop('Yield_tons_per_hectare', axis=1)
yr = df_crop['Yield_tons_per_hectare']

Xr = pd.get_dummies(Xr, drop_first=True)

Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr, yr, test_size=0.2, random_state=42
)


sklearn

In [12]:
sk_forest_r = RandomForestRegressor(n_estimators=10, max_depth=4, random_state=42)
sk_forest_r.fit(Xr_train, yr_train)
yr_pred_sk = sk_forest_r.predict(Xr_test)

собственная реализация

In [13]:
my_forest_r = RandomForestRegressorCustom(n_estimators=10, max_depth=4)
my_forest_r.fit(Xr_train, yr_train)
yr_pred_my= my_forest_r.predict(Xr_test)

Метрики

In [14]:

print(f"SkLearn RMSE={np.sqrt(mean_squared_error(yr_test, yr_pred_sk)):.4f}, R2={r2_score(yr_test, yr_pred_sk):.4f}")
print(f"Custom  RMSE={np.sqrt(mean_squared_error(yr_test, yr_pred_my)):.4f}, R2={r2_score(yr_test, yr_pred_my):.4f}")

SkLearn RMSE=0.5753, R2=0.8852
Custom  RMSE=0.5757, R2=0.8851


Теперь применим улучшения полученные на этапе анализа данных

Классификация

In [8]:
df_fraud = pd.read_csv('creditcard.csv')

Xc = df_fraud.drop('Class', axis=1)
yc = df_fraud['Class']

Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc, yc, test_size=0.2, random_state=42, stratify=yc
)

Балансировка классов

In [9]:
sm = SMOTE(random_state=42)
Xc_train_bal, yc_train_bal = sm.fit_resample(Xc_train, yc_train)

idx = np.random.choice(len(Xc_train_bal), size=1000, replace=False)

Xc_train_bal_small = Xc_train_bal.iloc[idx]
yc_train_bal_small = yc_train_bal.iloc[idx]

sklearn

In [10]:
sk_forest_c = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)
sk_forest_c.fit(Xc_train_bal_small, yc_train_bal_small)
yc_pred_sk = sk_forest_c.predict(Xc_test)

Метрики

In [11]:
print(f"accuracy={accuracy_score(yc_test, yc_pred_sk):.4f}")
print(f"f1={f1_score(yc_test, yc_pred_sk, average='weighted'):.4f}")
print(f"recall={recall_score(yc_test, yc_pred_sk, pos_label=1)}")


accuracy=0.9895
f1=0.9934
recall=0.8775510204081632


Собственная реализация

In [19]:
my_forest_c = RandomForestClassifierCustom(n_estimators=10, max_depth=3)
my_forest_c.fit(Xc_train_bal_small, yc_train_bal_small)
yc_pred_my = my_forest_c.predict(Xc_test)

Метрики

In [20]:
print(f"accuracy={accuracy_score(yc_test, yc_pred_my):.4f}")
print(f"f1={f1_score(yc_test, yc_pred_my, average='weighted'):.4f}")
print(f"recall={recall_score(yc_test, yc_pred_my, pos_label=1)}")

accuracy=0.9829
f1=0.9899
recall=0.8877551020408163


Регрессия

Считываем датасет

In [None]:
df_crop = pd.read_csv('crop_yield.csv')
df_crop = df_crop.sample(n=10000, random_state=42)

Xr = df_crop.drop('Yield_tons_per_hectare', axis=1)
yr = df_crop['Yield_tons_per_hectare']

Работа с призаками

In [None]:
cat_cols = Xr.select_dtypes(include=['object']).columns.tolist()

encoder = ce.TargetEncoder(cols=cat_cols)
Xr_encoded = encoder.fit_transform(Xr, yr)
Xr_encoded["Temp_Rain"] = Xr_encoded["Temperature_Celsius"] * Xr_encoded["Rainfall_mm"]
Xr_encoded["Rain_per_day"] = Xr_encoded["Rainfall_mm"] / (Xr_encoded["Days_to_Harvest"] + 1)

Фильтрация данных

In [None]:
selector = VarianceThreshold(threshold=0.01)
Xr_filtered = selector.fit_transform(Xr_encoded)
cols_after_vt = Xr_encoded.columns[selector.get_support()]

Xr_filtered = Xr_encoded[cols_after_vt]

corr = Xr_filtered.corrwith(yr).abs()
weak_features = corr[corr < 0.02].index

Xr_filtered = Xr_filtered.drop(columns=weak_features)

corr_matrix = Xr_filtered.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
Xr_filtered = Xr_filtered.drop(columns=to_drop)

In [22]:
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr_filtered, yr, test_size=0.2, random_state=42
)

Подбор гипер параметров

In [23]:
param_grid = {
    'n_estimators': [10],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

grid = GridSearchCV(
    RandomForestRegressor(random_state=42, n_jobs=-1),
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1
)

grid.fit(Xr_train, yr_train)
best_rf = grid.best_estimator_
yr_pred_best = best_rf.predict(Xr_test)

Метрики

In [24]:
print(f"RMSE={np.sqrt(mean_squared_error(yr_test, yr_pred_best)):.4f}")
print(f"R2={r2_score(yr_test, yr_pred_best):.4f}")

RMSE=0.5289
R2=0.9030


Собственная реализация

In [25]:
my_forest_r = RandomForestRegressorCustom(n_estimators=10, max_depth=4)
my_forest_r.fit(Xr_train, yr_train)
yr_pred_my= my_forest_r.predict(Xr_test)

Метрики

In [26]:
print(f"RMSE={np.sqrt(mean_squared_error(yr_test, yr_pred_my)):.4f}")
print(f"R2={r2_score(yr_test, yr_pred_my):.4f}")

RMSE=0.5750
R2=0.8854
