In [1]:
import functools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import deque
from ULogicalModels import UDecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
class URandomForestClassifier:
    """Random forest assembled from ``UDecisionTreeClassifier`` trees.

    Each tree is trained on a random subset of the training rows (drawn
    without replacement) restricted to a random subset of the feature
    columns. ``predict`` combines the trees by majority vote.

    Parameters
    ----------
    n_estimators : int, default=10
        Number of trees to build.
    criterion : str, default='gini'
        Forwarded as the first positional argument of every
        ``UDecisionTreeClassifier``.
    max_depth : int, default=1
        Forwarded as the second positional argument of every
        ``UDecisionTreeClassifier``.
    max_features : {'sqrt', 'log2'}, number or None, default='sqrt'
        How many feature columns each tree sees: ``int(sqrt(m))``,
        ``int(log2(m))``, all ``m`` columns for ``None``, or ``int(value)``
        for a number. The result is clamped to ``[1, m]``.
    subset_size : float, default=0.7
        Fraction of the training rows handed to each tree. The resulting
        row count is clamped to ``[1, n]``.
    random_state : int, numpy.random.RandomState or None, default=None
        Seed (or generator) for the row/column sampling. With ``None`` the
        global ``numpy.random`` state is used, so ``numpy.random.seed`` keeps
        working as before.

    Attributes
    ----------
    ensemble_ : list of (tree, feature_indexes)
        Fitted trees paired with the column indexes they were trained on.
    X_train, y_train : numpy.ndarray
        Array copies of the data passed to ``fit``.
    """

    def __init__(self,
                 n_estimators=10,
                 criterion='gini',
                 max_depth=1,
                 max_features='sqrt',
                 subset_size=0.7,
                 random_state=None):
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.max_features = max_features
        self.subset_size = subset_size
        self.random_state = random_state

    def _rows_per_tree(self, n):
        """Return how many of the ``n`` training rows each tree receives."""
        wanted = int(abs(min(n * self.subset_size, n)))
        # Truncation can yield 0 on tiny data sets and abs() can exceed n for
        # negative fractions; both would break sampling without replacement.
        return min(max(wanted, 1), n)

    def _features_per_tree(self, m):
        """Return how many of the ``m`` feature columns each tree receives."""
        max_features = self.max_features
        if max_features is None:
            wanted = m
        elif isinstance(max_features, str):
            if max_features == 'sqrt':
                wanted = int(np.sqrt(m))
            elif max_features == 'log2':
                wanted = int(np.log2(m))
            else:
                raise ValueError(
                    "max_features must be 'sqrt', 'log2', a number or None, "
                    "got %r" % (max_features,))
        else:
            wanted = int(max_features)
        # Never ask for zero columns or for more columns than exist.
        return min(max(wanted, 1), m)

    def _sampler(self):
        """Return the object whose ``choice`` method draws the random subsets."""
        if self.random_state is None:
            # Keep using the global NumPy state, exactly like the unseeded case.
            return np.random
        if isinstance(self.random_state, np.random.RandomState):
            return self.random_state
        return np.random.RandomState(self.random_state)

    def fit(self, X, y):
        """Train ``n_estimators`` trees on random row/column subsets of ``X``.

        Parameters
        ----------
        X : array-like of shape (n, m)
            Training features.
        y : array-like of length n
            Training labels.

        Returns
        -------
        self : URandomForestClassifier
            The fitted estimator, to allow call chaining.

        Raises
        ------
        ValueError
            If ``X`` is not a non-empty 2-D array, ``y`` does not have one
            label per row, or ``n_estimators`` is smaller than 1.
        """
        X_arr, y_arr = np.array(X), np.array(y)
        if X_arr.ndim != 2:
            raise ValueError("X must be 2-dimensional, got %d dimension(s)" % X_arr.ndim)
        n, m = X_arr.shape
        if n == 0 or m == 0:
            raise ValueError("X must contain at least one row and one column")
        if y_arr.ndim < 1 or y_arr.shape[0] != n:
            raise ValueError("y must contain exactly one label per row of X")
        if self.n_estimators < 1:
            raise ValueError("n_estimators must be at least 1")

        self.X_train, self.y_train = X_arr, y_arr
        self.ensemble_ = []
        sampler = self._sampler()
        n_rows = self._rows_per_tree(n)
        n_cols = self._features_per_tree(m)
        while len(self.ensemble_) < self.n_estimators:
            # Rows first, then columns: the draw order matters for seeded runs.
            subset_indexes = sampler.choice(n, size=n_rows, replace=False)
            feature_indexes = sampler.choice(m, size=n_cols, replace=False)
            sub_X_train = X_arr[subset_indexes][:, feature_indexes]
            sub_y_train = y_arr[subset_indexes]
            cur_tree = UDecisionTreeClassifier(self.criterion, self.max_depth)
            cur_tree.fit(sub_X_train, sub_y_train)
            # Remember the columns so predict() can feed the tree the same view.
            self.ensemble_.append((cur_tree, feature_indexes))
        return self

    @staticmethod
    def _passEnsemble(cur_tree, X, feature_indexes):
        """Predict with one tree using only the columns it was trained on."""
        return cur_tree.predict(X[:, feature_indexes])

    def predict(self, X):
        """Predict a label for every row of ``X`` by majority vote of the trees.

        Ties are resolved in favour of the label that appears first when the
        trees are scanned in ensemble order.

        Parameters
        ----------
        X : array-like of shape (k, m)
            Rows to classify; the column layout must match the training data.

        Returns
        -------
        numpy.ndarray of length k
            The winning label for each row.

        Raises
        ------
        AttributeError
            If the forest has no fitted trees (``fit`` was not called).
        """
        if not getattr(self, 'ensemble_', None):
            raise AttributeError(
                "URandomForestClassifier is not fitted yet; call fit() first")
        X = np.array(X)
        # Shape (n_trees, k): one row of predictions per tree.
        trees_result = np.array([self._passEnsemble(cur_tree, X, feature_indexes)
                                 for cur_tree, feature_indexes in self.ensemble_])
        object_result = []
        for tree_res in trees_result.T:
            votes = {}
            for label in tree_res:
                votes[label] = votes.get(label, 0) + 1
            # max() keeps the first maximal key, i.e. dict insertion order.
            object_result.append(max(votes, key=votes.get))
        return np.array(object_result)

In [3]:
def getModelError(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and report its test-split quality.

    The model is fitted in place. The returned value is whatever
    ``classification_report`` produces for the true test labels and the
    model's predictions on ``X_test``.
    """
    # fit() is called as a statement: its return value is not relied upon.
    model.fit(X_train, y_train)
    return classification_report(y_test, model.predict(X_test))
    
def compareModels(model, uModel, X, y, is_need_scale, test_size=0.3, random_state=101):
    """Evaluate two classifiers on one shared train/test split and print both reports.

    Parameters
    ----------
    model : estimator
        Reference model; its report is printed first.
    uModel : estimator
        Hand-written model; its report is printed second.
    X, y : array-like
        Features and labels to split.
    is_need_scale : bool
        When true, standardize the features before fitting the models.
    test_size : float, default=0.3
        Passed to ``train_test_split``.
    random_state : int, default=101
        Passed to ``train_test_split`` so both models see the same split.

    Returns
    -------
    None
        The two reports are printed, not returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    if is_need_scale:
        # The scaler is fitted on the training part only and then applied to
        # both parts, so test data does not influence the scaling statistics.
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
    model_error = getModelError(model, X_train, X_test, y_train, y_test)
    uModel_error = getModelError(uModel, X_train, X_test, y_train, y_test)
    print('Ошибка на пакетной модели\n', model_error)
    print('Ошибка на реализованной модели\n', uModel_error)

In [4]:
# Wines: the label is the 'type' column; 'type' and 'quality' are excluded from the features.
# NOTE: this cell rebinds data_1, X and y - run only the loader for the dataset being evaluated.
data_1 = pd.read_csv('DATA/wine_fraud.csv')
y = data_1['type']
X = data_1.drop(columns=['type', 'quality'])

In [8]:
# Hearing test study: the label is the 'test_result' column, every other column is a feature.
# NOTE: this cell rebinds data_1, X and y - run only the loader for the dataset being evaluated.
data_1 = pd.read_csv('DATA/hearing_test.csv')
y = data_1['test_result']
X = data_1.drop(columns='test_result')

In [None]:
# Irises: the label is the 'species' column, every other column is a feature.
# NOTE: this cell rebinds data_1, X and y - run only the loader for the dataset being evaluated.
data_1 = pd.read_csv('DATA/iris.csv')
y = data_1['species']
X = data_1.drop(columns='species')

In [5]:
# Reference (scikit-learn) forest and the hand-written forest, configured alike:
# 10 trees, depth limit 4.
model = RandomForestClassifier(n_estimators=10, max_depth=4)
uModel = URandomForestClassifier(n_estimators=10, max_depth=4)

In [6]:
# Neither forest is given a seed, so both draw from NumPy's global random state.
# Seeding it right before fitting makes the printed reports repeatable.
np.random.seed(101)
compareModels(model, uModel, X, y, is_need_scale=False)

Ошибка на пакетной модели
               precision    recall  f1-score   support

         red       1.00      0.95      0.97       499
       white       0.98      1.00      0.99      1451

    accuracy                           0.99      1950
   macro avg       0.99      0.98      0.98      1950
weighted avg       0.99      0.99      0.99      1950

Ошибка на реализованной модели
               precision    recall  f1-score   support

         red       0.99      0.92      0.95       499
       white       0.97      1.00      0.98      1451

    accuracy                           0.98      1950
   macro avg       0.98      0.96      0.97      1950
weighted avg       0.98      0.98      0.98      1950

