# Лабораторная работа №2

Подключим библиотеки:

In [19]:
import pandas as pd

import numpy as np
from numpy import log, dot, e
from numpy.random import rand

from collections import Counter

from sklearn import metrics  
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

Подключим данные (о развити гипогликемии у группы лиц):

In [20]:
dataframe = pd.read_csv('/content/sample_data/Hypoglycemia.csv', header = None)
dataframe.columns = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','Pedigree','Age','Outcome']


Выведем их:

In [21]:
print(dataframe.head())
print("Size of data:",dataframe.shape)
print("Structure:\n",dataframe.dtypes) 

   Pregnancies  Glucose  BloodPressure  ...  Pedigree  Age  Outcome
0            6      148             72  ...     0.627   50        1
1            1       85             66  ...     0.351   31        0
2            8      183             64  ...     0.672   32        1
3            1       89             66  ...     0.167   21        0
4            0      137             40  ...     2.288   33        1

[5 rows x 9 columns]
Size of data: (768, 9)
Structure:
 Pregnancies        int64
Glucose            int64
BloodPressure      int64
SkinThickness      int64
Insulin            int64
BMI              float64
Pedigree         float64
Age                int64
Outcome            int64
dtype: object


Переведём в удобный формат:

In [22]:
x = dataframe.iloc[:, :-1].values
y = dataframe.iloc[:, 8].values
trainingX, testX, trainingY, testY = train_test_split(x, y, test_size = 0.2,  random_state = 0)
transform = StandardScaler()
trainingX = transform.fit_transform(trainingX)
testX = transform.fit_transform(testX)

## Логистическая регрессия

In [23]:
class LogisticRegressionAlgorithm:
    def sigmoid(self, x): return 1 / (1 + e**(-x))
    
    def cost_function(self, x, y, weights):                 
        z = dot(x, weights)
        predict1 = y * log(self.sigmoid(z))
        predict0 = (1 - y) * log(1 - self.sigmoid(z))
        return -sum(predict1 + predict0) / len(x)
    
    def fit(self, x, y, epochs=25, lr=0.05):        
        loss = []
        weights = rand(x.shape[1])
        N = len(x)
                 
        for _ in range(epochs):        
            hatY = self.sigmoid(dot(x, weights))
            weights -= lr * dot(x.T,  hatY - y) / N            
            loss.append(self.cost_function(x, y, weights)) 
            
        self.weights = weights
        self.loss = loss
    
    def predict(self, x):        
        z = dot(x, self.weights)
        return [1 if i > 0.5 else 0 for i in self.sigmoid(z)]

In [24]:
params = [(0.1, 50), (0.1, 100), (0.1, 1000), (0.1, 10000),
              (0.01, 50), (0.01, 100), (0.01, 1000), (0.01, 10000),
              (0.001, 50), (0.001, 100), (0.001, 1000), (0.001, 10000),
              (0.0001, 50), (0.0001, 100), (0.0001, 1000), (0.0001, 10000),
              (0.00001, 50), (0.00001, 100), (0.00001, 1000), (0.00001, 10000)
]

trainMax = -1
testMax = -1
learningRateMax = 0
epochsMax = 0

for i in params:
    learningRate, epochs = i
    regression = LogisticRegressionAlgorithm()
    regression.fit(trainingX, trainingY, epochs, learningRate)
    lastTestY = regression.predict(testX)
    trainingLastY = regression.predict(trainingX)
    accuracyTraining = accuracy_score(trainingY, trainingLastY)
    accuracyTest = accuracy_score(testY, lastTestY)
    if (trainMax < accuracyTraining and testMax < accuracyTest):
      trainMax = accuracyTraining
      testMax = accuracyTest
      learningRateMax = learningRate
      epochsMax = epochs
print("Learning rate:", learningRate, "epochs:", epochs, "\n", "Max training accuracy:", trainMax, "Max test accuracy:", testMax)

Learning rate: 1e-05 epochs: 10000 
 Max training accuracy: 0.7345276872964169 Max test accuracy: 0.7402597402597403


In [25]:
regression = LogisticRegressionAlgorithm()
regression.fit(trainingX, trainingY, epochs = 10000, lr = 0.00001)
lastY = regression.predict(testX)

Рассмотрим метрики классификатора и матрицу ошибок:

In [26]:
print("accuracy:",accuracy_score(testY, lastY))
print("precision:",precision_score(testY, lastY))
print("recall:",recall_score(testY, lastY))
print("f1_score", f1_score(testY, lastY))
print("confusion matrix:\n", confusion_matrix(testY, lastY))

accuracy: 0.6428571428571429
precision: 0.43548387096774194
recall: 0.574468085106383
f1_score 0.4954128440366973
confusion matrix:
 [[72 35]
 [20 27]]


## Дерево решений

In [27]:
class Node:
    def __init__(self, feature = None, threshold = None, data_left = None, data_right = None, gain = None, value = None):
        self.feature = feature
        self.threshold = threshold
        self.data_left = data_left
        self.data_right = data_right
        self.gain = gain
        self.value = value

class DecisionTreeAlgorithm:
    def __init__(self, min_samples_split  = 2, max_depth = 5):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.root = None
        
    @staticmethod
    def _entropy(s):
        counts = np.bincount(np.array(s, dtype = np.int64))
        percentages = counts / len(s)
        entropy = 0
        for pct in percentages:
            if pct > 0:
                entropy += pct * np.log2(pct)
        return -entropy
    
    def _information_gain(self, parent, left_child, right_child):
        num_left = len(left_child) / len(parent)
        num_right = len(right_child) / len(parent)
        return self._entropy(parent) - (num_left * self._entropy(left_child) + num_right * self._entropy(right_child))
  
    def _best_split(self, X, y):
        best_split = {}
        best_info_gain = -1
        n_rows, n_cols = X.shape
 
        for f_idx in range(n_cols):
            X_curr = X[:, f_idx]

            for threshold in np.unique(X_curr):

                df = np.concatenate((X, y.reshape(1, -1).T), axis = 1)
                df_left = np.array([row for row in df if row[f_idx] <= threshold])
                df_right = np.array([row for row in df if row[f_idx] > threshold])

                if len(df_left) > 0 and len(df_right) > 0:
                    y = df[:, -1]
                    y_left = df_left[:, -1]
                    y_right = df_right[:, -1]
                    gain = self._information_gain(y, y_left, y_right)
                    if gain > best_info_gain:
                        best_split = {
                            'feature_index': f_idx,
                            'threshold': threshold,
                            'df_left': df_left,
                            'df_right': df_right,
                            'gain': gain
                        }
                        best_info_gain = gain
        return best_split
    
    def _build(self, X, y, depth = 0):
        n_rows, n_cols = X.shape
        
        if n_rows >= self.min_samples_split and depth <= self.max_depth:
            best = self._best_split(X, y)
            if best['gain'] > 0:
                left = self._build(
                    X = best['df_left'][:, :-1], 
                    y = best['df_left'][:, -1], 
                    depth = depth + 1
                )
                right = self._build(
                    X = best['df_right'][:, :-1], 
                    y = best['df_right'][:, -1], 
                    depth = depth + 1
                )
                return Node(
                    feature = best['feature_index'], 
                    threshold = best['threshold'], 
                    data_left = left, 
                    data_right = right, 
                    gain = best['gain']
                )
        return Node(
            value = Counter(y).most_common(1)[0][0]
        )
    
    def fit(self, X, y):
        self.root = self._build(X, y)
        
    def _predict(self, x, tree):
        if tree.value != None:
            return tree.value
        feature_value = x[tree.feature]

        if feature_value <= tree.threshold:
            return self._predict(x = x, tree = tree.data_left)
        
        if feature_value > tree.threshold:
            return self._predict(x = x, tree = tree.data_right)
        
    def predict(self, X):
        return [self._predict(x, self.root) for x in X]

In [28]:
params = [(2, 1), (2, 2), (2, 3), (2, 10), (3, 15), 
              (3, 1), (3, 2), (3, 3), (3, 10), (3, 15), 
              (4, 1), (4, 2), (4, 3), (4, 10), (4, 15),
              (5, 1), (5, 2), (5, 3), (5, 10), (5, 15),
              (10, 1), (10, 2), (10, 3), (10, 10), (10, 15)
              ]

trainMax = -1
testMax = -1
sampleSplitMax = 0
depthMax = 0

for i in params:
    sampleSplit, depth = i
    tree = DecisionTreeAlgorithm(min_samples_split = sampleSplit, max_depth = depth)
    tree.fit(trainingX, trainingY)
    lastTestY = tree.predict(testX)
    trainingLastY = tree.predict(trainingX)
    accuracyTraining = accuracy_score(trainingY, trainingLastY)
    accuracyTest = accuracy_score(testY, lastTestY)
    if (trainMax < accuracyTraining and testMax < accuracyTest):
      trainMax = accuracyTraining
      testMax = accuracyTest
      sampleSplitMax = sampleSplit
      depthMax = depth 
print("Minimum sample splits:", sampleSplitMax, "Max depth:", depthMax, "\n", "Max training accuracy:", trainMax, "Max test accuracy:", testMax)

Minimum sample splits: 2 Max depth: 1 
 Max training accuracy: 0.7638436482084691 Max test accuracy: 0.7597402597402597


In [29]:
tree = DecisionTreeAlgorithm(min_samples_split = 2, max_depth = 1)
tree.fit(trainingX, trainingY)
lastY = tree.predict(testX)

Рассмотрим метрики классификатора и матрицу ошибок:

In [30]:
print("accuracy:",accuracy_score(testY, lastY))
print("precision:",precision_score(testY, lastY))
print("recall:",recall_score(testY, lastY))
print("f1_score", f1_score(testY, lastY))
print("confusion matrix:\n", confusion_matrix(testY, lastY))

accuracy: 0.7597402597402597
precision: 0.625
recall: 0.5319148936170213
f1_score 0.5747126436781609
confusion matrix:
 [[92 15]
 [22 25]]


## Random Forest

In [31]:
class RandomForestAlgorithm:
    def __init__(self, num_trees=25, min_samples_split=2, max_depth=5):
        self.num_trees = num_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.decision_trees = []
        
    @staticmethod
    def _sample(X, y):
        n_rows, n_cols = X.shape
        samples = np.random.choice(a=n_rows, size=n_rows, replace=True)
        return X[samples], y[samples]
        
    def fit(self, X, y):
        if len(self.decision_trees) > 0:
            self.decision_trees = []
        num_built = 0
        while num_built < self.num_trees:
            try:
                clf =DecisionTreeAlgorithm(
                    min_samples_split=self.min_samples_split,
                    max_depth=self.max_depth
                )
                _X, _y = self._sample(X, y)
                clf.fit(_X, _y)
                self.decision_trees.append(clf)
                num_built += 1
            except Exception as e:
                continue
    
    def predict(self, X):
        y = []
        for tree in self.decision_trees:
            y.append(tree.predict(X))
        y = np.swapaxes(a=y, axis1=0, axis2=1)
        predictions = []
        for preds in y:
            counter = Counter(preds)
            predictions.append(counter.most_common(1)[0][0])
        return predictions

In [32]:
params = [(2, 2, 1), (5, 2, 1), (10, 2, 1), (20, 2, 1), (25, 2, 1)]

trainMax = -1
testMax = -1
sampleSplitMax = 0
depthMax = 0
treeCountMax = 0

for i in params:
    treeCount, sampleSplit, depth = i
    forest = RandomForestAlgorithm(num_trees = treeCount, min_samples_split = sampleSplit, max_depth = depth)
    forest.fit(trainingX, trainingY)
    lastTestY = forest.predict(testX)
    trainingLastY = forest.predict(trainingX)
    accuracyTraining = accuracy_score(trainingY, trainingLastY)
    accuracyTest = accuracy_score(testY, lastTestY)
    if (trainMax < accuracyTraining and testMax < accuracyTest):
      trainMax = accuracyTraining
      testMax = accuracyTest
      sampleSplitMax = sampleSplit
      depthMax = depth
      treeCountMax = treeCount
print("Tree count:", treeCountMax, "Minimum sample splits:", sampleSplitMax, "Max depth:", depthMax, "\n", "Max training accuracy:", trainMax, "Max test accuracy:", testMax)

Tree count: 20 Minimum sample splits: 2 Max depth: 1 
 Max training accuracy: 0.7654723127035831 Max test accuracy: 0.7792207792207793


In [33]:
forest = RandomForestAlgorithm(num_trees = 10, min_samples_split = 2, max_depth = 1)
forest.fit(trainingX, trainingY)
lastY = tree.predict(testX)

Рассмотрим метрики классификатора и матрицу ошибок:

In [34]:
print("accuracy:",accuracy_score(testY, lastY))
print("precision:",precision_score(testY, lastY))
print("recall:",recall_score(testY, lastY))
print("f1_score", f1_score(testY, lastY))
print("confusion matrix:\n", confusion_matrix(testY, lastY))

accuracy: 0.7597402597402597
precision: 0.625
recall: 0.5319148936170213
f1_score 0.5747126436781609
confusion matrix:
 [[92 15]
 [22 25]]


## Сравнение с реализацией sklearn

### Логистическая регрессия

In [35]:
regressionSK = LogisticRegression().fit(trainingX, trainingY)
lastY = regressionSK.predict(testX)

Метрики классификатора и матрица ошибок:

In [36]:
print("accuracy:",accuracy_score(testY, lastY))
print("precision:",precision_score(testY, lastY))
print("recall:",recall_score(testY, lastY))
print("f1_score", f1_score(testY, lastY))
print("confusion matrix:\n", confusion_matrix(testY, lastY))

accuracy: 0.7987012987012987
precision: 0.6904761904761905
recall: 0.6170212765957447
f1_score 0.651685393258427
confusion matrix:
 [[94 13]
 [18 29]]


Эта модель достигает большей точности, чем моя, так как в ней оптимизирован параметр f1_score.

### Дерево решений

In [37]:
treeSK = DecisionTreeClassifier()
treeSK.fit(trainingX, trainingY)
lastY = treeSK.predict(testX)

Метрики классификатора и матрица ошибок:

In [38]:
print("accuracy:",accuracy_score(testY, lastY))
print("precision:",precision_score(testY, lastY))
print("recall:",recall_score(testY, lastY))
print("f1_score", f1_score(testY, lastY))
print("confusion matrix:\n", confusion_matrix(testY, lastY))

accuracy: 0.7012987012987013
precision: 0.5087719298245614
recall: 0.6170212765957447
f1_score 0.5576923076923077
confusion matrix:
 [[79 28]
 [18 29]]


Моя модель достигла большей точности, но меньшей полноты.

### Random Forest

In [39]:
forestSK = RandomForestClassifier()
forestSK.fit(trainingX, trainingY)
lastY = forestSK.predict(testX)

Метрики классификатора и матрица ошибок:

In [40]:
print("accuracy:",accuracy_score(testY, lastY))
print("precision:",precision_score(testY, lastY))
print("recall:",recall_score(testY, lastY))
print("f1_score", f1_score(testY, lastY))
print("confusion matrix:\n", confusion_matrix(testY, lastY))

accuracy: 0.8051948051948052
precision: 0.6666666666666666
recall: 0.723404255319149
f1_score 0.6938775510204082
confusion matrix:
 [[90 17]
 [13 34]]


Здесь модель sklearn оказалась сильно лучше по всем метрикам, кроме вероятностной точности.