# Машинное обучение. Лабораторная работа №2

### Эссаулов Андрей. М80-407Б-18 (307Б)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset from a CSV file located in the working directory.
# Later cells rely on this frame having a 'quality' column.
# NOTE(review): presumably the red-wine quality table — confirm the file's origin.
data = pd.read_csv('winequality-red.csv')

## Настроим набор данных

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_random_state
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [4]:
# Feature matrix: every column except the label column 'quality'.
# Label vector: the 'quality' column itself.
# Both are materialised as float64 NumPy arrays.
feature_frame = data.drop(columns=['quality'])
design_matrix = feature_frame.to_numpy(dtype=np.float64)
target = np.array(data['quality'], dtype=np.float64)

In [5]:
# Class balance in percent: first the share of rows with quality above 5,
# then the share of rows with quality of 5 or below.
total_rows = data['quality'].shape[0]
rows_above_five = data[data['quality'] > 5].shape[0]
rows_up_to_five = data[data['quality'] <= 5].shape[0]
print(rows_above_five / total_rows * 100)
print(rows_up_to_five / total_rows * 100)

53.47091932457786
46.52908067542214


In [6]:
# Binarise the labels in place: quality above 5 becomes 1, everything else 0.
# One vectorised assignment replaces the element-by-element Python loop; the
# boolean mask is cast to the array's own dtype when it is written back.
target[:] = target > 5

In [7]:
# Scale every feature column in place by that column's maximum value.
# NOTE(review): the maxima are taken over the whole dataset, i.e. before the
# train/test split performed in a later cell — confirm this is intended.
cnt_signs = design_matrix.shape[1]
max_values = design_matrix.max(axis=0)
# A column whose maximum is 0 would cause a division by zero; such a column is
# left unscaled. max_values itself still holds the true maxima.
safe_divisors = np.where(max_values == 0, 1.0, max_values)
design_matrix /= safe_divisors

In [8]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition reproducible between runs.
split_parts = train_test_split(
    design_matrix,
    target,
    test_size=0.2,
    random_state=24,
)
features_train, features_test, target_train, target_test = split_parts

## Логистическая регрессия

In [9]:
class LogReg:
    """Binary logistic regression fitted by full-batch gradient descent.

    A constant-one column is prepended to the features, so the first learned
    weight plays the role of the intercept. Labels are expected to be 0/1,
    matching the 0/1 values produced by ``predict``.

    Parameters
    ----------
    step : float
        Learning rate applied to every gradient update.
    it_count : int
        Number of gradient-descent iterations performed by ``fit``.
    """

    def __init__(self, step=1e-1, it_count=10000):
        self.step = step
        self.it_count = it_count

    def __add_x0(self, X):
        """Return a new array: ``X`` with a leading column of ones."""
        intercept = np.ones((X.shape[0], 1))
        return np.hstack((intercept, X))

    def get_coeff(self, features, target):
        """Return the fraction of rows whose prediction equals ``target``.

        ``features`` is a 2-D array (one sample per row) and ``target`` the
        matching 1-D label array. All rows are scored in one vectorised call.
        """
        predictions = self.predict(features)
        return float(np.mean(predictions == np.asarray(target)))

    def f(self, z):
        """Logistic sigmoid, applied element-wise."""
        return 1 / (1 + np.exp(-z))

    def fit(self, x, y):
        """Learn the weights from samples ``x`` (2-D) and 0/1 labels ``y`` (1-D).

        The input is not modified: adding the intercept column already builds
        a fresh array, so no explicit copy of ``x`` is required.
        """
        X = self.__add_x0(np.asarray(x, dtype=np.float64))
        y = np.asarray(y)
        self.__weights = np.zeros(X.shape[1])

        for _ in range(self.it_count):
            z = np.dot(X, self.__weights)
            # Gradient of the mean log-loss with respect to the weights.
            grad = np.dot(X.T, self.f(z) - y) / y.size
            self.__weights -= self.step * grad

    def predict(self, x):
        """Return 0/1 predictions for ``x``.

        ``x`` may be a single sample (1-D), which yields an array of length 1,
        or a batch of samples (2-D), which yields one prediction per row.
        """
        samples = np.atleast_2d(np.asarray(x, dtype=np.float64))
        X = self.__add_x0(samples)
        # Rounding the probability thresholds it at 0.5.
        return self.f(np.dot(X, self.__weights)).round()

In [10]:
import copy

In [11]:
# Train the hand-written logistic regression with its default step size and
# iteration count on the training split.
mlg = LogReg()
mlg.fit(features_train, target_train)

In [12]:
# Share of correctly classified test rows, reported in percent.
my_logreg_accuracy = mlg.get_coeff(features_test, target_test) * 100
print('My logic regression: {}'.format(my_logreg_accuracy))

My logic regression: 72.1875


In [13]:
# Reference model: scikit-learn's LogisticRegression with default settings,
# trained and scored on the same split; accuracy is reported in percent.
sklg = LogisticRegression()
sklg.fit(features_train, target_train)

sklearn_logreg_accuracy = sklg.score(features_test, target_test) * 100
print('sklearn logic regression: {}'.format(sklearn_logreg_accuracy))

sklearn logic regression: 72.1875


## Дерево решений

In [14]:
class DesTree:
    """Binary decision-tree classifier built by exhaustive Gini-impurity search.

    Every internal node is a dict with the keys ``'index'`` (feature column),
    ``'value'`` (threshold) and the children ``'l'`` / ``'r'``; a child is
    either another node dict or a class label (a leaf). Rows with
    ``row[index] < value`` go to the left child, all others to the right.

    Parameters
    ----------
    max_depth : int
        Depth at which both children of a node are forced to be leaves.
    min_samples_split : int
        A group with fewer rows than this is turned into a leaf instead of
        being split again. ``fit`` raises it to at least
        ``2 * min_samples_leaf``. The root is always split.
    min_samples_leaf : int
        A group with at most this many rows is turned into a leaf.
    """

    def __init__(self, max_depth=10, min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf

    def fit(self, f, y):
        """Build the tree from the 2-D feature array ``f`` and the labels ``y``."""
        self.n_features_ = f.shape[1]
        self.min_samples_split = max(self.min_samples_split, 2 * self.min_samples_leaf)
        # Append the label as the last column so each row carries its class.
        f = np.column_stack((f, y))
        self.root = self.__split(f)
        self.split(self.root, 1)

    def __split(self, dataset):
        """Return the split of ``dataset`` with the lowest weighted Gini impurity.

        Every value of every feature column is tried as a threshold. On ties
        the first candidate encountered wins, because only a strictly smaller
        impurity replaces the current best.
        """
        class_values = list(set(row[-1] for row in dataset))
        b_index, b_value, b_score, b_groups = None, None, float('inf'), None
        for index in range(len(dataset[0]) - 1):
            for row in dataset:
                groups = self.test_split(index, row[index], dataset)
                GINI = self.__GINI(groups, class_values)
                if GINI < b_score:
                    b_index, b_value, b_score, b_groups = index, row[index], GINI, groups
        return {'index': b_index, 'value': b_value, 'groups': b_groups}

    def __GINI(self, groups, classes):
        """Return the size-weighted Gini impurity of ``groups``."""
        n_instances = float(sum(len(group) for group in groups))
        GINI = 0.0
        for group in groups:
            size = float(len(group))
            if size == 0:
                continue
            # Extract the labels once per group rather than once per class.
            labels = [row[-1] for row in group]
            score = 0.0
            for class_val in classes:
                p = labels.count(class_val) / size
                score += p * p
            GINI += (1.0 - score) * (size / n_instances)
        return GINI

    def test_split(self, index, value, dataset):
        """Partition ``dataset`` into (rows with ``row[index] < value``, the rest)."""
        l, r = list(), list()
        for row in dataset:
            if row[index] < value:
                l.append(row)
            else:
                r.append(row)
        return l, r

    def __terminal(self, group):
        """Return the most frequent label among the rows of ``group``."""
        outcomes = [row[-1] for row in group]
        return max(set(outcomes), key=outcomes.count)

    def split(self, node, depth):
        """Recursively replace ``node['groups']`` by child nodes or leaf labels."""
        l, r = node['groups']
        del node['groups']
        # A split that leaves one side empty separates nothing: make both
        # children the same leaf.
        if not l or not r:
            node['l'] = node['r'] = self.__terminal(l + r)
            return
        if depth >= self.max_depth:
            node['l'], node['r'] = self.__terminal(l), self.__terminal(r)
            return
        for side, group in (('l', l), ('r', r)):
            too_small = (
                len(group) <= self.min_samples_leaf
                or len(group) < self.min_samples_split
            )
            if too_small:
                node[side] = self.__terminal(group)
            else:
                node[side] = self.__split(group)
                self.split(node[side], depth + 1)

    def print_tree(self, node, depth=0):
        """Print the subtree rooted at ``node``, indented one space per level."""
        if isinstance(node, dict):
            print('%s[X%d < %.3f]' % ((depth * ' ', (node['index'] + 1), node['value'])))
            self.print_tree(node['l'], depth + 1)
            self.print_tree(node['r'], depth + 1)
        else:
            print('%s[%s]' % ((depth * ' ', node)))

    def __predict(self, row, node=None):
        """Walk ``row`` down the tree from ``node`` (the root by default) to a leaf."""
        if node is None:
            node = self.root
        side = 'l' if row[node['index']] < node['value'] else 'r'
        child = node[side]
        if isinstance(child, dict):
            return self.__predict(row, child)
        return child

    def predict(self, X):
        """Return an array with the predicted label of every row in ``X``."""
        return np.array([self.__predict(row) for row in X])

In [15]:
# Train the hand-written decision tree with its default hyper-parameters and
# score it on the held-out rows.
tr = DesTree()
tr.fit(features_train, target_train)
tt = tr.predict(features_test)
# accuracy_score takes (y_true, y_pred): the true labels go first.
# NOTE: this value is printed as a fraction, not as a percentage.
print('My decision tree accuracy: {}'.format(accuracy_score(target_test, tt)))

My decision tree accuracy: 0.753125


In [16]:
# Reference model: scikit-learn's decision tree limited to depth 7, trained and
# scored on the same split; accuracy is reported in percent.
# NOTE(review): the hand-written tree above is built with its default depth
# limit of 10, so the two trees are not configured identically.
dt = DecisionTreeClassifier(max_depth=7)
dt.fit(features_train, target_train)
sklearn_tree_accuracy = dt.score(features_test, target_test) * 100
print('sklearn decision tree accuracy: {}'.format(sklearn_tree_accuracy))

sklearn decision tree accuracy: 70.625


## SVM. Метод опорных векторов

In [17]:
def projection_simplex(v, z=1):
    """Shift the 1-D array ``v`` by a common threshold and clip it at zero.

    The threshold is derived from the descending-sorted entries of ``v`` and
    the budget ``z``: it is taken at the last sorted position whose entry
    still exceeds the running average excess over ``z``.
    NOTE(review): this looks like the usual sort-based Euclidean projection
    onto the simplex of total mass ``z`` — confirm against the reference.
    """
    sorted_desc = np.sort(v)[::-1]
    excess = np.cumsum(sorted_desc) - z
    ranks = np.arange(1, v.shape[0] + 1)
    active = (sorted_desc - excess / ranks) > 0
    last_rank = ranks[active][-1]
    threshold = excess[active][-1] / float(last_rank)
    return np.maximum(v - threshold, 0)

In [18]:
class SVM:
    """Linear multiclass SVM trained with per-sample dual updates.

    One weight vector per class is kept in ``coef`` (n_classes x n_features)
    and one dual variable per (class, sample) pair in ``dual_coef``.
    NOTE(review): the update rule looks like dual coordinate ascent for the
    Crammer-Singer multiclass formulation — confirm against the reference.

    Parameters
    ----------
    C : float
        Upper bound on the dual variable of a sample's true class.
    max_iter : int
        Maximum number of sweeps over the training samples.
    eps : float
        Training stops once the summed violation of a sweep, divided by the
        summed violation of the first sweep, drops below this value.
    random_state : None, int or RandomState
        Passed to ``check_random_state`` to shuffle the sample order.
    verbose : int
        When >= 1, progress is printed after every sweep.
    n_classes : int
        Minimum number of classes allocated by ``fit``. The default of 4 is
        the value that used to be hard-coded; ``fit`` allocates more when the
        labels require it, so labels >= 4 no longer raise an IndexError.
    """

    def __init__(self, C=1, max_iter=100, eps=0.01, random_state=None, verbose=0,
                 n_classes=4):
        self.C = C
        self.max_iter = max_iter
        self.eps = eps
        self.random_state = random_state
        self.verbose = verbose
        self.n_classes = n_classes

    def partial_gradient(self, f, t, i):
        """Return the per-class scores of sample ``i`` plus 1, with the entry
        of the sample's true class reduced by 1."""
        g = np.dot(f[i], self.coef.T) + 1
        g[int(t[i])] -= 1
        return g

    def violation(self, g, t, i):
        """Return ``g.max()`` minus the smallest entry of ``g`` among the
        classes whose dual variable is not at its bound (``C`` for the true
        class, 0 for every other class)."""
        smallest = np.inf
        for k in range(g.shape[0]):
            if k == t[i] and self.dual_coef[k, i] >= self.C:
                continue
            elif k != t[i] and self.dual_coef[k, i] >= 0:
                continue

            smallest = min(smallest, g[k])
        return g.max() - smallest

    def solver(self, g, t, norms, i):
        """Return the change to apply to the dual variables of sample ``i``."""
        Ci = np.zeros(g.shape[0])
        Ci[int(t[i])] = self.C
        beta_hat = norms[i] * (Ci - self.dual_coef[:, i]) + g / norms[i]
        z = self.C * norms[i]
        beta = projection_simplex(beta_hat, z)
        return Ci - self.dual_coef[:, i] - beta / norms[i]

    def fit(self, f, t):
        """Train on the 2-D sample array ``f`` and the label array ``t``.

        Labels are used as row indices into ``coef``, so they are expected to
        be non-negative integers (possibly stored as floats). Returns ``self``.
        """
        n_samples, n_features = f.shape
        # Allocate enough rows for the largest label actually present.
        required_classes = int(np.max(t)) + 1 if n_samples else 0
        n_classes = max(self.n_classes, required_classes)
        self.dual_coef = np.zeros((n_classes, n_samples), dtype=np.float64)
        self.coef = np.zeros((n_classes, n_features))
        norms = np.sqrt(np.sum(f ** 2, axis=1))
        rs = check_random_state(self.random_state)
        ind = np.arange(n_samples)
        # The visiting order is shuffled once and reused for every sweep.
        rs.shuffle(ind)
        violation_init = None
        for it in range(self.max_iter):
            violation_sum = 0
            for idx in range(n_samples):
                i = ind[idx]

                # An all-zero sample cannot change the weights.
                if norms[i] == 0:
                    continue
                g = self.partial_gradient(f, t, i)
                v = self.violation(g, t, i)
                violation_sum += v
                if v < 1e-12:
                    continue
                delta = self.solver(g, t, norms, i)
                self.coef += np.outer(delta, f[i])
                self.dual_coef[:, i] += delta
            if it == 0:
                violation_init = violation_sum
            if violation_init == 0:
                # The first sweep found nothing to improve (e.g. no samples or
                # only all-zero samples); stop instead of dividing by zero.
                break
            vratio = violation_sum / violation_init
            if self.verbose >= 1:
                print("iter", it + 1, "violation", vratio)
            if vratio < self.eps:
                if self.verbose >= 1:
                    print("Converged")
                break
        return self

    def predict(self, f):
        """Return the index of the highest-scoring class.

        ``f`` may be a single sample (1-D), giving a scalar, or a batch (2-D),
        giving one class index per row. The arg-max is taken over the class
        axis, which is the last axis in both cases.
        """
        decision = np.dot(f, self.coef.T)
        return decision.argmax(axis=-1)

    def get_coeff(self, features, target):
        """Return the fraction of rows of ``features`` predicted as ``target``."""
        predictions = self.predict(features)
        return float(np.mean(predictions == np.asarray(target)))

In [19]:
# Train the hand-written SVM with its default settings and report the share of
# correctly classified test rows in percent.
svm = SVM()
svm.fit(features_train, target_train)
my_svm_accuracy = svm.get_coeff(features_test, target_test) * 100
print('My SVM: {}'.format(my_svm_accuracy))

My SVM: 71.25


In [20]:
# Reference model: scikit-learn's SVC with a one-vs-rest decision function,
# trained and scored on the same split; accuracy is reported in percent.
svc = SVC(decision_function_shape='ovr')
svc.fit(features_train, target_train)
sklearn_svm_accuracy = svc.score(features_test, target_test) * 100
print('sklearn SVM: {}'.format(sklearn_svm_accuracy))

sklearn SVM: 71.875
