# Лабораторная работа 2

В этой лабораторной работе необходимо сделать свою реализацию алгоритмов машинного обучения и сравнить её с реализацией из sklearn на 2 датасетах, полученных в предыдущей лабораторной работе.

In [2]:
import pandas
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import check_random_state
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import time
import warnings
warnings.filterwarnings("ignore")
from collections import Counter
import sys

Заготавливаем функцию, которая будет отвечать за обучение, тестирование, подсчёт точности и др.

In [10]:
def DisplayMetrics(Method, X, Y, folds = 5, average = 'macro'):
    """Cross-validate a classifier and print its fold-averaged metrics.

    The data is split with a shuffled ``StratifiedKFold`` (fixed seed 128).
    For every fold the model is re-fitted on the training part, and
    precision, recall, train accuracy and test accuracy are collected; their
    means over all folds are printed.

    Args:
        Method: object exposing ``fit(X, y)`` and ``predict(X)``.
        X: feature table (pandas DataFrame or array-like); cast to float32.
        Y: target labels (pandas Series or array-like), one per row of X.
        folds: number of cross-validation folds.
        average: averaging mode forwarded to ``precision_score`` and
            ``recall_score``.

    Returns:
        None. The four averaged metrics are printed to stdout.
    """
    kf = StratifiedKFold(n_splits=folds, random_state=128, shuffle=True)
    precision = np.zeros(folds)
    recall = np.zeros(folds)
    testAc = np.zeros(folds)
    trainAc = np.zeros(folds)

    # Convert once to plain arrays: split() yields *positional* indices, so
    # they must not be used as pandas labels (``.loc``), which silently picks
    # wrong rows or fails when the index is not 0..n-1.
    features = np.asarray(X, dtype=np.float32)
    labels = np.asarray(Y)

    for step, (trainI, valI) in enumerate(kf.split(features, labels)):
        TrX, TrY = features[trainI], labels[trainI]
        ValX, ValY = features[valI], labels[valI]
        Method.fit(TrX, TrY)
        PredY = Method.predict(ValX)
        PredTrY = Method.predict(TrX)
        precision[step] = precision_score(ValY, PredY, average=average)
        recall[step] = recall_score(ValY, PredY, average=average)
        trainAc[step] = accuracy_score(TrY, PredTrY)
        testAc[step] = accuracy_score(ValY, PredY)

    print("precision:", precision.mean())
    print("recall:", recall.mean())
    print("train_accuracy:", trainAc.mean())
    print("test_accuracy:", testAc.mean())

# Реализация своими руками

Логистическая регрессия

In [11]:
class LogReg():
    """Binary logistic regression trained with full-batch gradient descent.

    The model adds an intercept column itself, starts from zero weights and
    performs ``grad_iters`` gradient steps of size ``learning_rate``.
    Targets are expected to be 0/1 (or boolean); ``predict`` returns booleans.
    """

    def __init__(self, learning_rate = 0.01, grad_iters=100):
        self.lr = learning_rate
        self.gi = grad_iters

    def __sigmoid(self, x):
        """Return the logistic function of ``x`` without overflowing.

        ``exp`` is only ever evaluated on non-positive arguments, so large
        negative logits no longer produce ``inf`` and an overflow warning.
        """
        x = np.asarray(x, dtype=np.float64)
        decay = np.exp(-np.abs(x))
        return np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def __loss(self, h, y):
        """Return the mean binary cross-entropy of probabilities ``h``."""
        # Keep probabilities strictly inside (0, 1) so log() stays finite.
        eps = np.finfo(np.float64).eps
        h = np.clip(h, eps, 1.0 - eps)
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    def __add_intercept(self, X):
        """Prepend a column of ones to ``X`` (the bias feature)."""
        return np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)

    def fit(self, X, y):
        """Fit the weights on features ``X`` (n, d) and 0/1 targets ``y`` (n,)."""
        X = self.__add_intercept(np.asarray(X))
        y = np.asarray(y)
        self.w = np.zeros(X.shape[1])

        for _ in range(self.gi):
            h = self.__sigmoid(np.dot(X, self.w))
            g = np.dot(X.T, (h - y)) / y.size
            self.w -= (self.lr * g).astype(np.float64)

    def __predict_probability(self, X):
        """Return the estimated probability of the positive class per row."""
        X = self.__add_intercept(np.asarray(X))
        return self.__sigmoid(np.dot(X, self.w))

    def predict(self, X, threshold=0.5):
        """Return a boolean array: True where the probability >= ``threshold``."""
        return self.__predict_probability(X) >= threshold

KNN

In [12]:
class KNN():
    """k-nearest-neighbours classifier using Euclidean distance.

    ``fit`` only memorises the training data; ``predict`` assigns each query
    point the most frequent label among its ``neighbors`` closest training
    points. Labels must be numeric because predictions are returned as a
    float array.
    """

    def __init__(self, neighbors=5):
        if neighbors < 1:
            raise ValueError("neighbors must be a positive integer")
        self.nn = neighbors

    def fit(self, X, y):
        """Store training features ``X`` (n, d) and labels ``y`` (n,)."""
        self.X = np.asarray(X)
        y = np.asarray(y)
        # Labels are kept as a column vector, as before.
        self.y = y.reshape((y.shape[0], 1))

    def __get_distances(self, p):
        """Return Euclidean distances from point ``p`` to every training row."""
        diff = self.X - p
        return np.sqrt((diff ** 2).sum(1))

    def predict(self, X):
        """Return the majority label of the nearest neighbours for each row.

        Raises:
            ValueError: if the model was fitted on an empty training set.
        """
        X = np.asarray(X)
        labels = self.y[:, 0]
        # Never ask for more neighbours than there are training samples;
        # argpartition requires kth < len(d).
        k = min(self.nn, labels.shape[0])
        if k == 0:
            raise ValueError("KNN was fitted on an empty training set")

        y_pred = np.zeros(X.shape[0])
        for i, point in enumerate(X):
            d = self.__get_distances(point)
            # Partitioning around position k-1 puts the k smallest distances
            # in the first k slots (in arbitrary order).
            nearest = np.argpartition(d, k - 1)[:k]
            votes = Counter(labels[nearest].tolist())
            y_pred[i] = votes.most_common(1)[0][0]
        return y_pred

Дерево решений

In [13]:
class Node():
    """One node of a binary decision tree.

    Attributes:
        predType: class label predicted when the traversal stops here.
        iFeature: column index tested at this node (inner nodes only).
        border: threshold; rows with ``x[iFeature] < border`` go left.
        left, right: child nodes, or None for a leaf.
    """

    def __init__(self, predType):
        self.predType = predType
        self.iFeature = 0
        self.border = 0
        self.left = None
        self.right = None


class DecisionTree():
    """CART-style classification tree using the Gini impurity.

    Args:
        mDepth: maximum depth of the tree (0 yields a single leaf).
        rf: when True the tree behaves as a random-forest member: it is
            trained on a bootstrap sample of the rows and on a random subset
            of the feature columns chosen once per tree.
    """

    def __init__(self, mDepth = 1, rf = False):
        self.mDepth = mDepth
        self.rf = rf

    def fit(self, X, y, maxFeatures = None):
        """Build the tree from features ``X`` (n, d) and labels ``y`` (n,).

        Args:
            X: 2-D numeric array of features.
            y: 1-D array of class labels (any sortable dtype).
            maxFeatures: number of feature columns to use when ``rf`` is
                True; defaults to ``sqrt(d)``. Ignored when ``rf`` is False.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # Sorted unique labels give a stable label <-> index mapping.
        self.classes = np.unique(y)
        self.sizeY = len(self.classes)
        self.setY = set(self.classes.tolist())
        self.dictY = {t: i for i, t in enumerate(self.classes.tolist())}

        n_samples, n_features = X.shape
        if not self.rf:
            self.features = np.arange(n_features)
        else:
            # Bootstrap: draw n rows with replacement.
            rows = np.random.choice(n_samples, n_samples)
            X, y = X[rows], y[rows]
            if maxFeatures is None:
                n_used = max(1, int(np.sqrt(n_features)))
            else:
                n_used = min(maxFeatures, n_features)
            self.features = np.sort(
                np.random.choice(n_features, n_used, replace=False))
        self.tree = self.UpdateTree(X, y)

    def predict(self, X):
        """Return an array with the predicted class label for each row of X."""
        predictions = []
        for row in np.asarray(X):
            node = self.tree
            while node.left is not None:
                if row[node.iFeature] < node.border:
                    node = node.left
                else:
                    node = node.right
            predictions.append(node.predType)
        return np.array(predictions)

    def _class_counts(self, y):
        """Return how many samples of ``y`` fall into each known class."""
        codes = np.searchsorted(self.classes, y)
        return np.bincount(codes, minlength=self.sizeY)

    def Split(self, X, y):
        """Find the split with the lowest weighted Gini impurity.

        Returns:
            ``(feature_index, threshold)`` of the best split, or
            ``(None, None)`` when no split improves on the parent impurity.
        """
        m = y.size
        if m <= 1:
            return None, None

        codes = np.searchsorted(self.classes, y)
        parent = np.bincount(codes, minlength=self.sizeY).astype(float)
        bGini = 1.0 - np.sum((parent / m) ** 2)
        bIdx, bThr = None, None

        # Candidate cut i puts the first i sorted samples on the left.
        n_left = np.arange(1, m)
        n_right = m - n_left
        row_ids = np.arange(m)

        for idx in self.features:
            column = X[:, idx]
            order = np.argsort(column, kind="stable")
            values = column[order]
            # A threshold can only be placed between two distinct values.
            can_cut = values[1:] != values[:-1]
            if not can_cut.any():
                continue

            # Running per-class counts on the left side of every cut.
            one_hot = np.zeros((m, self.sizeY))
            one_hot[row_ids, codes[order]] = 1.0
            left = np.cumsum(one_hot, axis=0)[:-1]
            right = parent - left

            gini_left = 1.0 - np.sum((left / n_left[:, None]) ** 2, axis=1)
            gini_right = 1.0 - np.sum((right / n_right[:, None]) ** 2, axis=1)
            gini = (n_left * gini_left + n_right * gini_right) / m
            gini[~can_cut] = np.inf

            pos = int(np.argmin(gini))
            # Strict comparison keeps the first best split found.
            if gini[pos] < bGini:
                bGini = gini[pos]
                bIdx = int(idx)
                bThr = (values[pos] + values[pos + 1]) / 2
        return bIdx, bThr

    def UpdateTree(self, X, y, depth = 0):
        """Recursively build and return the subtree for samples ``(X, y)``."""
        counts = self._class_counts(y)
        # Store the actual label, not its position among the classes.
        node = Node(predType=self.classes[int(np.argmax(counts))])
        if depth >= self.mDepth:
            return node

        idx, thr = self.Split(X, y)
        if idx is None:
            return node

        goes_left = X[:, idx] < thr
        # Rounding of the midpoint can leave one side empty; keep a leaf then.
        if not goes_left.any() or goes_left.all():
            return node

        node.iFeature = idx
        node.border = thr
        node.left = self.UpdateTree(X[goes_left], y[goes_left], depth + 1)
        node.right = self.UpdateTree(X[~goes_left], y[~goes_left], depth + 1)
        return node


Случайный лес

In [14]:
class RandomForest():
    """Bagging ensemble of ``DecisionTree`` models with majority voting.

    Args:
        max_depth: maximum depth of every tree.
        n_estimators: number of trees in the forest.
        max_features: number of feature columns each tree may use; None lets
            the tree pick its own default.

    Predictions are cast to ``int``, so class labels must be integers.
    """

    def __init__(self, max_depth=5, n_estimators=100, max_features=None):
        self.max_depth = max_depth
        self.max_features = max_features
        self.n_estimators = n_estimators
        self.forest = [None] * n_estimators

    def fit(self, X, y):
        """Train ``n_estimators`` independent trees on ``(X, y)``."""
        # Rebuild the list so a changed n_estimators is honoured.
        self.forest = []
        for _ in range(self.n_estimators):
            tree = DecisionTree(self.max_depth, rf=True)
            # Forward max_features; it used to be stored but never applied.
            tree.fit(X, y, maxFeatures=self.max_features)
            self.forest.append(tree)

    def predict(self, X):
        """Return the most frequent tree prediction for every row of X."""
        preds = np.zeros((self.n_estimators, X.shape[0]))
        for i, tree in enumerate(self.forest):
            preds[i] = tree.predict(X)
        most_common = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            votes = Counter(preds[:, i].tolist())
            most_common[i] = votes.most_common(1)[0][0]
        return most_common.astype(int)

# Первый датасет - Flats

Для этого датасета мы попробуем определить ценовую категорию квартиры: целую часть от деления цены на 50 единиц (в коде `price // 50`).

In [31]:
# Load the flats table; every column except 'price' is used as a feature.
# NOTE(review): the preview below shows index-like 'Unnamed: 0' columns among
# the features — confirm they are meant to be predictors.
pr = pandas.read_csv("/Users/dmitry/AI_2/flats_moscow.csv")
required = list(pr.columns)
required.remove('price')
# Target: price bucket obtained by floor-dividing the price by 50.
y = pr["price"].floordiv(50)
x = pandas.get_dummies(pr.loc[:, required])
x.head()

Unnamed: 0.1,Unnamed: 0,totsp,livesp,kitsp,dist,metrdist,walk,brick,floor,code
0,1,58,40,6.0,12.5,7,1,1,1,3
1,2,44,28,6.0,13.5,7,1,0,1,6
2,3,70,42,6.0,14.5,3,1,1,1,3
3,4,61,37,6.0,13.5,7,1,0,1,1
4,5,104,60,11.0,10.5,7,0,1,1,3


# Логистическая регрессия

sklearn

In [32]:
%%time
# Baseline: scikit-learn logistic regression with default settings,
# 5-fold stratified cross-validation on the flats data.
DisplayMetrics(LogisticRegression(), x, y)

precision: 0.20628272876431578
recall: 0.1725683569210668
train_accuracy: 0.6194852941176471
test_accuracy: 0.6215686274509804
CPU times: user 874 ms, sys: 32.1 ms, total: 906 ms
Wall time: 1.21 s


своими руками

In [20]:
%%time
# Hand-written logistic regression, 100 gradient steps, on the flats data.
# NOTE(review): LogReg.predict returns booleans from a single sigmoid, while
# y = price // 50 may take more than two values — confirm the target used for
# this run was binary before trusting the scores.
DisplayMetrics(LogReg(grad_iters=100), x, y)

precision: 1.0
recall: 1.0
train_accuracy: 1.0
test_accuracy: 1.0
CPU times: user 161 ms, sys: 15.9 ms, total: 177 ms
Wall time: 248 ms


# KNN

sklearn

In [22]:
%%time
# Baseline: scikit-learn k-NN with 5 neighbours on the flats data.
DisplayMetrics(KNeighborsClassifier(n_neighbors=5), x, y)

precision: 1.0
recall: 1.0
train_accuracy: 1.0
test_accuracy: 1.0
CPU times: user 536 ms, sys: 9.73 ms, total: 545 ms
Wall time: 652 ms


своими руками

In [23]:
%%time
%prun KNN
# Hand-written k-NN on the flats data.
# NOTE(review): uses 3 neighbours while the scikit-learn run uses 5, so the
# two results are not a like-for-like comparison. The `%prun KNN` line above
# only profiles evaluating the bare class name.
DisplayMetrics(KNN(neighbors = 3), x, y)

 precision: 1.0
recall: 1.0
train_accuracy: 1.0
test_accuracy: 1.0
CPU times: user 1.02 s, sys: 36.3 ms, total: 1.06 s
Wall time: 1.38 s


# Дерево решений

sklearn

In [24]:
%%time
# Baseline: scikit-learn decision tree limited to depth 5 on the flats data.
DisplayMetrics(DecisionTreeClassifier(max_depth=5), x, y)

precision: 1.0
recall: 1.0
train_accuracy: 1.0
test_accuracy: 1.0
CPU times: user 38.4 ms, sys: 3.64 ms, total: 42 ms
Wall time: 48.9 ms


своими руками

In [25]:
%%time
# Hand-written decision tree on the flats data.
# NOTE(review): depth 4 here versus max_depth=5 in the scikit-learn run.
DisplayMetrics(DecisionTree(mDepth = 4), x, y)

precision: 1.0
recall: 1.0
train_accuracy: 1.0
test_accuracy: 1.0
CPU times: user 684 ms, sys: 26.5 ms, total: 711 ms
Wall time: 890 ms


# Случайный лес

sklearn

In [26]:
%%time 
# Baseline: scikit-learn random forest, 50 trees of depth 2, on the flats data.
DisplayMetrics(RandomForestClassifier(n_estimators=50, max_depth=2), x, y)

precision: 1.0
recall: 1.0
train_accuracy: 1.0
test_accuracy: 1.0
CPU times: user 747 ms, sys: 17.9 ms, total: 765 ms
Wall time: 1.01 s


своими руками

In [27]:
%%time
# Hand-written random forest with its defaults (100 trees, depth 5).
# NOTE(review): the scikit-learn run uses 50 trees of depth 2, so both the
# timing and the scores are measured on different model sizes.
DisplayMetrics(RandomForest(), x, y)

precision: 1.0
recall: 1.0
train_accuracy: 1.0
test_accuracy: 1.0
CPU times: user 18.2 s, sys: 212 ms, total: 18.4 s
Wall time: 19.7 s


# Второй датасет

In [33]:
# Load the cleaned cars table; every column except 'price' is a feature and
# categorical columns are one-hot encoded.
carpr = pandas.read_csv("/Users/dmitry/AI_2/carsclean.csv")
required = list(carpr.columns)
required.remove('price')
# Target: price bucket obtained by floor-dividing the price by 50000.
Y = carpr['price'].floordiv(50000)
X = pandas.get_dummies(carpr.loc[:, required])
X.head()

Unnamed: 0,year,kilometers,city_Al Riyadh,car_maker_BMW,car_maker_Cadillac,car_maker_Dodge,car_maker_Ford,car_maker_GMC,car_maker_Kia,car_maker_Mazda,...,color_Beige,color_Black,color_Brown,color_Green,color_Grey,color_Silver,color_White,pay_method_Cash Only,pay_method_Cash or Installments,pay_method_Installments Only
0,2019,14999.5,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,2006,0.0,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0
2,2014,24999.5,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,2010,200000.0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4,2018,5000.0,1,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0


# Логистическая регрессия

sklearn

In [42]:
%%time
# Baseline: scikit-learn logistic regression with default settings,
# 5-fold stratified cross-validation on the cars data.
DisplayMetrics(LogisticRegression(), X, Y)

precision: 0.3744085371992349
recall: 0.4427380952380953
train_accuracy: 0.75
test_accuracy: 0.75
CPU times: user 156 ms, sys: 3.92 ms, total: 160 ms
Wall time: 166 ms


своими руками

In [43]:
%%time
# Hand-written logistic regression with default learning rate and step count.
# NOTE(review): the model is binary, while Y = price // 50000 may contain
# more than two classes — verify against the data.
DisplayMetrics(LogReg(), X, Y)

precision: 0.31860041207949896
recall: 0.3597619047619047
train_accuracy: 0.6205357142857143
test_accuracy: 0.625
CPU times: user 93.3 ms, sys: 9.44 ms, total: 103 ms
Wall time: 72.4 ms


# KNN

sklearn

In [44]:
%%time
# Baseline: scikit-learn k-NN with 5 neighbours on the cars data.
DisplayMetrics(KNeighborsClassifier(n_neighbors=5), X, Y)

precision: 1.0
recall: 1.0
train_accuracy: 1.0
test_accuracy: 1.0
CPU times: user 169 ms, sys: 3.53 ms, total: 172 ms
Wall time: 178 ms


своими руками

In [45]:
%%time
# Hand-written k-NN on the cars data.
# NOTE(review): 8 neighbours here versus 5 in the scikit-learn run.
DisplayMetrics(KNN(neighbors = 8), X, Y)

precision: 1.0
recall: 1.0
train_accuracy: 1.0
test_accuracy: 1.0
CPU times: user 235 ms, sys: 6.45 ms, total: 242 ms
Wall time: 250 ms


# Дерево решений

sklearn

In [46]:
%%time
# Baseline: scikit-learn decision tree limited to depth 15 on the cars data.
DisplayMetrics(DecisionTreeClassifier(max_depth=15), X, Y)

precision: 1.0
recall: 1.0
train_accuracy: 1.0
test_accuracy: 1.0
CPU times: user 38.3 ms, sys: 3.16 ms, total: 41.4 ms
Wall time: 43.8 ms


своими руками

In [47]:
%%time
# Hand-written decision tree with its default depth of 1 (a single split).
# NOTE(review): the scikit-learn run allows depth 15, which likely explains
# most of the gap between the two results.
DisplayMetrics(DecisionTree(), X, Y)

precision: 0.5253172030919347
recall: 0.5380952380952381
train_accuracy: 0.8214285714285714
test_accuracy: 0.8214285714285714
CPU times: user 1.11 s, sys: 12.5 ms, total: 1.13 s
Wall time: 1.26 s


# Случайный лес

sklearn

In [34]:
%%time 
# Baseline: scikit-learn random forest, 50 trees of depth 2, on the cars data.
DisplayMetrics(RandomForestClassifier(n_estimators=50, max_depth=2), X, Y)


precision: 0.671639560662117
recall: 0.5547619047619048
train_accuracy: 0.8352678571428573
test_accuracy: 0.8375
CPU times: user 610 ms, sys: 12.9 ms, total: 623 ms
Wall time: 710 ms


своими руками

In [35]:
%%time
# Hand-written random forest on the cars data.
# NOTE(review): only 2 trees here versus 50 in the scikit-learn run (same
# depth of 2), so the comparison is not like-for-like.
DisplayMetrics(RandomForest(max_depth=2,n_estimators=2), X, Y)

precision: 0.6946917173004129
recall: 0.5864285714285715
train_accuracy: 0.7839285714285714
test_accuracy: 0.7928571428571429
CPU times: user 806 ms, sys: 15.2 ms, total: 821 ms
Wall time: 1.22 s


# Вывод

Реализация своими руками сильно уступает в скорости реализации на sklearn. Причиной этому может быть как неоптимизированность моего кода (например, приведение типа X к float32 дало прибавку в скорости в 8 раз), так и хорошая оптимизация кода sklearn. За исключением времени, все показатели оказались на уровне sklearn во всех случаях, кроме логистической регрессии: вероятно, я сильно накосячил в её реализации.

Пожалуй, главным выводом будет никогда не пользоваться реализацией своими руками, т.к. она очень медленная.