In [6]:
import numpy as np
from scipy.optimize import minimize
import math
from random import random, randint, shuffle
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression as PythonLogisticRegression
from sklearn.neighbors import KNeighborsClassifier as PythonKNN
from sklearn.svm import SVC as PythonSVM
from sklearn.tree import DecisionTreeClassifier as PythonDecisionTree
from sklearn.ensemble import RandomForestClassifier as PythonRandomForest
from sklearn import metrics
from collections import namedtuple
import heapq

In [168]:
class LogRegr:
    def __init__(self, epochs=100):
        self._pred_substruc = None
        self.__epochs = epochs

    def fit(self, features, targets):
        class Similarity_Function:
            def __init__(self, x, y):
                one = np.fromiter((1 for _ in range(x.shape[0])), dtype=float)
                self._x = np.append(one[:,np.newaxis], x, axis=1)
                self._y = y

            def __call__(self, b):
                polynome = np.sum(self._x * b, axis=1)
                prob = 1 / (1 + np.exp(-polynome))
                return -(np.sum(self._y * np.log(prob) + (1 - self._y) * np.log(1 - prob)))

        class SF_Jacobian(Similarity_Function):
            def __call__(self, b):
                polynome = np.sum(self._x * b, axis=1)
                mul = np.sum(self._y - 1 / (1 + np.exp(-polynome)))
                print("jac mul:", mul)
                return -b * mul

        class Pred_substruc:
            def __init__(self, b):
                self.__b = b

            def __call__(self, x):
                one = np.array([1] * x.shape[0])
                x_axis = np.append(one[:,np.newaxis], x, axis=1)
                polynome = np.sum(x_axis * self.__b, axis=1)
                return 1 / (1 + np.exp(-polynome))

        start = (np.random.random(features.shape[1] + 1) - 0.5)

        self.__coefs = minimize(
            Similarity_Function(features, targets), start, method='CG', 
#             jac=SF_Jacobian(features, targets),
            options={'maxiter': self.__epochs}
        ).x

        self._pred_substruc = Pred_substruc(self.__coefs)
        self.__threshold = 0.5

    def predict(self, features):
        if not self._pred_substruc is None and self.__coefs.size == features.shape[1] + 1:
            return (self._pred_substruc(features) > self.__threshold) * 1
        

In [89]:
class KNN:
    def __init__ (self, k):
        self.k = k
        self.classes = 2
        self.features = None
        self.targets = None
        
    def dist(self, a, b):
        root_pow = len(a)
        dist = 0
        for elem in a:
            if (elem < 10): elem *= 100
            dist += elem * elem
            
        dist = pow(dist, 1/root_pow)
        return dist
    
    def fit (self, features, targets):
        self.features = features
        self.targets = targets
        
    
    def predict(self, test_data):
        
        testLabels = []
        
        for testPoint in test_data:
            #Claculate distances between test point and all of the train points
            testDist = [ [self.dist(testPoint, self.features[i]), self.targets[i]] for i in range(len(self.features))]
#             print(testDist)
            #How many points of each class among nearest K
            stat = [0 for i in range(self.classes)]
            for d in sorted(testDist)[0:self.k]:
                stat[d[1]] += 1
            #Assign a class with the most number of occurences among K nearest neighbours
            testLabels.append( sorted(zip(stat, range(self.classes)), reverse=True)[0][1] )
        return testLabels

In [34]:
class my_SVM():
    def __init__(self, epochs=5, a=0.5):
        self.__epochs = epochs
        self.__a = a
        self.__predicator = None
        
    def fit(self, features, targets):
        class LossFunc:
            def __init__(self, x, y, a):
                fi = np.array([1] * x.shape[0])
                self._x = np.append(fi[:,np.newaxis], x, axis=1)
                self._y = y
                self._a = a
                self._i = 0
            
            def _iterate(self):
                self._i += 1
                if self._i == len(self._x):
                    self._i = 0
            
            def __call__(self, w):
                res = max(0.0, 1.0 - self._y[self._i] * np.sum(w * self._x[self._i])) + self._a * np.sum(w * w) / 2
                self._iterate()
                return res
            
        class Jac(LossFunc):
            def __call__(self, w):
                if self._y[self._i] * np.sum(w * self._x[self._i]) < 1:
                    res = self._a * w - self._y[self._i] * self._x[self._i]
                else:
                    res = self._a * w
                self._iterate()
                return res
            
        class Predicator:
            def __init__(self, w):
                self.__w = w

            def __call__(self, x):
                fi = np.array([1] * x.shape[0])
                xa = np.append(fi[:,np.newaxis], x, axis=1)
                return np.sum(self.__w * xa, axis=1)
            
        start = np.random.random(features.shape[1] + 1) - 0.5

        m_func = LossFunc(features, targets, self.__a)
        m_jac = Jac(features, targets, self.__a)

        for i in range(self.__epochs):
            for j in range(features.shape[0]):
                self.__coefs = minimize(
                    m_func, start, method='CG', 
                    jac=m_jac,
                    options={'maxiter': 1}
                ).x

        self.__predicator = Predicator(self.__coefs)
        
    def predict(self, features):
        if not self.__predicator is None and self.__coefs.size == features.shape[1] + 1:
            return (self.__predicator(features) > 0) * 1
    



In [11]:
class DecisionTreeLeaf:
    def __init__(self, decision):
        self.__decision = decision

    def predict(self, point):
        return self.__decision

class DecisionTreeNode:
    def __init__(self, feature, separator, left, right):
        self.__feature = feature
        self.__separator = separator
        self.__left = left
        self.__right = right

    def predict(self, point):
        if point[self.__feature] <= self.__separator:
            return self.__left.predict(point)
        else:
            return self.__right.predict(point)

class DecisionTree:
    def __init__(self, depth=None):
        self.__root = None
        self.__depth = depth

    def fit(self, features, targets):
        self.__root = self.__construct(features, targets)

    def predict(self, points):
        if self.__root is not None:
            ans = []
            for point in points:
                ans.append(self.__root.predict(point))
            return np.array(ans)

    def __construct(self, features, targets, depth=0):
        unique_targets = np.unique(targets)
        if unique_targets.size == 1:
            return DecisionTreeLeaf(unique_targets[0])

        if not self.__depth is None and self.__depth == depth:
            return DecisionTreeLeaf(self.__get_freq_target(targets))

        max_gain = None
        for i in range(features.shape[1]):
            curr_gain, curr_sep = self.__information_gain(features[:,i], targets)
            if max_gain is None or curr_gain > max_gain:
                max_ind = i
                max_sep = curr_sep
                max_gain = curr_gain

        left = ([], [])
        right = ([], [])
        for i in range(features.shape[0]):
            if features[i, max_ind] <= max_sep:
                left[0].append(features[i])
                left[1].append(targets[i])
            else:
                right[0].append(features[i])
                right[1].append(targets[i])

        if len(left[0]) == 0 or len(right[0]) == 0:
            return DecisionTreeLeaf(self.__get_freq_target(targets))

        return DecisionTreeNode(
            feature=max_ind,
            separator=max_sep,
            left=self.__construct(np.array(left[0]), np.array(left[1]), depth + 1),
            right=self.__construct(np.array(right[0]), np.array(right[1]), depth + 1)
        
        )

    def __information_gain(self, features, targets):
        targets_counts = {}
        for target in targets:
            if target not in targets_counts:
                targets_counts[target] = 0
            targets_counts[target] += 1

        sorted_targets_inds = np.argsort(features)
        values = np.sort(features)

        optimal_separator = None
        curr_targets_counts = {}
        for target in targets_counts:
            curr_targets_counts[target] = 0

        for i in range(len(targets) - 1):
            curr_targets_counts[targets[sorted_targets_inds[i]]] += 1

            curr_information_ammount = 0
            for target in targets_counts:
                p1 = curr_targets_counts[target] / (i + 1)
                p2 = (targets_counts[target] - curr_targets_counts[target]) / (len(targets) - (i + 1))

                if p1 > 0:
                    curr_information_ammount -= p1 * math.log(p1, 2)
                if p2 > 0:
                    curr_information_ammount -= p2 * math.log(p2, 2)

            if optimal_separator is None or curr_information_ammount < optimal_information_ammount:
                optimal_separator = values[i]
                optimal_information_ammount = curr_information_ammount

        base_information_ammount = 0
        for target_count in targets_counts.values():
            p = target_count / len(features)
            base_information_ammount -= p * math.log(p, 2)

        return (base_information_ammount - optimal_information_ammount, optimal_separator)

    def __get_freq_target(self, targets):
        targets_count = {}

        for target in targets:
            if not target in targets_count:
                targets_count[target] = 0
            targets_count[target] += 1

        best_target = None
        for curr_target in targets_count:
            if best_target is None or targets_count[best_target] < targets_count[curr_target]:
                best_target = curr_target

        return best_target

In [12]:
class RandomForest:
    def __init__(self, trees_count=None, depth=None, effective_factors=None):
        self.__trees_count = trees_count
        self.__depth = depth
        self.__effective_factors = effective_factors
        self.__trees = None

    def fit(self, features, targets):
        if self.__effective_factors is None:
            self.__effective_factors = math.floor(math.sqrt(features.shape[1]))
        if self.__effective_factors > features.shape[1]:
            self.__effective_factors = features.shape[1]

        if self.__trees_count is None:
            self.__trees_count = math.ceil(math.sqrt(features.shape[1]))

        self.__trees = []

        for i in range(self.__trees_count):
            points_inds = [randint(0, features.shape[0] - 1) for i in range(features.shape[0])]
            tmp = [i for i in range(features.shape[1])]
            shuffle(tmp)
            targets_inds = sorted(tmp[:self.__effective_factors])

            curr_points = []
            curr_targets = []
            for ind in points_inds:
                curr_points.append(features[ind])
                curr_targets.append(targets[ind])

            curr_points = np.array(curr_points)
            curr_targets = np.array(curr_targets)

            trunc_factors = curr_points[:, targets_inds[0], np.newaxis]
            for ind in targets_inds[1:]:
                np.append(trunc_factors, curr_points[:, ind, np.newaxis], axis=1)

            curr_tree = DecisionTree(self.__depth)
            curr_tree.fit(trunc_factors, curr_targets)
            self.__trees.append(curr_tree)

    def predict(self, points):
        ans = []
        for point in points:
            votes = {}
            for tree in self.__trees:
                pred = tree.predict([point])[0]
                if pred not in votes:
                    votes[pred] = 0
                votes[pred] += 1

            predicted_class = None
            for class_id, votes_count in votes.items():
                if predicted_class is None or votes[predicted_class] < votes_count:
                    predicted_class = class_id

            ans.append(predicted_class)
        return np.array(ans)

In [13]:
data = pd.read_csv(os.path.join("C:/Users/cerma/Desktop/MAI/3c/AI/lab1/classification_data/star_class_ready.csv"))

In [131]:
probs = np.random.rand(len(data))
sampling_rate = 0.8
training_mask = probs < sampling_rate
test_mask = probs >= sampling_rate

learn_data = data[training_mask]
learn_features = learn_data.drop("Star type", axis='columns').to_numpy()
learn_targets = learn_data["Star type"].to_numpy()
for i in range(0, len(learn_targets)):
    if learn_targets[i] > 2:
        learn_targets[i] = 1
    else :
        learn_targets[i] = 0

test_data = data[test_mask]
test_features = test_data.drop("Star type", axis='columns').to_numpy()
test_targets = test_data["Star type"].to_numpy()
for i in range(0, len(test_targets)):
    if test_targets[i] > 2:
        test_targets[i] = 1
    else :
        test_targets[i] = 0

In [176]:
def learn_w_metrics(model, learn_features, learn_targets, test_features, test_targets, name):
    model.fit(learn_features, learn_targets)
    res = model.predict(test_features)
    print("{} metrics:".format(name))
    print("accuracy:", metrics.accuracy_score(test_targets, res))
    print("precision:", metrics.precision_score(test_targets, res))
    print("f1 score:", metrics.f1_score(test_targets, res))
    print("recall:", metrics.recall_score(test_targets, res))
    print()

In [177]:
learn_w_metrics(LogRegr(), learn_features, learn_targets, test_features, test_targets, "My logistic regression")

My logistic regression metrics:
accuracy: 0.8723404255319149
precision: 1.0
f1 score: 0.8571428571428571
recall: 0.75



  from ipykernel import kernelapp as app
  app.launch_new_instance()
  app.launch_new_instance()


In [178]:
learn_w_metrics(PythonLogisticRegression(max_iter=500), learn_features, learn_targets, test_features, test_targets, "Python logistic regression")

Python logistic regression metrics:
accuracy: 1.0
precision: 1.0
f1 score: 1.0
recall: 1.0



In [220]:
knn_sampling_rate = 0.8

knn_learn_probs = np.random.rand(len(learn_features))
taken_mask = knn_learn_probs < knn_sampling_rate
knn_learn_features = learn_features[taken_mask]
knn_learn_targets = learn_targets[taken_mask]
# print(knn_learn_targets)

knn_test_probs = np.random.rand(len(test_features))
taken_mask = knn_test_probs < knn_sampling_rate
knn_test_features = test_features[taken_mask]
knn_test_targets = test_targets[taken_mask]
# print(knn_test_targets)

learn_w_metrics(KNN(50), knn_learn_features, knn_learn_targets, knn_test_features, knn_test_targets, "My KNN")

My KNN metrics:
accuracy: 0.4878048780487805
precision: 0.0
f1 score: 0.0
recall: 0.0



In [186]:
learn_w_metrics(PythonKNN(), learn_features, learn_targets, test_features, test_targets, "Python KNN")

Python KNN metrics:
accuracy: 0.9148936170212766
precision: 1.0
f1 score: 0.9090909090909091
recall: 0.8333333333333334



In [193]:
learn_w_metrics(my_SVM(1), learn_features, learn_targets, test_features, test_targets, "My SVM")

My SVM metrics:
accuracy: 0.3191489361702128
precision: 0.39473684210526316
f1 score: 0.48387096774193544
recall: 0.625



In [194]:
learn_w_metrics(PythonSVM(max_iter=1000), learn_features, learn_targets, test_features, test_targets, "Python SVM")

Python SVM metrics:
accuracy: 0.8723404255319149
precision: 1.0
f1 score: 0.8571428571428571
recall: 0.75



In [195]:
learn_w_metrics(DecisionTree(), learn_features, learn_targets, test_features, test_targets, "My decision tree")

My decision tree metrics:
accuracy: 1.0
precision: 1.0
f1 score: 1.0
recall: 1.0



In [196]:
learn_w_metrics(PythonDecisionTree(), learn_features, learn_targets, test_features, test_targets, "Python decision tree")

Python decision tree metrics:
accuracy: 1.0
precision: 1.0
f1 score: 1.0
recall: 1.0



In [203]:
learn_w_metrics(RandomForest(depth=4), learn_features, learn_targets, test_features, test_targets, "My random forest")

My random forest metrics:
accuracy: 0.7021276595744681
precision: 0.631578947368421
f1 score: 0.7741935483870968
recall: 1.0



In [198]:
learn_w_metrics(PythonRandomForest(), learn_features, learn_targets, test_features, test_targets, "Python random forest")

Python random forest metrics:
accuracy: 1.0
precision: 1.0
f1 score: 1.0
recall: 1.0

