# Prepare data

# TODO
1. Add a safeguard for the case where some tree was not trained on all possible classes and, as a result, returns predictions for fewer classes.

In [15]:
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np 
import random
from sklearn import tree
from scipy.stats import bernoulli
import math
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import time
from sklearn.ensemble import RandomForestClassifier
import copy
import sys

In [16]:
def read_binary_pendigits(path_to_data):
    """Load an svmlight-format digits file and reduce it to a "4 vs 9" task.

    Args:
        path_to_data: Path handed to ``load_svmlight_file``.

    Returns:
        A tuple ``(x, y)``. ``x`` is a dense 2-D ``numpy.ndarray`` holding the
        feature rows whose original label is 4 or 9. ``y`` is the matching
        integer label vector: 0 where the original label was 4, 1 where it
        was 9.
    """
    features, target = load_svmlight_file(path_to_data)
    # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
    # documented replacement and yields the same dtype.
    target = target.astype(int)
    # The classification task is to distinguish between 4 and 9.
    condition = np.logical_or(target == 9, target == 4)
    # ``toarray()`` returns a plain ndarray directly (no np.matrix detour).
    x = features.toarray()[condition, :]
    # Every kept label is either 4 or 9, so "== 9" produces the 0/1 encoding.
    y = (target[condition] == 9).astype(int)
    return x, y

In [17]:
def split_data(x, y, random_state, train_size):
    """Split a dataset into labeled-train, unlabeled-train and test parts.

    The data is first split into train/test with ``train_size=0.8``; the
    train part is then split again using ``train_size``. Both splits use the
    same ``random_state``.

    Args:
        x: Feature rows.
        y: Labels aligned with ``x``.
        random_state: Seed forwarded to both ``train_test_split`` calls.
        train_size: Forwarded unchanged as ``train_size`` of the second split
            (size of the labeled part).

    Returns:
        ``(x_labeled, y_labeled, x_unlabeled, x_test, y_test)``. The labels of
        the unlabeled part are dropped.
    """
    outer_split = train_test_split(
        x, y, train_size=0.8, random_state=random_state
    )
    x_train, x_test, y_train, y_test = outer_split

    inner_split = train_test_split(
        x_train, y_train, train_size=train_size, random_state=random_state
    )
    x_labeled, x_unlabeled, y_labeled, _ = inner_split

    return x_labeled, y_labeled, x_unlabeled, x_test, y_test

# Random Forest Subclasses

In [18]:
class RFTree:
    """Thin wrapper around one randomized ``DecisionTreeClassifier``."""

    def train(self, x, y, random_state):
        """Fit a brand-new tree on ``(x, y)``, replacing any earlier one.

        The tree uses random split thresholds and considers ``sqrt`` of the
        features at each split; ``random_state`` seeds the tree.
        """
        self.clf = tree.DecisionTreeClassifier(
            splitter="random",
            max_features="sqrt",
            random_state=random_state,
        )
        self.clf.fit(x, y)

    def predict_proba(self, x):
        """Return the fitted tree's class probabilities for the rows of ``x``."""
        probabilities = self.clf.predict_proba(x)
        return probabilities

    def predict(self, x):
        """Return the fitted tree's predicted labels for the rows of ``x``."""
        labels = self.clf.predict(x)
        return labels

    def count_oobe(self, x, y):
        """Return the error rate (``1 - accuracy``) of the tree on ``(x, y)``."""
        accuracy = accuracy_score(y, self.clf.predict(x))
        return 1 - accuracy
    

In [19]:
class RandomForest:
    """Bagged ensemble of ``RFTree`` members with an out-of-bag error estimate.

    ``predict`` sums the members' predicted labels and thresholds the sum at
    half the number of trees, so it assumes labels are encoded as 0/1.
    """

    def __init__(self, N = 100):
        """Create an untrained forest that will hold ``N`` trees."""
        self.N = N
        self.trees = []
        self.oobe = 0

    def _prepare_train_data(self, x, y):
        """Draw a bootstrap sample that contains more than one distinct label.

        Sampling is done with ``random.choices`` (with replacement, same size
        as ``y``) and retried up to 10 times.

        Returns:
            ``(x_i, y_i, choices)``: sampled rows, sampled labels and the
            sampled indices into ``x`` / ``y``.

        Raises:
            SystemExit: if all 10 attempts produced a single-label sample.
        """
        n_samples = len(y)
        steps = 10
        for _ in range(steps):
            choices = random.choices(range(n_samples), k=n_samples)  # indices of samples
            y_i = [y[i] for i in choices]
            first_label = y_i[0]
            if any(label != first_label for label in y_i):
                break
        else:
            # Reached only when *every* attempt failed. The previous counter
            # check also fired when the last attempt had succeeded.
            sys.exit("We couldn't generate good data for RF")
        x_i = [x[i] for i in choices]
        return x_i, y_i, choices

    def _prepare_oob_data(self, x, y, indices):
        """Return the rows and labels of ``x`` / ``y`` whose index is not in ``indices``."""
        in_bag = set(indices)  # O(1) membership instead of scanning a list
        oob = [i for i in range(len(x)) if i not in in_bag]
        return [x[i] for i in oob], [y[i] for i in oob]

    def train(self, x, y, random_state):
        """Train ``N`` trees on bootstrap samples and record the mean OOB error.

        Args:
            x: Feature rows.
            y: Labels aligned with ``x``.
            random_state: Seeds the ``random`` module (bootstrap sampling) and
                is passed to every tree.
        """
        random.seed(random_state)
        self.trees = []
        self.oobe = 0
        oobe_sum = 0
        scored_trees = 0
        for _ in range(self.N):
            member = RFTree()
            x_i, y_i, idx = self._prepare_train_data(x, y)
            member.train(x_i, y_i, random_state)
            self.trees.append(member)
            x_oob, y_oob = self._prepare_oob_data(x, y, idx)
            if len(x_oob) == 0:
                # The bootstrap happened to cover every row; there is nothing
                # to score this tree on, so leave it out of the average.
                continue
            oobe_sum += member.count_oobe(x_oob, y_oob)
            scored_trees += 1

        if scored_trees:
            self.oobe = oobe_sum / scored_trees

    def get_oobe(self):
        """Return the mean out-of-bag error recorded by the last ``train``."""
        return self.oobe

    def predict_proba(self, x):
        """Return the trees' averaged class probabilities as a list of rows."""
        total = np.asarray(self.trees[0].predict_proba(x))
        for member in self.trees[1:]:
            total = np.add(member.predict_proba(x), total)
        return (total / len(self.trees)).tolist()

    def predict_old(self, x):
        """Return, per row, the index of the largest averaged probability."""
        return [row.index(max(row)) for row in self.predict_proba(x)]

    def predict(self, x):
        """Return majority-vote labels (1 if more than half the trees say 1, else 0)."""
        votes = np.asarray(self.trees[0].predict(x))
        for member in self.trees[1:]:
            votes = np.add(member.predict(x), votes)
        threshold = len(self.trees) / 2
        # A tie (exactly half of the votes) resolves to class 0.
        return [1 if vote > threshold else 0 for vote in votes]

In [31]:
class SemiSupervisedRandomForest(RandomForest):
    """Random forest that is iteratively retrained with pseudo-labeled data.

    ``train`` first fits the plain forest on the labeled data. It then
    repeatedly lowers a temperature, reweights the forest's probabilities on
    the unlabeled rows, samples 0/1 pseudo-labels from them and retrains every
    tree on labeled + pseudo-labeled data. The retrained forest is kept only
    if its out-of-bag error on the labeled data is not worse than the plain
    forest's.
    """

    def __init__(self, N=100):
        """Create an untrained forest with ``N`` trees (default as in the base class)."""
        super().__init__(N)

    def __change_distribution(self, probs, alpha, T):
        """Reweight each probability row with ``exp(-(alpha*log2(p) + T)/T)`` and renormalize.

        Zero probabilities get weight 0.

        Returns:
            ``(newprobs, overflowed)``. On overflow (an infinite weight or an
            infinite row sum) the result is ``([], True)``.
        """
        # NOTE(review): log2(p) <= 0 for p in (0, 1], so as written the weight
        # grows as p shrinks, while p == 0 is mapped to weight 0. Confirm the
        # intended sign of the loss term before relying on this.
        newprobs = []
        for row in probs:
            weights = [
                np.exp(-(alpha*np.log2(p) + T)/T) if p > 0 else 0
                for p in row
            ]
            Z = 0
            for weight in weights:
                Z += weight

            if np.isinf(Z) or np.any(np.isinf(weights)):
                return [], True
            newprobs.append([weight/Z for weight in weights])
        return newprobs, False

    def __retrain_random_forest(self, x_l, y_l, x_u, p, random_state):
        """Retrain every tree in place on labeled + freshly pseudo-labeled data.

        For each tree, pseudo-labels for ``x_u`` are drawn from
        ``Bernoulli(p)``. A bootstrap of the combined data is redrawn (up to
        10 times) until at least one labeled row is out-of-bag; the tree is
        then retrained and scored on those labeled out-of-bag rows.

        Returns:
            The out-of-bag error summed over trees and divided by ``N``.

        Raises:
            SystemExit: if no usable bootstrap was found for some tree.
        """
        np.random.seed(random_state)
        # The feature matrix does not change between trees; build it once.
        x_train = np.concatenate((x_l, x_u))
        oobe = 0
        for i in range(0, self.N):
            y_hat = np.random.binomial(1, p)
            y_train = np.concatenate((y_l, y_hat))
            rfTree = self.trees[i]
            for _ in range(10):
                x_i, y_i, idx = self._prepare_train_data(x_train, y_train)
                # Labeled rows come first in x_train, so bootstrap indices
                # address x_l / y_l directly.
                x_oob, y_oob = self._prepare_oob_data(x_l, y_l, idx)
                if len(x_oob) > 0:
                    rfTree.train(x_i, y_i, random_state)
                    oobe += rfTree.count_oobe(x_oob, y_oob)
                    break
            else:
                sys.exit("WARNING: We couldnt find good data fot tree#"+str(i))
        return oobe / self.N

    def train(self, x_l, y_l, x_u, random_state, T0, alpha, c0):
        """Train on labeled data, then try to improve using unlabeled data.

        Args:
            x_l: Labeled feature rows.
            y_l: Labels for ``x_l``.
            x_u: Unlabeled feature rows.
            random_state: Seed used for bootstrap sampling, pseudo-label
                sampling and every tree.
            T0: Initial temperature.
            alpha: Weight of the log-probability term in the reweighting.
            c0: Cooling factor; the temperature is divided by it every step.
        """
        steps = 20
        super().train(x_l, y_l, random_state)  # train RF with labeled data
        temperature = T0
        oobe = None  # stays None until at least one retraining pass ran
        for _ in range(steps):
            temperature = temperature / c0
            target = self.predict_proba(x_u)
            newtarget, is_overflow = self.__change_distribution(target, alpha, temperature)
            if is_overflow:
                break
            p = [row[1] for row in newtarget]
            oobe = self.__retrain_random_forest(x_l, y_l, x_u, p, random_state)

        if oobe is None:
            # Overflow on the very first step: nothing was retrained, so the
            # forest is still the supervised one. Previously this was
            # reported as "accepted" and self.oobe was overwritten with 0.
            return

        if oobe > self.oobe:
            print("[D]Semi-supervised approach was discarded with oobe: "+str(oobe)+", oobe for pure RF: "+str(self.oobe))
            # Rebuild the supervised forest; the parent's train reseeds itself.
            super().train(x_l, y_l, random_state)
        else:
            print("[A]Semi-supervised approach was accepted with oobe: "+str(oobe)+", oobe for pure RF: "+str(self.oobe))
            self.oobe = oobe

    def predict(self, x):
        """Return majority-vote labels; delegates to the base class."""
        return super().predict(x)

In [32]:
def prepare_datasets():
    """Load the datasets used by this experiment cell.

    Only the banknote-authentication dataset is enabled here; the loaders
    for the other datasets are kept commented out.

    Returns:
        dict mapping dataset name to ``[x, y]``, where ``x`` is every CSV
        column but the last and ``y`` is the last column cast to ``int``.
    """
    data = dict()
    #x, y = read_binary_pendigits("datasets/pendigits")
    #data["pendigits"] = ([x, y])

    path = os.path.join('datasets', '01_banknote_authentication.txt')
    df = pd.read_csv(path, sep=',', header=None)
    x = df.values[:, :-1]
    y = df.values[:, -1].astype(int)
    data["banknote_authentication"] = ([x, y])

    # The magic-telescope dataset is disabled. Its CSV used to be read and
    # then thrown away; the read is now disabled together with the rest.
    #le = LabelEncoder()
    #path = os.path.join('datasets', '08_magic_gamma_telescope.data')
    #df = pd.read_csv(path, header=None)
    #x = df.values[:, :-1]
    #y = le.fit_transform(df.values[:, -1])
    #data["magic_telescope"] = ([x, y])

    #x = np.load('datasets/MNIST/one_vs_one_data.npy')
    #x = x.reshape(x.shape[0], -1)
    #y = np.load('datasets/MNIST/one_vs_one_targets.npy')
    #data["mnist"] = ([x,y])
    return data

#main
# Evaluate SemiSupervisedRandomForest: for every dataset, labeled-set size and
# parameter triple, average the test accuracy over several seeded splits.
partitions = [10]  # passed to split_data as train_size of the labeled split
data = prepare_datasets()

params = [[0.2, 0.1, 1.1]] # T0, alpha, c0
seeds = range(0, 100, 10)  # one train/test split per seed

start_time = time.time()
best_params = []
best_acc = 0
for k,v in data.items():
    print("\nFor dataset '"+k+"':\n")
    for p in partitions:
        for param in params:
            print("Params: ", param)
            scores = []
            for random_state in seeds:
                x_l_train, y_l_train, x_u_train, x_test, y_test = split_data(v[0], v[1], random_state, p)
                rf = SemiSupervisedRandomForest()
                rf.train(x_l_train, y_l_train, x_u_train, random_state, param[0], param[1], param[2])
                y_predicted = rf.predict(x_test)
                scores.append(accuracy_score(y_test, y_predicted))
            # Divide by the actual number of runs so the mean stays correct
            # if the seed list changes.
            acc = sum(scores) / len(seeds)
            print("Accuracy(with labeled_size = " + str(p) + "): " + str(acc)+"\n")

print("--- %s seconds ---" % (time.time() - start_time))


For dataset 'banknote_authentication':

Params:  [0.2, 0.1, 1.1]
[D]Semi-supervised approach was discarded with oobe: 0.2878095238095237, oobe for pure RF: 0.18666666666666668
[A]Semi-supervised approach was accepted with oobe: 0.16, oobe for pure RF: 0.3441666666666666
[D]Semi-supervised approach was discarded with oobe: 0.3449285714285714, oobe for pure RF: 0.25983333333333325
[D]Semi-supervised approach was discarded with oobe: 0.17932142857142852, oobe for pure RF: 0.12200000000000001
[A]Semi-supervised approach was accepted with oobe: 0.16521428571428573, oobe for pure RF: 0.4820000000000001
[D]Semi-supervised approach was discarded with oobe: 0.20238095238095238, oobe for pure RF: 0.20233333333333328
[D]Semi-supervised approach was discarded with oobe: 0.2849285714285713, oobe for pure RF: 0.12200000000000001
[D]Semi-supervised approach was discarded with oobe: 0.461809523809524, oobe for pure RF: 0.19349999999999995
[D]Semi-supervised approach was discarded with oobe: 0.3866111

In [None]:
def prepare_datasets():
    """Load the three benchmark datasets for the plain-forest experiment.

    Returns:
        dict mapping ``"pendigits"``, ``"banknote_authentication"`` and
        ``"magic_telescope"`` to ``[x, y]``. For the magic-telescope data
        only the first 2000 rows are used and the last column is
        label-encoded; the encoded labels are also printed.
    """
    datasets = {}

    pendigits_x, pendigits_y = read_binary_pendigits("datasets/pendigits")
    datasets["pendigits"] = [pendigits_x, pendigits_y]

    banknote_file = os.path.join('datasets', '01_banknote_authentication.txt')
    banknote = pd.read_csv(banknote_file, sep=',', header=None).values
    # Last column is the class label, everything before it is a feature.
    datasets["banknote_authentication"] = [
        banknote[:, :-1],
        banknote[:, -1].astype(int),
    ]

    encoder = LabelEncoder()
    magic_file = os.path.join('datasets', '08_magic_gamma_telescope.data')
    magic = pd.read_csv(magic_file, header=None).values[0:2000]
    magic_x = magic[:, :-1]
    magic_y = encoder.fit_transform(magic[:, -1])
    print(magic_y)
    datasets["magic_telescope"] = [magic_x, magic_y]

    return datasets

#main
# Evaluate the hand-written RandomForest: for every dataset and labeled-set
# size, average the test accuracy over several seeded splits.
partitions = [10]  # passed to split_data as train_size of the labeled split
data = prepare_datasets()
seeds = range(0, 100, 10)  # one train/test split per seed

start_time = time.time()
for k,v in data.items():
    print("\nFor dataset '"+k+"':\n")
    for p in partitions:
        scores = []
        for random_state in seeds:
            x_l_train, y_l_train, x_u_train, x_test, y_test = split_data(v[0], v[1], random_state, p)
            rf = RandomForest()
            rf.train(x_l_train, y_l_train, random_state)
            y_predicted = rf.predict(x_test)
            scores.append(accuracy_score(y_test, y_predicted))
        # Divide by the actual number of runs so the mean stays correct if
        # the seed list changes.
        acc = sum(scores) / len(seeds)
        print("Accuracy(with labeled_size = " + str(p) + "): " + str(acc))
print("--- %s seconds ---" % (time.time() - start_time))


In [120]:
def prepare_datasets():
    """Load the three benchmark datasets for the scikit-learn baseline.

    Returns:
        dict mapping ``"pendigits"``, ``"banknote_authentication"`` and
        ``"magic_telescope"`` to ``[x, y]``. At most the first 2000 banknote
        rows are used; the magic-telescope labels are label-encoded. The
        number of pendigits samples is printed.
    """
    datasets = {}

    pendigits_x, pendigits_y = read_binary_pendigits("datasets/pendigits")
    datasets["pendigits"] = [pendigits_x, pendigits_y]
    print(len(pendigits_y))

    banknote_file = os.path.join('datasets', '01_banknote_authentication.txt')
    banknote = pd.read_csv(banknote_file, sep=',', header=None).values[:2000]
    # Last column is the class label, everything before it is a feature.
    datasets["banknote_authentication"] = [
        banknote[:, :-1],
        banknote[:, -1].astype(int),
    ]

    encoder = LabelEncoder()
    magic_file = os.path.join('datasets', '08_magic_gamma_telescope.data')
    magic = pd.read_csv(magic_file, header=None).values
    magic_x = magic[:, :-1]
    magic_y = encoder.fit_transform(magic[:, -1])
    datasets["magic_telescope"] = [magic_x, magic_y]

    return datasets

#main
# Baseline: scikit-learn's RandomForestClassifier trained on the labeled part
# only, averaged over several seeded splits per dataset and labeled-set size.
partitions = [10, 30, 50]  # passed to split_data as train_size of the labeled split
data = prepare_datasets()
seeds = range(0, 100, 10)  # one train/test split per seed

for k,v in data.items():
    print("\nFor dataset '"+k+"':\n")
    for p in partitions:
        scores = []
        for random_state in seeds:
            x_l_train, y_l_train, x_u_train, x_test, y_test = split_data(v[0], v[1], random_state, p)
            rf = RandomForestClassifier(n_estimators=100, random_state=random_state).fit(x_l_train, y_l_train)
            y_predicted = rf.predict(x_test)
            scores.append(accuracy_score(y_test, y_predicted))
        # Divide by the actual number of runs so the mean stays correct if
        # the seed list changes.
        acc = sum(scores) / len(seeds)
        print("Accuracy(with labeled_size = " + str(p) + "): " + str(acc))

2199

For dataset 'pendigits':

Accuracy(with labeled_size = 10): 0.8834090909090909
Accuracy(with labeled_size = 30): 0.9690909090909091
Accuracy(with labeled_size = 50): 0.9752272727272727

For dataset 'banknote_authentication':

Accuracy(with labeled_size = 10): 0.750909090909091
Accuracy(with labeled_size = 30): 0.8789090909090909
Accuracy(with labeled_size = 50): 0.9021818181818183

For dataset 'magic_telescope':

Accuracy(with labeled_size = 10): 0.651472134595163
Accuracy(with labeled_size = 30): 0.7172712933753944
Accuracy(with labeled_size = 50): 0.7570977917981072
