### Library import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Make/Load dataset

In [None]:
from sklearn.datasets import make_classification
# Generate a synthetic classification dataset; the fixed random_state keeps
# the generated samples the same on every run.
X, y = make_classification(
    n_samples=500,
    n_features=10,
    n_informative=4,
    n_clusters_per_class=2,
    random_state=14,
)

In [None]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this import presumably fails on current releases — confirm the
# installed version or switch to another regression dataset.
from sklearn.datasets import load_boston
boston = load_boston()
# Replace X and y with the feature matrix and target of the loaded dataset.
X = boston.data
y = boston.target

In [None]:
from sklearn.datasets import load_iris
# Load the iris dataset; y is the target vector and X keeps only the feature
# columns from index 2 onward.
iris = load_iris()
y = iris.target
X = iris.data[:, 2:]

### Standardize/Normalize Data

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize X with StandardScaler; X is replaced by the transformed array.
# NOTE(review): the scaler is fitted on all of X before the train/test split
# in the next cell, so test rows influence the fitted statistics — confirm
# this is intended.
scaler = StandardScaler()
X = scaler.fit_transform(X)

### Split Data

In [None]:
from sklearn.model_selection import train_test_split
# Split X and y into train and test parts, with test_size set to 0.3.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
)

### Insert intercept

In [None]:
# Prepend a column of ones to X_train so that the first weight of a linear
# model acts as the intercept term.
# NOTE(review): X_test is not given a matching column in this cell — confirm
# that it is extended the same way before it is used with these weights.
intercept = np.ones((X_train.shape[0], 1))
X_train = np.hstack((intercept, X_train))

### Classification report

#### From scratch

In [None]:
class classification_report_fromSratch:
    """Binary classification metrics derived from confusion-matrix counts.

    Labels are compared against the literal values 1 (positive) and
    0 (negative); any other label value is not counted in any cell.

    Parameters
    ----------
    y_actual : array-like
        True labels. Lists, tuples and NumPy arrays are accepted.
    y_predict : array-like
        Predicted labels, with the same number of elements as ``y_actual``.

    Attributes
    ----------
    TP, FN, FP, TN : int
        True-positive, false-negative, false-positive and true-negative
        counts.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """

    def __init__(self, y_actual, y_predict):
        self.y_actual = y_actual
        self.y_predict = y_predict

        # Flatten to 1-D arrays so that plain lists work and a column vector
        # cannot silently broadcast against a row vector.
        actual = np.asarray(y_actual).ravel()
        predicted = np.asarray(y_predict).ravel()
        if actual.shape != predicted.shape:
            raise ValueError(
                "y_actual and y_predict must contain the same number of elements"
            )

        actual_pos = actual == 1
        actual_neg = actual == 0
        predicted_pos = predicted == 1
        predicted_neg = predicted == 0

        self.TP = int(np.sum(actual_pos & predicted_pos))
        self.FN = int(np.sum(actual_pos & predicted_neg))
        self.FP = int(np.sum(actual_neg & predicted_pos))
        self.TN = int(np.sum(actual_neg & predicted_neg))

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is 0."""
        if denominator == 0:
            return 0.0
        return numerator / denominator

    def accuracy(self):
        """Return (TP + TN) / (TP + TN + FP + FN), or 0.0 if nothing was counted."""
        total = self.TP + self.TN + self.FP + self.FN
        return self._ratio(self.TP + self.TN, total)

    def precision(self):
        """Return TP / (TP + FP), or 0.0 when no positive was predicted."""
        return self._ratio(self.TP, self.TP + self.FP)

    def recall(self):
        """Return TP / (TP + FN), or 0.0 when there is no actual positive."""
        return self._ratio(self.TP, self.TP + self.FN)

    def f1(self):
        """Return the harmonic mean of precision and recall (0.0 if both are 0)."""
        precision = self.precision()
        recall = self.recall()
        return self._ratio(2 * precision * recall, precision + recall)

#### sklearn

In [None]:
from sklearn.metrics import classification_report
# classification_report takes the true labels and the predicted labels as its
# first two arguments; `yhat` must hold the model's predictions for X_test.
print(classification_report(y_test, yhat))

#### Confusion matrix

In [None]:
# from Lab03-02-NBM
# Plot the confusion matrix of y_test against yhat as an annotated heatmap.
# NOTE(review): `yhat` and `data` are not defined in the cells shown here —
# presumably they come from the notebook this cell was copied from; confirm.

from sklearn.metrics import confusion_matrix
import seaborn as sns

mat = confusion_matrix(y_test, yhat)

# The matrix is transposed before plotting; the axes are then labelled
# 'true' (x) and 'predicted' (y).
sns.heatmap(mat.T, annot=True, fmt="d",
           xticklabels=data.target_names, yticklabels=data.target_names)
plt.xlabel('true')
plt.ylabel('predicted')

## 1. Linear regression [Regression]

In [None]:
from time import time

class LinearRegression:
    """Linear regression fitted by gradient descent.

    Parameters
    ----------
    method : {"batch", "sto", "mini"}
        "batch" uses every row on each step, "sto" uses one row at a time
        (each row is visited once before any row is reused) and "mini" uses
        a random contiguous slice of ``batch_size`` rows.
    max_iter : int
        Maximum number of gradient steps.
    tol : float
        Training stops once the loss changes by less than ``tol`` between
        two consecutive steps.
    alpha : float
        Learning rate.
    batch_size : int
        Number of rows per step when ``method == "mini"``.

    Attributes
    ----------
    theta : numpy.ndarray
        Fitted weights, one per column of the training matrix.
    iter_stop : int
        Number of steps performed by the last call to :meth:`fit`.
    time_taken : float
        Wall-clock seconds spent in the last call to :meth:`fit`.
    """

    _METHODS = ("batch", "sto", "mini")

    def __init__(self, method="batch", max_iter=10000,
                 tol=0.001, alpha=0.0001, batch_size=10):
        self.method = method
        self.max_iter = max_iter
        self.tol = tol
        self.alpha = alpha
        self.batch_size = batch_size
        self._iter_stop = 0
        self._time_taken = 0.0

    # These are read-only properties backed by private fields: a method and
    # an instance attribute of the same name cannot coexist on one object.
    @property
    def iter_stop(self):
        """Number of gradient steps performed by the last fit."""
        return self._iter_stop

    @property
    def time_taken(self):
        """Seconds spent in the last fit."""
        return self._time_taken

    def fit(self, X_train, y_train):
        """Fit ``theta`` on the given data.

        Parameters
        ----------
        X_train : array-like of shape (m, n)
            Training features. No intercept column is added here.
        y_train : array-like of shape (m,)
            Training targets.

        Raises
        ------
        ValueError
            If the inputs are empty or differ in length, or if ``method`` is
            not one of "batch", "sto" or "mini".
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train, dtype=float)
        if len(X_train) != len(y_train):
            raise ValueError("X_train and y_train must have the same length")
        if len(X_train) == 0:
            raise ValueError("X_train must contain at least one row")
        if self.method not in self._METHODS:
            raise ValueError("method is not correct")

        m = X_train.shape[0]
        batch_size = min(m, max(1, int(self.batch_size)))
        self.theta = np.zeros(X_train.shape[1])
        self._iter_stop = 0
        # Infinity guarantees that the first step can never look converged.
        loss_old = np.inf

        # For "sto": a shuffled visiting order gives sampling without
        # replacement; it is reshuffled once every row has been used.
        order = np.random.permutation(m) if self.method == "sto" else None
        cursor = 0

        start = time()
        for i in range(self.max_iter):
            if self.method == "batch":
                X_batch, y_batch = X_train, y_train
            elif self.method == "sto":
                if cursor == m:
                    order = np.random.permutation(m)
                    cursor = 0
                idx = order[cursor]
                cursor += 1
                # Slicing keeps the batch 2-D / 1-D like the other methods.
                X_batch = X_train[idx:idx + 1]
                y_batch = y_train[idx:idx + 1]
            else:
                first = np.random.randint(0, m - batch_size + 1)
                X_batch = X_train[first:first + batch_size]
                y_batch = y_train[first:first + batch_size]

            yhat = self.hx(X_batch, self.theta)
            grad = self.gradient(X_batch, yhat - y_batch)
            self.theta = self.theta - self.alpha * grad

            # The loss is measured with the weights used before this update.
            loss_new = self.MSE(yhat, y_batch)
            self._iter_stop = i + 1
            if abs(loss_new - loss_old) < self.tol:
                break
            loss_old = loss_new
        self._time_taken = time() - start

    def evalute(self, X_test, y_test):
        """Return the mean squared error of the fitted model on the given data."""
        yhat_test = self.hx(X_test, self.theta)
        return self.MSE(yhat_test, y_test)

    # Correctly spelled alias; the original name is kept for existing callers.
    evaluate = evalute

    def hx(self, X, theta):
        """Return the linear prediction ``X @ theta``."""
        return X @ theta

    def MSE(self, yhat, y):
        """Return the mean of the squared differences between ``yhat`` and ``y``."""
        return (((yhat - y) ** 2).sum()) / yhat.shape[0]

    def gradient(self, X, error):
        """Return ``X.T @ error``, the (unnormalised) squared-error gradient."""
        return X.T @ error

## 2. Logistic regression [Classification]

### Binary

In [None]:
# Class Logistic Regression
class LogisticRegression:
    """Binary logistic regression trained by gradient descent.

    Parameters
    ----------
    method : {"minibatch", "sto", "batch"}
        "batch" uses every row on each step, "sto" uses one row at a time
        (each row is visited once before any row is reused) and "minibatch"
        uses a random contiguous slice of rows.
    l_rate : float
        Learning rate.
    batch_percent : float
        Mini-batch size as a percentage of the number of training rows.
    max_iter : int
        Number of gradient steps.

    Attributes
    ----------
    w : numpy.ndarray
        Fitted weights, one per column of the training matrix.
    loss : list of float
        Negative log-likelihood of the batch used at each step.
    iter : int
        Number of steps performed by the last call to :meth:`fit`.
    """

    _METHODS = ("minibatch", "sto", "batch")
    # Probabilities are clipped to [_EPS, 1 - _EPS] before taking logarithms.
    _EPS = 1e-15

    def __init__(self, method="minibatch", l_rate=0.01,
                 batch_percent=10, max_iter=1000):
        self.method = method
        self.l_rate = l_rate
        self.batch_percent = batch_percent
        self.max_iter = max_iter

    def fit(self, X, y):
        """Fit the weights on ``X`` (m, n) and 0/1 labels ``y`` (m,).

        Only the arguments are used; predictions are not computed here, so
        call :meth:`y_predict` explicitly on the data to be scored.

        Raises
        ------
        ValueError
            If ``method`` is unknown, or if the inputs are empty or differ in
            length.
        """
        if self.method not in self._METHODS:
            raise ValueError("Method is not match")
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        m = X.shape[0]
        if m == 0 or m != y.shape[0]:
            raise ValueError("X and y must be non-empty and have the same length")

        self.w = np.zeros(X.shape[1])
        # At least one row and at most all rows per mini-batch.
        batch_size = min(m, max(1, int(self.batch_percent / 100 * m)))
        self.loss = []

        # For "sto": a shuffled visiting order gives sampling without
        # replacement; it is reshuffled once every row has been used.
        order = np.random.permutation(m) if self.method == "sto" else None
        cursor = 0

        for _ in range(self.max_iter):
            if self.method == "minibatch":
                first = np.random.randint(0, m - batch_size + 1)
                batch_X = X[first:first + batch_size]
                batch_y = y[first:first + batch_size]
            elif self.method == "sto":
                if cursor == m:
                    order = np.random.permutation(m)
                    cursor = 0
                idx = order[cursor]
                cursor += 1
                batch_X = X[idx:idx + 1]
                batch_y = y[idx:idx + 1]
            else:
                batch_X = X
                batch_y = y
            cost, grad = self.gradient(batch_X, batch_y)
            self.loss.append(cost)
            self.w = self.w - self.l_rate * grad
        self.iter = len(self.loss)

    def sigmoid(self, x):
        """Return the logistic function of ``x``.

        The argument is clipped to [-500, 500] so that ``np.exp`` cannot
        overflow; the result is unaffected at double precision.
        """
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def gradient(self, X, y):
        """Return the negative log-likelihood and its gradient for a batch."""
        h = self.h_theta(X, self.w)
        error = h - y
        # Clip so that log(0) cannot produce -inf / nan in the cost.
        h_safe = np.clip(h, self._EPS, 1 - self._EPS)
        # putting negative sign for negative log likelihood
        cost = - np.sum(y * np.log(h_safe) + (1 - y) * np.log(1 - h_safe))
        grad = np.dot(X.T, error)
        return cost, grad

    def h_theta(self, X, w):
        """Return the predicted probability ``sigmoid(X @ w)``."""
        return self.sigmoid(X @ w)

    def plot_loss(self):
        """Plot the loss recorded at every training step."""
        x_axis = list(range(len(self.loss)))
        y_axis = self.loss
        plt.plot(x_axis, y_axis)
        plt.title("Losses - iteration")
        plt.xlabel("Iteration")
        plt.ylabel("Losses")

    def y_predict(self, X):
        """Return the predicted probabilities of ``X`` rounded with ``np.round``."""
        return np.round(self.h_theta(X, self.w))




In [None]:
# Train a LogisticRegression with its default settings on the training data
# and plot the loss recorded at each iteration.
model = LogisticRegression()
model.fit(X_train, y_train)
model.plot_loss()

### Multinomial

In [None]:
# Encoding y: one-hot encode the training labels, so that row i of
# Y_train_encoded has a 1 in the column given by y_train[i].
k = len(set(y))
m, n = X_train.shape
Y_train_encoded = np.zeros((m, k))
for each_class in range(k):
    # Boolean mask over the training rows that belong to this class.
    cond = y_train == each_class
    Y_train_encoded[cond, each_class] = 1

In [None]:
class LogisticRegression:
    """Multinomial (softmax) logistic regression trained by gradient descent.

    Parameters
    ----------
    method : {"minibatch", "batch", "sto"}
        "batch" uses every row on each step, "sto" uses one row at a time
        (each row is visited once before any row is reused) and "minibatch"
        uses a random contiguous slice of rows.
    max_iter : int
        Number of gradient steps.
    l_rate : float
        Learning rate.
    batch_size_ratio : float
        Mini-batch size as a fraction of the number of training rows.

    Attributes
    ----------
    W : numpy.ndarray of shape (n, k)
        Fitted weights.
    losses : list of float
        Mean cross-entropy of the batch used at each step.
    runtime : float
        Wall-clock seconds spent in the last call to :meth:`fit`.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """

    _METHODS = ("minibatch", "batch", "sto")
    # Probabilities are floored at _EPS before taking logarithms.
    _EPS = 1e-15

    def __init__(self, method="minibatch", max_iter=10000, l_rate=0.001, batch_size_ratio=0.1):
        if method not in self._METHODS:
            raise ValueError("Method is not match")
        self.method = method
        self.max_iter = max_iter
        self.l_rate = l_rate
        self.batch_size_ratio = batch_size_ratio

    def fit(self, X, Y):
        """Fit the weights on features ``X`` (m, n) and one-hot labels ``Y`` (m, k).

        Only the arguments are used; no module-level data is read.
        """
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y, dtype=float)
        m, n = X.shape
        k = Y.shape[1]
        self.W = np.random.rand(n, k)
        # At least one row and at most all rows per mini-batch.
        batch_size = min(m, max(1, round(self.batch_size_ratio * m)))
        self.losses = []

        # For "sto": a shuffled visiting order gives sampling without
        # replacement; it is reshuffled once every row has been used.
        order = np.random.permutation(m) if self.method == "sto" else None
        cursor = 0

        start = time()
        for _ in range(self.max_iter):
            if self.method == "minibatch":
                # +1 because randint excludes its upper bound; this also
                # keeps the call valid when the batch covers every row.
                first = np.random.randint(0, m - batch_size + 1)
                X_batch = X[first:first + batch_size]
                Y_batch = Y[first:first + batch_size]
            elif self.method == "batch":
                X_batch = X
                Y_batch = Y
            else:
                if cursor == m:
                    order = np.random.permutation(m)
                    cursor = 0
                idx = order[cursor]
                cursor += 1
                X_batch = X[idx:idx + 1]
                Y_batch = Y[idx:idx + 1]
            cost, grad = self.gradient(X_batch, Y_batch)
            self.losses.append(cost)
            self.W = self.W - self.l_rate * grad
        self.runtime = time() - start

    def gradient(self, X, Y):
        """Return the mean cross-entropy and the weight gradient for a batch."""
        m = X.shape[0]
        h = self.h_theta(X, self.W)
        # Floor the probabilities so that log(0) cannot produce -inf / nan.
        cost = - np.sum(Y * np.log(np.maximum(h, self._EPS))) / m
        error = h - Y
        grad = self.softmax_grad(X, error)
        return cost, grad

    def softmax_grad(self, X, error):
        """Return ``X.T @ error``."""
        return X.T @ error

    def softmax(self, theta_t_x):
        """Return the row-wise softmax of a 2-D score matrix.

        The row maximum is subtracted first; this leaves the result
        unchanged mathematically and prevents ``np.exp`` from overflowing.
        """
        shifted = theta_t_x - np.max(theta_t_x, axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=1, keepdims=True)

    def h_theta(self, X, W):
        """Return the class probabilities ``softmax(X @ W)``."""
        return self.softmax(X @ W)

    def predict(self, X):
        """Return the index of the most probable class for each row of ``X``."""
        return np.argmax(self.h_theta(X, self.W), axis=1)

    def plot_losses(self):
        """Plot the loss recorded at every training step."""
        x_axis = list(range(len(self.losses)))
        y_axis = self.losses
        plt.plot(x_axis, y_axis)
        title = "Losses - iteration " + "("+self.method+")"
        plt.title(title)
        plt.xlabel("Iteration")
        plt.ylabel("Losses")

## 3. Naive Bayesian

### Naive Bayesian - Gaussian

In [None]:
class GaussianNaive:
    """Gaussian naive Bayes classifier.

    Attributes (set by :meth:`fit`)
    -------------------------------
    classes : numpy.ndarray
        Sorted unique labels seen during fitting.
    k : int
        Number of classes.
    mean, std : numpy.ndarray of shape (k, n)
        Per-class mean and standard deviation of every feature.
    prior : numpy.ndarray of shape (k,)
        Fraction of training rows belonging to each class.
    """

    # Lower bound applied to the standard deviation at prediction time so
    # that a constant feature cannot cause a division by zero.
    min_std = 1e-9

    def fit(self, X, y):
        """Estimate per-class feature statistics and class priors.

        Labels may be any values supported by ``np.unique``; they do not
        need to be the integers 0..k-1.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        n = X.shape[1]
        self.classes = np.unique(y)
        self.k = len(self.classes)
        self.mean = np.zeros((self.k, n))
        self.std = np.zeros((self.k, n))
        counts = np.zeros(self.k)
        for row, label in enumerate(self.classes):
            members = X[y == label]
            self.mean[row, :] = members.mean(axis=0)
            self.std[row, :] = members.std(axis=0)
            counts[row] = len(members)
        self.prior = counts / counts.sum()

    def gaussian_pdf(self, X, mean, std):
        """Return the normal density of ``X`` for the given mean and std."""
        left = 1 / (np.sqrt(2 * np.pi) * std)
        e = (X - mean) ** 2 / (2 * (std ** 2))
        right = np.exp(-e)
        return left * right

    def _log_gaussian_pdf(self, X, mean, std):
        """Return the logarithm of the normal density, computed directly."""
        std = np.maximum(std, self.min_std)
        return (-0.5 * np.log(2 * np.pi) - np.log(std)
                - (X - mean) ** 2 / (2 * std ** 2))

    def predict(self, X):
        """Return the most probable class label for each row of ``X``.

        Posteriors are compared in log space: summing log-densities avoids
        the underflow/overflow of multiplying many densities together and
        selects the same class.
        """
        X = np.asarray(X, dtype=float)
        log_posterior = np.zeros((X.shape[0], self.k))
        for row in range(self.k):
            log_likelihood = self._log_gaussian_pdf(
                X, self.mean[row, :], self.std[row, :])
            log_posterior[:, row] = (np.log(self.prior[row])
                                     + log_likelihood.sum(axis=1))
        # Map the winning column back to the original label value.
        return self.classes[np.argmax(log_posterior, axis=1)]


### Naive Bayesian - Multinomial

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

class NBM:
    """Multinomial naive Bayes classifier for count-like features.

    Parameters
    ----------
    vectorizer : {"count", "Tfid"}
        Default text vectorisation used by :meth:`transform`.
    laplace : float
        Additive smoothing applied to every feature count.

    Attributes (set by :meth:`fit`)
    -------------------------------
    classes : numpy.ndarray
        Sorted unique labels seen during fitting.
    likelihoods : numpy.ndarray of shape (k, n)
        Smoothed per-class feature probabilities; each row sums to 1.
    priors : numpy.ndarray of shape (k,)
        Fraction of training rows belonging to each class.
    """

    def __init__(self, vectorizer='count', laplace=1):
        self.laplace = laplace
        self.vectorizer = vectorizer

    def transform(self, X_train, X_test, method=None):
        """Vectorise raw documents into count or TF-IDF features.

        Parameters
        ----------
        X_train, X_test : iterable of str
            Training and test documents. The vocabulary is learned from the
            training documents only.
        method : {"count", "Tfid"}, optional
            Vectorisation to apply. Defaults to the ``vectorizer`` given to
            the constructor.

        Returns
        -------
        tuple
            The transformed training matrix as returned by the vectoriser
            and the transformed test matrix converted with ``toarray()``.

        Raises
        ------
        ValueError
            If ``method`` is neither 'count' nor 'Tfid'.
        """
        if method is None:
            method = self.vectorizer
        if method not in ('count', 'Tfid'):
            raise ValueError("Method is not 'count' or 'Tfid'")
        v = CountVectorizer()
        X_train = v.fit_transform(X_train)
        X_test = v.transform(X_test)
        if method == 'Tfid':
            print("Tfid")
            v = TfidfTransformer()
            X_train = v.fit_transform(X_train)
            X_test = v.transform(X_test)
        return X_train, X_test.toarray()

    def fit(self, X, y):
        """Estimate class priors and smoothed feature likelihoods."""
        y = np.asarray(y)
        n = X.shape[1]
        self.classes = np.unique(y)
        k = len(self.classes)
        self.likelihoods = np.zeros((k, n))
        self.priors = np.zeros(k)
        for idx, label in enumerate(self.classes):
            X_classed = X[y == label]
            self.likelihoods[idx, :] = self.likelihood_fn(X_classed)
            self.priors[idx] = self.prior_fn(y, label)

    def likelihood_fn(self, X_class):
        """Return the smoothed feature probabilities for one class.

        ``np.asarray(...).ravel()`` flattens the column sums to a plain 1-D
        array, whether the sum returns an ndarray or a 1 x n matrix.
        """
        smoothed = np.asarray(X_class.sum(axis=0)).ravel() + self.laplace
        return smoothed / smoothed.sum()

    def prior_fn(self, y, label):
        """Return the fraction of entries of ``y`` equal to ``label``."""
        return len(y[y == label]) / len(y)

    def predict(self, X_test):
        """Return the most probable class label for each row of ``X_test``.

        The winning column index is mapped back through ``classes`` so that
        the original label values (including strings) are returned.
        """
        scores = np.log(self.priors) + X_test @ np.log(self.likelihoods.T)
        return self.classes[np.argmax(scores, axis=1)]

## 4. SVM

In [None]:
import cvxopt

# Kernel
def linear(x, z):
    """Linear kernel: the dot product of ``x`` with the transpose of ``z``."""
    z_transposed = z.T
    return np.dot(x, z_transposed)

def polynomial(x, z, p=5):
    """Polynomial kernel: ``(1 + x . z^T)`` raised to the power ``p``."""
    base = 1 + np.dot(x, z.T)
    return base ** p

def gaussian(x, z, sigma=0.9999):
    """Gaussian (RBF) kernel between ``x`` and every row of ``z``.

    The Euclidean norm of ``x - z`` is taken along axis 1, so the difference
    must be two-dimensional.
    """
    distances = np.linalg.norm(x - z, axis=1)
    bandwidth = 2 * (sigma ** 2)
    return np.exp(-distances ** 2 / bandwidth)

# SVM
class SVM:
    """Kernel soft-margin SVM whose dual problem is solved with cvxopt.

    Parameters
    ----------
    kernel : callable
        Kernel function ``kernel(x, Z)`` returning the similarity of ``x``
        to every row of ``Z``.
    C : float
        Upper bound on the dual variables (the box constraint).

    Notes
    -----
    Predictions are returned with ``np.sign``; the labels passed to
    :meth:`fit` are presumably expected to be -1 / +1 — confirm with callers.
    """

    def __init__(self, kernel=gaussian, C=1):
        self.kernel = kernel
        self.C = C

    def fit(self, X, y):
        """Solve the dual problem for the training data and store ``alphas``."""
        # Float arrays are used throughout so that the matrices handed to
        # cvxopt are double precision regardless of the label dtype.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        self.y = y
        self.X = X
        m = X.shape[0]

        # Calculate Kernel
        self.K = np.zeros((m, m))
        for i in range(m):
            self.K[i, :] = self.kernel(X[i, np.newaxis], self.X)

        # Solve with cvxopt final QP needs to be reformulated
        # to match the input form for cvxopt.solvers.qp
        P = cvxopt.matrix(np.outer(y, y) * self.K)
        q = cvxopt.matrix(-np.ones((m, 1)))
        # The stacked constraints encode -alpha <= 0 and alpha <= C.
        G = cvxopt.matrix(np.vstack((np.eye(m) * -1, np.eye(m))))
        h = cvxopt.matrix(np.hstack((np.zeros(m), np.ones(m) * self.C)))
        A = cvxopt.matrix(y, (1, m), "d")
        b = cvxopt.matrix(np.zeros(1))
        cvxopt.solvers.options["show_progress"] = False
        sol = cvxopt.solvers.qp(P, q, G, h, A, b)
        self.alphas = np.array(sol["x"])

    def predict(self, X):  #<----this is X_test
        """Return ``np.sign`` of the decision function for each row of ``X``."""
        y_predict = np.zeros((X.shape[0]))
        sv = self.get_parameters(self.alphas)

        for i in range(X.shape[0]):
            y_predict[i] = np.sum(
                self.alphas[sv]
                * self.y[sv, np.newaxis]
                * self.kernel(X[i], self.X[sv])[:, np.newaxis]
            )

        return np.sign(y_predict + self.b)

    def get_parameters(self, alphas):
        """Select the support vectors and compute ``w`` and the bias ``b``.

        Returns
        -------
        numpy.ndarray of bool
            Mask over the training rows whose dual variable lies strictly
            between the threshold and ``C``.
        """
        threshold = 1e-5

        sv = ((alphas > threshold) * (alphas < self.C)).flatten()
        # Weight vector in input space; it is not used by predict().
        self.w = np.dot(self.X[sv].T, alphas[sv] * self.y[sv, np.newaxis])

        if not sv.any():
            # No support vector was selected: fall back to a zero bias
            # instead of taking the mean of an empty array (nan).
            self.b = 0.0
            return sv

        # For every support vector s:  b_s = y_s - sum_j alpha_j y_j K(x_j, x_s).
        # np.ix_ selects the full support-vector sub-matrix; K[sv, sv] would
        # return only its diagonal and drop the sum over j.
        coef = (alphas[sv] * self.y[sv, np.newaxis]).ravel()
        K_sv = self.K[np.ix_(sv, sv)]
        self.b = np.mean(self.y[sv] - K_sv @ coef)
        return sv

## 5. K-Nearest Neighbors

In [None]:
class KNN:
    """K-nearest-neighbours classifier using Euclidean distance.

    Labels are counted with ``np.bincount`` and must therefore be
    non-negative integers.
    """

    def predict(self, X_train, X_test, y_train, k=3):
        """Classify every row of ``X_test`` by majority vote of its neighbours.

        When the two most frequent classes among the ``k`` nearest
        neighbours are tied, the neighbourhood is enlarged one neighbour at
        a time until the tie is broken or every training point is used.

        Parameters
        ----------
        X_train : array-like of shape (m, n)
        X_test : array-like of shape (t, n)
        y_train : array-like of shape (m,)
        k : int
            Initial number of neighbours.

        Returns
        -------
        pred : numpy.ndarray of shape (t,)
            Predicted class of each test row.
        prob : numpy.ndarray of shape (t,)
            Share of the votes obtained by the predicted class.
        """
        X_train = np.asarray(X_train)
        X_test = np.asarray(X_test)
        y_train = np.asarray(y_train)
        n_train = X_train.shape[0]
        classes = len(np.unique(y_train))

        # Rank all training points once per query; enlarging the
        # neighbourhood then only needs a longer prefix of the same ranking.
        ranking = np.argsort(self.find_distance(X_train, X_test), axis=1)

        pred = np.zeros(X_test.shape[0])
        prob = np.zeros(X_test.shape[0])
        for ix, order in enumerate(ranking):
            k_inc = min(k, n_train)
            freq = np.bincount(y_train[order[:k_inc]], minlength=classes)
            # Stop at n_train: no further neighbour could break the tie.
            while k_inc < n_train and self._is_tied(freq):
                k_inc += 1
                freq = np.bincount(y_train[order[:k_inc]], minlength=classes)
            # Vote with the final (possibly enlarged) neighbourhood so that
            # the prediction and its probability come from the same counts.
            pred[ix] = freq.argmax()
            prob[ix] = freq[int(pred[ix])] / np.sum(freq)
        return pred, prob

    @staticmethod
    def _is_tied(freq):
        """Return True when the two largest class counts are equal."""
        if len(freq) < 2:
            return False
        ordered = np.sort(freq)
        return ordered[-1] == ordered[-2]

    def find_distance(self, X_train, X_test):
        """Return the (t, m) matrix of Euclidean distances test -> train."""
        #create newaxis simply so that broadcast to all values
        dist = X_test[:, np.newaxis, :] - X_train[np.newaxis, :, :]
        sq_dist = dist ** 2

        #sum across feature dimension, thus axis = 2
        summed_dist = sq_dist.sum(axis=2)
        return np.sqrt(summed_dist)

    def find_neighbors(self, X_train, X_test, k=3):
        """Return the indices of the ``k`` nearest training rows per test row."""
        dist = self.find_distance(X_train, X_test)
        #return the first k neighbors
        return np.argsort(dist)[:, 0:k]

    def get_most_common(self, y):
        """Return the most frequent value in ``y`` (lowest value on ties)."""
        return np.bincount(y).argmax()

    def CV_K(self, X_train_val, y_train_val, K_max=5, cv=3):
        """Cross-validate the number of neighbours from 1 to ``K_max``.

        The data is cut, in its given order, into ``cv`` consecutive folds
        of ``m // cv`` rows; rows left over at the end are never used for
        validation but always remain in the training part.

        Returns
        -------
        acc : list of float
            Mean validation accuracy for each value of k.
        K : list of int
            The evaluated values of k, ``1 .. K_max``.
        """
        # Split train data and validation data
        m = X_train_val.shape[0]
        fold_size = int(m / cv)
        folds = [list(range(i * fold_size, (i + 1) * fold_size))
                 for i in range(cv)]
        # Predict and find accuracy
        acc = []
        K = []
        for n_neighbors in range(1, K_max + 1):
            acc_sum = 0
            for fold in folds:
                X_val = X_train_val[fold]
                y_val = y_train_val[fold]
                X_train = np.delete(X_train_val, fold, axis=0)
                y_train = np.delete(y_train_val, fold, axis=0)
                yhat, _ = self.predict(X_train, X_val, y_train, k=n_neighbors)
                acc_sum += np.sum(yhat == y_val) / len(y_val)
            acc.append(acc_sum / cv)
            K.append(n_neighbors)
        return acc, K

In [None]:
# Cross-validate k from 1 to 10 with 5 folds, report the best k and plot the
# mean validation accuracy against k.
model = KNN()
acc, K = model.CV_K(X_train, y_train, K_max=10, cv=5)
idx = np.argmax(acc)
print("Best K:", K[idx], "Accuracy:",acc[idx])
plt.plot(K, acc)
plt.xlabel("K")
plt.ylabel("Accuracy")
# plt.show must be called; the bare attribute reference was a no-op.
plt.show()

## 6. Decision Trees

## 7. Random Forest

## 8. AdaBoost

## 9. Gradient Boosting

## 10. K-Means

## 11. GMM