In [2]:
import numpy as np
import copy

In [7]:
# Scratch check of np.append with a scalar: it returns a NEW 1-D array with
# `c` placed after the elements of `e` (see the displayed result). The return
# value is not assigned, so `e` itself keeps its three elements.
# `d` is defined here but not used in this cell.
d = np.array([[0.2,0.3,0.4],
              [0.1,0.1,0.1]])

e  = np.array([1,2,4])
c = -3
np.append(e, c)

array([ 1,  2,  4, -3])

In [9]:
# Build one template object, then an ensemble of 10 independent deep copies
# of it (copy.deepcopy gives every member its own state).
# NOTE(review): `SomeClass` is not defined in any visible cell, so this cell
# fails on a fresh kernel unless it is defined/imported first.
T_class = SomeClass(n=10)

T_ensemble = [copy.deepcopy(T_class) for i in range(10)]

In [37]:
def _generate_sample_indices(seed, n_estimators, n_population, n_samples, bootstrap=True):
    """
    Generate the Bootstrapped samples indices

    Parameters
    ----------
    seed : int
        The random seed

    n_estimators : int
        The number of bootstrapped samples were to generate

    n_population : int
        The number of maximum samples available

    n_samples : int
        The number of samples to generate in each bootstrapped samples

    bootstrap : bool, default=True
        The bootstrap condition
        If `True`, you do the sampling WITH REPLACEMENT
        Else, you do the sampling WITHOUT REPLACEMENT

    Returns
    -------
    sample_indices : {array-like} of shape (n_estimators, n_samples)
        The bootstrapped sample indices
    """
    # Get the seed
    np.random.seed(seed)

    # Get the bagging indices
    sample_indices = np.random.choice(n_population,
                                      size = (n_estimators, n_samples),
                                      replace = bootstrap)
    
    return sample_indices

def _generate_feature_indices(seed, n_estimators, n_population, n_features, bootstrap=False):
    """
    Generate the Bootstrapped samples indices

    Parameters
    ----------
    seed : int
        The random seed

    n_estimators : int
        The number of bootstrapped samples were to generate

    n_samples : int
        The number of samples to generate in each bootstrapped samples

    bootstrap : bool, default=False
        The bootstrap condition
        If `True`, you do the sampling WITH REPLACEMENT
        Else, you do the sampling WITHOUT REPLACEMENT

    Returns
    -------
    feature_indices : {array-like} of shape (n_estimators, n_feature)
        The bootstrapped sample indices
    """
    # Get the seed
    np.random.seed(seed)

    # Get the bagging indices
    feature_indices = np.empty((n_estimators, n_features), dtype="int")
    for i in range(n_estimators):
        feature_indices[i] = np.random.choice(n_population, 
                                              n_features, 
                                              replace=bootstrap)
        feature_indices[i].sort()

    return feature_indices

In [39]:
# Example call: 3 estimators, each getting 3 of the 5 available feature
# indices, drawn without replacement (each displayed row is sorted).
_generate_feature_indices(seed=666, n_estimators=3, n_population=5, n_features=3, bootstrap=False)

array([[0, 1, 3],
       [0, 3, 4],
       [0, 1, 4]])

In [47]:
# Drawing all 4 values of range(4) without replacement yields a permutation.
# No seed is set in this cell, so the displayed order depends on the current
# global random state.
np.random.choice(4, size = 4, replace=False)

array([3, 2, 1, 0])

In [56]:
def Log_Loss(y : np.array) -> float:
    """Calculate impurity of a node as one minus the majority-class proportion

    NOTE(review): despite the name, no logarithm is involved. The value
    computed here is the misclassification rate obtained when every sample
    in the node is assigned to the majority class.

    Args:
        y (np.array) (n,):
            label data in a node

    Returns:
        float: the impurity of the node

    Raises:
        ValueError: if `y` is empty (there is no majority class)
    """
    # Count how often each distinct label occurs
    _, counts = np.unique(y, return_counts=True)

    # Proportion of the most frequent label in the node
    p_majority = counts.max() / len(y)

    # Everything that is not the majority class counts as impurity
    return 1 - p_majority

In [64]:
def Entropy(y : np.array) -> float:
    """Calculate impurity of a node using Entropy

    The entropy is -sum(p_k * log(p_k)) over the classes present in the
    node, using the natural logarithm.

    Args:
        y (np.array) (n,):
            label data in a node

    Returns:
        float: the impurity of the node
    """
    # Count how often each distinct label occurs
    _, counts = np.unique(y, return_counts=True)

    # Proportion of every class in the node; only classes that occur are
    # listed, so every proportion is > 0 and log() is always defined
    p_class = counts / len(y)

    # Calculate the node impurity
    return -np.sum(p_class * np.log(p_class))

In [69]:
# REGRESSION IMPURITY
def MSE(y : np.array) -> float:
    """Calculate impurity of a node using MSE

    The impurity is the mean squared deviation of the targets from the
    node mean.

    Args:
        y (np.array) (n,):
            target data in a node

    Returns:
        float: the impurity of the node
    """
    values = np.asarray(y)

    # Calculate the mean of the node
    node_mean = np.mean(values)

    # Calculate the node-impurity (variance), vectorized over all targets
    return np.mean((values - node_mean) ** 2)

In [71]:
def MAE(y : np.array) -> float:
    """Calculate impurity of a node using MAE

    The impurity is the mean absolute deviation of the targets from the
    node median.

    Args:
        y (np.array) (n,):
            target data in a node

    Returns:
        float: the impurity of the node
    """
    values = np.asarray(y)

    # Calculate the node median
    node_median = np.median(values)

    # Calculate the node-impurity (mean absolute deviation), vectorized
    return np.mean(np.abs(values - node_median))

In [72]:
# Compare every impurity measure on one node: 7 samples of class 1,
# 3 of class 0 and 5 of class 2.
# NOTE(review): `Gini` is not defined in any visible cell; it must exist in
# the kernel for this cell to run.
y = np.array([1,1,1,1,1,1,1,0,0,0,2,2,2,2,2])
print(Gini(y))
print(Log_Loss(y))
print(Entropy(y))
print(MSE(y))
print(MAE(y))

0.6311111111111112
0.5333333333333333
1.0437570363314084
0.5155555555555555
0.5333333333333333


In [3]:
def _split_data(data, feature, threshold):
    """
    Split data based on given feature and threshold
    
    Parameters
    ----------
    data : {array-like}, shape of (n_samples, n_features+1)
        sample data X, y

    feature: str
        feature to split

    threshold: float
        threshold to split the data
        if data[feature] > threshold
            return data_right
        else:
            return data_left

    Returns
    -------
    data_left: {array-like}, shape of (n_samples_1, n_features+1)
        X, y data that its X[feature] <= threshold

    data_right: {array-like}, shape of (n_samples_2, n_features+1)
        X, y data that its X[feature] > threshold
    """
    cond_left = data[:, feature] <= threshold
    data_left = data[cond_left]
    data_right = data[~cond_left]

    return data_left, data_right

In [13]:
# Manual walk-through of the splitting logic on a toy array: rows whose
# column 1 is <= 2 go left, all other rows go right.
data = np.array([[1,2,3,4],
                 [1,2,2,2],
                 [3,3,3,3],
                 [4,4,4,4],
                 [1,2,4,3]])
cond_left = data[:,1] <= 2
data_left = data[cond_left]
# `~` inverts the boolean mask, selecting the complement of the left rows
data_right = data[~cond_left]
data_right

array([[3, 3, 3, 3],
       [4, 4, 4, 4]])

In [14]:
def _generate_possible_split(data):
    """
    Generate possible split threshold
    """
    # Copy data
    data = data.copy()

    # Extract the unique value
    unique_val = np.unique(data)

    # Extract shape of unique_val
    m = len(unique_val)

    # Sort data
    unique_val.sort()

    # Initialize threshold
    threshold = np.zeros(m-1)

    # Create the possible split
    for i in range(m-1):
        val_1 = unique_val[i]
        val_2 = unique_val[i+1]

        threshold[i] = 0.5*(val_1 + val_2)

    return threshold

In [15]:
# The whole 2-D `data` array is passed here, so the thresholds are midpoints
# between the unique values of ALL columns together (1, 2, 3, 4).
_generate_possible_split(data)

array([1.5, 2.5, 3.5])

In [2]:
import numpy as np
# Toy regression problem with three samples. The two columns of X are
# identical, so both features carry the same information.
y = np.array([10, 3, 8])
X = np.array([[3,3],
              [1,1],
              [2,2]])

# Fit with learning rate 1e-2 for 10000 iterations.
# NOTE(review): `gradient_descent` is defined in a cell that appears further
# down (In [1]); that cell has to be run first on a fresh kernel.
w, b = gradient_descent(X, y, 1e-2, 10000, fit_intercept=True)

Cost at iteration 0 is : 23.304953703703703
Cost at iteration 1000 is : 0.252519828805698
Cost at iteration 2000 is : 0.2501842947593061
Cost at iteration 3000 is : 0.25001347891500847
Cost at iteration 4000 is : 0.25000098581831875
Cost at iteration 5000 is : 0.2500000721005926
Cost at iteration 6000 is : 0.2500000052732794
Cost at iteration 7000 is : 0.2500000003856764
Cost at iteration 8000 is : 0.2500000000282077
Cost at iteration 9000 is : 0.25000000000206307


In [3]:
# Predictions of the fitted linear model for the training rows
X @ w + b

array([10.49999956,  3.50000082,  7.00000019])

In [1]:
import numpy as np
def gradient_descent(X, y, alpha, num_iters, fit_intercept=True):
    """Fit a linear model ``X @ w + b`` with batch gradient descent

    The cost being minimized is the halved mean squared error
    ``(1 / (2 * n)) * sum((X @ w + b - y) ** 2)``. The cost is printed
    about ten times during the run.

    Args:
        X (np.array) (n, p):
            feature data
        y (np.array) (n,):
            target data
        alpha (float):
            learning rate (step size of every update)
        num_iters (int):
            number of gradient steps to take
        fit_intercept (bool):
            if truthy, the intercept `b` is learned as well;
            otherwise it stays at zero

    Returns:
        tuple: `(w, b)` where `w` has shape (p,) and `b` has shape (1,)
    """
    n_samples = len(X)

    # initialize gain and intercept
    w = np.zeros(X.shape[1])
    b = np.zeros(1)

    # Report the cost roughly ten times. An integer interval (ceiling of
    # num_iters / 10, at least 1) avoids the float modulo comparison, which
    # misfires when num_iters is not a multiple of 10.
    log_every = max(1, -(-num_iters // 10))

    for i in range(num_iters):

        # Residuals of the current model; both updates below use these
        # same residuals, i.e. w and b are updated simultaneously
        err = X @ w + b - y

        # Gradient w.r.t. w: mean over samples of (feature * residual)
        dj_dw = (X.T @ err) / n_samples
        w = w - alpha * dj_dw

        if fit_intercept:
            # Gradient w.r.t. b: mean residual
            dj_db = np.sum(err) / n_samples
            b = b - alpha * dj_db

        if i % log_every == 0:
            # print cost function (evaluated after this iteration's update)
            cost = (1/(2*n_samples))*np.sum((X@w + b - y)**2)
            print(f"Cost at iteration {i} is : {cost}")

    return w, b

In [6]:
# Scratch check: np.append also accepts an array as second argument and
# returns the concatenation as a new array.
# NOTE: this rebinds the notebook-level name `b`, which other cells use for
# the fitted intercept.
a = np.array([1,2,3,4,5])
b = np.array([1])

np.append(a,b)

array([1, 2, 3, 4, 5, 1])

In [64]:
# NOTE(review): this call passes 8 positional arguments and unpacks 3 return
# values, which does not match the `gradient_descent(X, y, alpha, num_iters,
# fit_intercept)` definition visible in this notebook. It presumably targets
# an older version of the function; `compute_cost` and `compute_derivative`
# are not defined in any visible cell either.
w, b, _ = gradient_descent(X, y, w, b, compute_cost, compute_derivative, 1e-1, 10000)

Iteration    0: Cost     0.25   
Iteration 1000: Cost     0.25   
Iteration 2000: Cost     0.25   
Iteration 3000: Cost     0.25   
Iteration 4000: Cost     0.25   
Iteration 5000: Cost     0.25   
Iteration 6000: Cost     0.25   
Iteration 7000: Cost     0.25   
Iteration 8000: Cost     0.25   
Iteration 9000: Cost     0.25   


In [10]:
# Euclidean distance between two points: the default norm of a 1-D
# difference vector is the 2-norm, here sqrt(1 + 36 + 4) = sqrt(41).
A = np.array([4,7,2])
B = np.array([3,1,4])

np.linalg.norm((A-B))

6.4031242374328485

array([[3, 3],
       [1, 1],
       [2, 2]])

In [38]:
# Reshape the residuals to a column so that broadcasting multiplies every
# row of X by that row's residual.
# NOTE(review): `err` is not defined in any visible cell.
X*err.reshape(-1,1)

array([[12, 12],
       [ 5,  5],
       [ 6,  6]])

In [40]:
# Average the row-scaled residuals over the samples (axis=0): one value per
# feature, the same expression gradient_descent uses for dj_dw.
np.sum(X*err.reshape(-1,1), axis=0)/len(X)

array([7.66666667, 7.66666667])

In [42]:
# A single update of w with the gradient from the previous cell; no learning
# rate is applied here, i.e. the step size is 1.
w - np.sum(X*err.reshape(-1,1), axis=0)/len(X)

array([-6.66666667, -5.66666667])

In [40]:
# The displayed result is still a flat array: w is 1-D, so .T changes nothing
w.T

array([1, 2])

In [41]:
# Transpose of X: the (3, 2) sample-by-feature matrix becomes (2, 3)
X.T

array([[3, 1, 2],
       [3, 1, 2]])

In [43]:
# With a 1-D w this gives the same values as X @ w: one prediction per sample
w.T@X.T

array([9, 3, 6])

In [52]:
# Print the class proportions (counts / total count) of every label set in
# `y_target`.
# NOTE: the loop variable rebinds the notebook-level name `y`; after this
# cell `y` holds the last element of `y_target`.
# NOTE(review): `y_target` is not defined in any visible cell.
for y in y_target:
    print(np.unique(y, return_counts=True)[1]/np.sum(np.unique(y, return_counts=True)[1]))

[0.66666667 0.33333333]
[1.]


In [21]:
import numpy as np

# This X is singular: its third row equals 3 * (second row) - 2 * (first row),
# so no true inverse exists. np.linalg.inv did not raise in the recorded run;
# it returned entries of magnitude ~1e16, and the product X @ X_inv shown
# below is therefore NOT the identity matrix.
# NOTE: this rebinds the notebook-level name `X`.
X = np.array([[1,2,3],[2,3,4],[4,5,6]])
X_inv = np.linalg.inv(X)

print("=====================")
print(X_inv)

print("=====================")
X @ X_inv

[[-1.20095990e+16  1.80143985e+16 -6.00479950e+15]
 [ 2.40191980e+16 -3.60287970e+16  1.20095990e+16]
 [-1.20095990e+16  1.80143985e+16 -6.00479950e+15]]


array([[ 2.,  0., -2.],
       [ 8.,  0.,  0.],
       [ 4.,  0.,  4.]])

In [28]:
# Import required package
import numpy as np
 
# Taking a 3 * 3 matrix
# NOTE: A is singular (row 3 = 3 * row 2 - 2 * row 1), so it has no inverse
A = np.array([[1, 2, 3],
              [2, 3, 4],
              [4, 5, 6]])
 
# Intended to print the identity matrix; because A is singular, the recorded
# output below is not the identity
print(A @ np.linalg.inv(A))

[[ 2.  0. -2.]
 [ 8.  0.  0.]
 [ 4.  0.  4.]]
