In [2]:
import numpy as np

# Notes

## Association Mining

$support=\frac{X}{\lvert T \rvert}$

Support for an $item > 40\%$ means $0.4 \times \lvert T \rvert$ must be satisfied.

$confidence=\frac{support(X\bigcup Y)}{support(X)}$

## Naïve Bayes Classifier

$p\left(y \mid x_{1}, x_{2}, \ldots, x_{M}\right)=\frac{p\left(x_{1}, x_{2}, \ldots, x_{M} \mid y\right) p(y)}{\sum_{k=0}^{K-1} p\left(x_{1}, x_{2}, \ldots, x_{M} \mid y=k\right) p(y=k)}$

## AUC

$TPR / Recall / Sensitivity =\frac{\text { TP }}{\text { TP }+\text { FN }}$

$Specificity=\frac{\text { TN }}{\text { TN + FP }}$

FPR = $1 - Specificity=\frac{F P}{T N+F P}$


## Similarity

In [19]:
r = np.array([[1,1,1,1,0,1]])
s = np.array([[1,1,0,1,0,0]])

def jaccard(x,y):
  x = np.asarray(x, np.bool) # Not necessary, if you keep your data
  y = np.asarray(y, np.bool) # in a boolean array already!
  return np.double(np.bitwise_and(x, y).sum()) / x.size - np.double(np.bitwise_or(x, y).sum())

def smc(x,y):
  return np.double(np.bitwise_and(x,y).sum() + np.bitwise_not(x,y).sum()) / x.size


from sklearn.metrics.pairwise import cosine_similarity

print(f"SMC: {smc(r,s)}")
print(f"Jaccard: {jaccard(r,s)}")
print(f"CosineSimilarity: {cosine_similarity(r,s)}")

SMC: -1.3333333333333333
Jaccard: -5.166666666666667
CosineSimilarity: [[-0.97590007]]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  x = np.asarray(x, np.bool) # Not necessary, if you keep your data
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = np.asarray(y, np.bool) # in a boolean array already!



## k-means

In [None]:
from sklearn.cluster import KMeans
x = np.array([[1.0, 1.2, 1.8, 2.3, 2.6, 3.4, 4.0, 4.1, 4.2]]).T

kmeans = KMeans(n_clusters=3, init=np.array([[1.8,3.3,3.6]]).T).fit(x)

kmeans.cluster_centers_

## PCA

In [None]:
# variance explained
def variance_explained(S, S_sub):
    var_expl_pca = (S_sub**2).sum() / sum(S**2)
    print("$Var_{Expl} Range$: ", var_expl_pca)

S = np.array([14.14, 11.41, 9.46, 4.19, 0.17]) # s_i

# modify range
S_sub = S[:3]
print(f"Selected values: {S_sub}")
variance_explained(S, S_sub)

## Adaboost

error rate
$\epsilon_{t}=\sum_{i=1}^{N} w_{i}(t)\left(1-\delta_{f_{t}\left(\boldsymbol{x}_{i}\right), y_{i}}\right)$ where $\delta_{a, b}=\left\{\begin{array}{ll}1 & \text { if } a=b \\ 0 & \text { if } a \neq b\end{array}\right.$

importance of classifier
$\alpha_{t}=\frac{1}{2} \log \frac{1-\epsilon_{t}}{\epsilon_{t}}$

## Decision Boundaries
- Linear boundaries: Multinomial Regression
- Smooth curvy boundaries: ANN
- Axis aligned boundaries: Classification Tree
- Complex non-smooth boundaries: 3-nearest neighbours

# Scratchbook

In [25]:
c1 = np.array([-2.1, -1.7, -1.5])
c2 = np.array([-0.4, 0.0, 0.6])
c3 = np.array([0.8, 1.0, 1.1])

np.append(np.append(c2, c3), -1.5).mean()

0.2285714285714286

In [24]:
(1*0.5+0.5*1)*0.25, (1*0.5-0.5*1)*-0.25, (1*-0.5+0.25)*0.25

(0.25, -0.0, -0.0625)