In [7]:
import numpy as np

# Notes

## Association Mining

$support=\frac{X}{\lvert T \rvert}$

Support for an $item > 40\%$ means $0.4 \times \lvert T \rvert$ must be satisfied.

$confidence=\frac{support(X\bigcup Y)}{support(X)}$

## Naïve Bayes Classifier

$p\left(y \mid x_{1}, x_{2}, \ldots, x_{M}\right)=\frac{p\left(x_{1}, x_{2}, \ldots, x_{M} \mid y\right) p(y)}{\sum_{k=0}^{K-1} p\left(x_{1}, x_{2}, \ldots, x_{M} \mid y=k\right) p(y=k)}$

## AUC

$TPR / Recall / Sensitivity =\frac{\text { TP }}{\text { TP }+\text { FN }}$

$Specificity=\frac{\text { TN }}{\text { TN + FP }}$

FPR = $1 - Specificity=\frac{F P}{T N+F P}$

Total number of positive examples: $TP + FN$

Total number of negative examples: $FP + TN$

In [75]:
# compute AUC
from sklearn import metrics
y = np.array([0, 0, 0, 0, 1, 1, 1, 0, 0, 1])
pred = np.array([5.7, 6.0, 6.2, 6.3, 6.4, 6.6, 6.7, 6.9, 7.0, 7.4])
fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.7500000000000001

## Similarity

In [19]:
r = np.array([[1,1,1,1,0,1]])
s = np.array([[1,1,0,1,0,0]])

def jaccard(x,y):
  x = np.asarray(x, np.bool) # Not necessary, if you keep your data
  y = np.asarray(y, np.bool) # in a boolean array already!
  return np.double(np.bitwise_and(x, y).sum()) / x.size - np.double(np.bitwise_or(x, y).sum())

def smc(x,y):
  return np.double(np.bitwise_and(x,y).sum() + np.bitwise_not(x,y).sum()) / x.size


from sklearn.metrics.pairwise import cosine_similarity

print(f"SMC: {smc(r,s)}")
print(f"Jaccard: {jaccard(r,s)}")
print(f"CosineSimilarity: {cosine_similarity(r,s)}")

SMC: -1.3333333333333333
Jaccard: -5.166666666666667
CosineSimilarity: [[-0.97590007]]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  x = np.asarray(x, np.bool) # Not necessary, if you keep your data
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = np.asarray(y, np.bool) # in a boolean array already!



## k-means

In [47]:
from sklearn.cluster import KMeans
x = np.array([[5.7, 6.0, 6.2, 6.3, 6.4, 6.6, 6.7, 6.9, 7.0, 7.4]]).T

kmeans = KMeans(n_clusters=2).fit(x)
#kmeans = KMeans(n_clusters=2, init=np.array([[x.mean]]).T).fit(x)

kmeans.cluster_centers_, kmeans.labels_

(array([[6.92],
        [6.12]]),
 array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0], dtype=int32))

## KNN

In [10]:
# find k-nearest neighbours
from sklearn.neighbors import NearestNeighbors
samples = np.array([[5.7, 6.0, 6.2, 6.3, 6.4, 6.6, 6.7, 6.9, 7.0, 7.4]]).T
neigh = NearestNeighbors(n_neighbors=3)
neigh.fit(samples)
neigh.kneighbors([[7.4]])

# compute densities and average relative densities (modify K and arrays)
K = 3
(d1, d2, d3) = (1/K * np.array([0.1,0.3,0.4])).sum()**-1, (1/K*np.array([0.1,0.2,0.3])).sum()**-1, (1/K*np.array([0.1,0.2,0.3])).sum()**-1
di = (1/K * np.array([0.4, 0.5, 0.7])).sum()**-1
ard = di/(1/K*np.array([d1,d2,d3])).sum()
ard

<bound method BaseEstimator.get_params of NearestNeighbors(n_neighbors=3)>

## PCA

In [None]:
# variance explained
def variance_explained(S, S_sub):
    var_expl_pca = (S_sub**2).sum() / sum(S**2)
    print("$Var_{Expl} Range$: ", var_expl_pca)

S = np.array([14.14, 11.41, 9.46, 4.19, 0.17]) # s_i

# modify range
S_sub = S[:3]
print(f"Selected values: {S_sub}")
variance_explained(S, S_sub)

## Decision Trees

**Impurity Gain**
$\Delta=I(r)-\sum_{k=1}^{K} \frac{N\left(v_{k}\right)}{N(r)} I\left(v_{k}\right)$

Impurity Gain Example
N = 54

height: {low: 18, medium: 18, high: 18}
high performance (height low): {low: 6, medium. 9, high: 3}
high performance (height medium): {low: 4, medium: 6, high: 10}
high performance (height high): {low: 8, medium: 3, high: 5}




In [32]:
I_r = 54
I_1 = np.array([6/18, 9/18, 3/18])**2
I_1 = np.array([4/18, 6/18, 10/18])**2
I_1 = np.array([8/18, 3/18, 5/18])**2


AttributeError: 'float' object has no attribute 'sum'

In [66]:

(1/2 * np.array([68.1,111.1])).sum()**-1

0.011160714285714286

(0.4090909090909091, 1.875)

## Adaboost

error rate
$\epsilon_{t}=\sum_{i=1}^{N} w_{i}(t)\left(1-\delta_{f_{t}\left(\boldsymbol{x}_{i}\right), y_{i}}\right)$ where $\delta_{a, b}=\left\{\begin{array}{ll}1 & \text { if } a=b \\ 0 & \text { if } a \neq b\end{array}\right.$

importance of classifier
$\alpha_{t}=\frac{1}{2} \log \frac{1-\epsilon_{t}}{\epsilon_{t}}$

## Decision Boundaries
- Linear boundaries: Multinomial Regression
- Smooth curvy boundaries: ANN
- Axis aligned boundaries: Classification Tree
- Complex non-smooth boundaries: 3-nearest neighbours

# Scratchbook

In [25]:
c1 = np.array([-2.1, -1.7, -1.5])
c2 = np.array([-0.4, 0.0, 0.6])
c3 = np.array([0.8, 1.0, 1.1])

np.append(np.append(c2, c3), -1.5).mean()

0.2285714285714286

In [24]:
(1*0.5+0.5*1)*0.25, (1*0.5-0.5*1)*-0.25, (1*-0.5+0.25)*0.25

(0.25, -0.0, -0.0625)

In [18]:
forward
4

backward
1,2,3,4
1,3,4


211.70000000000002