In [134]:
import numpy as np
import numba
from sklearn.cluster import KMeans
from sklearnex import patch_sklearn; patch_sklearn()
import scipy
import pandas as pd
from concurrent.futures import ProcessPoolExecutor as Executor
from plotly import graph_objects as go, subplots as sp

# Shared plotly styling: translucent black grid lines on a white plot area.
axis_col = 'rgba(0, 0, 0, 0.15)'   # regular grid lines
zero_col = 'rgba(0, 0, 0, 0.3)'    # zero line (drawn darker than the grid)
no_col = 'rgba(0, 0, 0, 0)'        # fully transparent: hides the axis line itself

# Two separate (but equal) dicts so that x and y styling can diverge later.
xaxis_desc: dict = {
    "linecolor": no_col,
    "gridcolor": axis_col,
    "zerolinecolor": zero_col,
    "zerolinewidth": 2,
}
yaxis_desc: dict = {
    "linecolor": no_col,
    "gridcolor": axis_col,
    "zerolinecolor": zero_col,
    "zerolinewidth": 2,
}

# Base figure layout; cells copy it and override width/height where needed.
layout = {
    "autosize": True,
    "width": 1400,
    "height": 400,
    "margin": {"l": 60, "r": 25, "b": 60, "t": 60, "pad": 5},
    # paper_bgcolor="white",
    "font_family": "Times New Roman",
    "font_color": "black",
    "font_size": 20,
    "plot_bgcolor": 'white',
    "xaxis": {**xaxis_desc},   # a copy of the x-axis style
    "yaxis": yaxis_desc,       # the y-axis style object itself (shared reference)
}

from local.caching import load, save

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [135]:
# Prior hyper-parameters shared by the EM and Gibbs cells below.
ALPHAS = [2, 2, 2]  # Dirichlet parameters for the mixing weights, one entry per component
NU = 2  # Gamma shape for each Poisson rate (appears as `NU + Z_sums[k]` in the Gibbs cell)
BETA = 2  # appears as `BETA + Z_counts[k]` in the Gibbs cell, i.e. it acts as the Gamma rate
NUM_CLUSTERS = len(ALPHAS)

_RAND_STATE = 0  # seed for NumPy's global RNG and for the KMeans initialisations

np.random.seed(_RAND_STATE)
# NOTE(review): read with the default comma separator despite the .tsv name —
# presumably the file holds a single column of counts; confirm.
xs = pd.read_csv('pmm_q1.tsv', header=None).to_numpy()
alphas = np.array(ALPHAS)
alpha_sum = alphas.sum()

In [136]:
# MAP-EM for the Poisson mixture on the Q1 data.
# Priors: pi ~ Dirichlet(alphas), lambda_k ~ Gamma(shape=NU, rate=BETA).
# Results are cached under `em_<iterations>`; set `retry = True` to recompute.
previous = False
retry=False
# retry = True
iterations = 50

save_file = f"em_{iterations}"
if not retry:
    try:
        pi, current_lambdas, log_lls = load(save_file)
        previous = True
    except FileNotFoundError:
        # No cache yet: fall through and run EM from scratch.
        pass

if not previous or retry:
    xvec = xs[:, 0]
    K, N = NUM_CLUSTERS, xs.shape[0]

    # kmeans initialization: cluster means seed the rates, cluster sizes seed the weights
    kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=_RAND_STATE)
    kmeans.fit(xs)
    current_lambdas = np.zeros(shape=(NUM_CLUSTERS, ))
    pi = np.zeros(shape=(NUM_CLUSTERS, ))
    for k in range(NUM_CLUSTERS):
        group = xs[kmeans.labels_ == k]
        current_lambdas[k] = group.mean()
        pi[k] = group.shape[0]/xs.shape[0]

    log_lls = []
    for it in range(iterations):
        print(f"iteration: {it+1}", end='\r' if it<iterations-1 else '\n')

        # e-step: ll_last_component[i, k] = pi_k * Poisson(x_i | lambda_k), computed for
        # all points and components at once instead of one scipy call per point.
        ll_last_component = pi * scipy.stats.poisson.pmf(xvec[:, None], current_lambdas[None, :])
        assignments = ll_last_component / ll_last_component.sum(axis=1, keepdims=True)

        # Log of the (unnormalised) posterior that the M-step below maximises.
        # The Gamma prior must use rate BETA (scale = 1/BETA) to agree with the
        # M-step; the default scale of 1 would track a different objective.
        # logpdf is used instead of log(pdf) so small densities do not underflow.
        log_prior_pi = scipy.stats.dirichlet.logpdf(pi, alphas)
        log_prior_lambda = scipy.stats.gamma.logpdf(current_lambdas, NU, scale=1.0/BETA).sum()
        log_ll = log_prior_pi + log_prior_lambda + np.log(ll_last_component.sum(axis=1)).sum()
        log_lls.append(log_ll)

        # m-step: closed-form MAP updates (the components are independent given
        # the responsibilities, so all K are updated together).
        assignments_sum = assignments.sum(axis=0)
        pi = (alphas - 1 + assignments_sum) / (alpha_sum - K + N)
        current_lambdas = ((assignments * xvec[:, None]).sum(axis=0) + NU - 1) / (assignments_sum + BETA)

    save(save_file, (pi, current_lambdas, log_lls))

print(f"π: {pi}")
print(f"λ: {current_lambdas.T}")

recovering & decompressing cached data from [{WORKSPACE}/hw/04/cache/em_50.pkl.gz]
π: [0.48646191 0.26333532 0.25020277]
λ: [ 0.73604373  5.6192733  12.48771063]


In [137]:
max_x = xs.max()
min_x = 0

# Convergence curve: one marker per EM iteration of the recorded objective.
# `cutt` truncates the curve; None keeps every iteration.
cutt = None
shown_lls = log_lls[:cutt]

convergence_trace = go.Scatter(
    x=list(range(len(shown_lls))),
    y=list(shown_lls),
    mode="lines+markers",
    marker=dict(size=3),
    line=dict(width=1),
    showlegend=False,
)

fig = go.Figure()
fig.add_trace(convergence_trace)

fig.update_annotations(font_size=24)
fig.update_layout(go.Layout(layout))

fig.show()
# from IPython.display import Image
# Image(filename='q5_FCGR3A.png')

In [172]:
# Histogram of the Q1 counts with the pmf of each fitted Poisson component on top.
# The data is loaded once, locally, and used for BOTH the histogram and the x-range:
# the global `xs` is rebound to the pmm_q3 data by a later cell, so taking the range
# from `xs` would silently use the wrong dataset when cells are run out of order.
q1_counts = pd.read_csv('pmm_q1.tsv', header=None).to_numpy()[:, 0]
max_x = q1_counts.max()
min_x = 0

fig = go.Figure()

fig.add_trace(
    go.Histogram(
        x = q1_counts,
        histnorm='probability',
        nbinsx=40,
        showlegend=False,
    )
)

# Integer support shared by all component curves; one vectorised pmf call per component.
support = np.arange(min_x, max_x+1, 1)
for lmd in current_lambdas:
    fig.add_trace(
        go.Scatter(
            x = support,
            y = scipy.stats.poisson.pmf(support, lmd),
            mode="lines+markers",
            showlegend=False,
        ),
    )

fig.update_annotations(font_size=24)
fig.update_layout(go.Layout(**layout, 
    bargap=0.2,
))

fig.show()
# from IPython.display import Image
# Image(filename='q5_FCGR3A.png')

In [17]:
# Gibbs sampler for the Poisson mixture on the Q1 data.
# Chains are cached under `gibbs_<iterations>`; set `retry = True` to resample.
# `previous` is reset here so the decision to sample never depends on whether an
# earlier cell happened to load ITS cache (that left `pis`/`lambdas` undefined).
# NOTE(review): the recorded `pis.shape` output (3001, 3) and `burn_in = 1000` in the
# trace plots suggest this was last run with iterations = 3000 — confirm the value.
previous = False
retry=False
# retry = True
iterations = 50

save_file = f"gibbs_{iterations}"
if not retry:
    try:
        pis, lambdas = load(save_file)
        previous = True
    except FileNotFoundError:
        # No cache yet: fall through and run the sampler.
        pass

if not previous or retry:
    xvec = xs[:, 0]
    # Row 0 holds the initial state, row it+1 the draw made in sweep `it`.
    lambdas = np.zeros(shape=(iterations+1, NUM_CLUSTERS))
    pis = np.zeros(shape=(iterations+1, NUM_CLUSTERS))

    # kmeans initialization
    kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=_RAND_STATE, n_init="auto")
    kmeans.fit(xs)
    for k in range(NUM_CLUSTERS):
        group = xs[kmeans.labels_ == k]
        lambdas[0, k] = group.mean()
        pis[0, k] = group.shape[0]/xs.shape[0]

    for it in range(iterations):
        print(f"iteration: {it+1}", end='\r')

        # Conditional p(z_i = k | x_i, pi, lambda) for every point at once, normalised
        # in log space so that tiny pmf values cannot turn a row into 0/0.
        with np.errstate(divide='ignore'):
            log_post = np.log(pis[it]) + scipy.stats.poisson.logpmf(xvec[:, None], lambdas[it][None, :])
        log_post -= log_post.max(axis=1, keepdims=True)
        Z_i = np.exp(log_post)
        Z_i /= Z_i.sum(axis=1, keepdims=True)

        # One categorical draw per point via the inverse CDF (uses NumPy's global RNG).
        u = np.random.random(size=xvec.shape[0])
        z = (Z_i.cumsum(axis=1) <= u[:, None]).sum(axis=1)
        z = np.minimum(z, NUM_CLUSTERS - 1)  # guard against cumsum round-off just below 1

        # Sufficient statistics of THIS sweep only. They are rebuilt every iteration;
        # carrying them over would condition on all earlier assignments as well.
        Z_counts = np.bincount(z, minlength=NUM_CLUSTERS)
        Z_sums = np.bincount(z, weights=xvec, minlength=NUM_CLUSTERS)

        # Conjugate draws: lambda_k ~ Gamma(NU + sum_k, rate = BETA + n_k),
        # pi ~ Dirichlet(alphas + n).
        lambdas[it+1] = np.random.gamma(shape=NU + Z_sums, scale=1.0/(BETA + Z_counts))
        pis[it+1] = np.random.dirichlet(alphas + Z_counts)

    save(save_file, (pis, lambdas))

In [18]:
# Shape check of the sampled weights; when not loaded from cache the array is
# allocated as (iterations + 1, NUM_CLUSTERS) in the sampler cell.
# NOTE(review): the recorded output (3001, 3) does not match iterations = 50 in that
# cell — presumably left over from a longer run; confirm.
pis.shape

(3001, 3)

In [61]:
# Trace plots of the Gibbs chains on a 6x2 grid:
# rows 1-3 show the full chains, rows 4-6 only the samples after `burn_in`;
# the left column holds the rates, the right column the mixing weights.
fig = sp.make_subplots(
    rows=6,
    cols=2,
    shared_xaxes=True,
    shared_yaxes=False,
    horizontal_spacing=0.05,
    vertical_spacing=0.02,
)
burn_in = 1000


def _trace(series, label, skip=-1):
    """Line trace of the samples with index > `skip`, plotted against their original index."""
    kept = series[skip + 1:]
    first = skip + 1
    return go.Scatter(
        x=list(range(first, first + len(kept))),
        y=list(kept),
        mode="lines",
        name=label,
    )


for k in range(NUM_CLUSTERS):
    # (samples, legend label, last index to drop, subplot row, subplot column)
    panels = [
        (lambdas[:, k], f"all λ{k+1}", -1, k+1, 1),
        (pis[:, k], f"all π{k+1}", -1, k+1, 2),
        (lambdas[:, k], f"λ{k+1}", burn_in, 3+k+1, 1),
        (pis[:, k], f"π{k+1}", burn_in, 3+k+1, 2),
    ]
    for series, label, skip, row, col in panels:
        fig.add_trace(_trace(series, label, skip), row=row, col=col)

fig.update_annotations(font_size=24)
_layout = layout.copy()
_layout["height"] = 600
# Apply the shared axis style to each of the 12 subplot axes.
for axis_no in range(1, 13):
    _layout[f"xaxis{axis_no}"] = xaxis_desc
    _layout[f"yaxis{axis_no}"] = yaxis_desc
fig.update_layout(go.Layout(_layout))
fig.show()
# from IPython.display import Image
# Image(filename='q5_FCGR3A.png')

In [68]:
# 3x2 subplot grid for the running means plotted further down in this cell.
fig = sp.make_subplots(
    rows=3,
    cols=2,
    shared_xaxes=True,
    shared_yaxes=False,
    horizontal_spacing=0.05,
    vertical_spacing=0.02,
)
def running_average(vec:np.ndarray):
    """Return the running mean of a 1-D sequence.

    Element ``i`` of the result is the mean of ``vec[0], ..., vec[i]``.

    Args:
        vec: 1-D array (or array-like) of numbers, e.g. one parameter's chain.

    Returns:
        A float array of the same length as ``vec``; empty input gives an empty array.
    """
    values = np.asarray(vec, dtype=float)
    # A single cumulative sum replaces re-summing every prefix (O(n) instead of O(n^2)).
    return np.cumsum(values, axis=0) / np.arange(1, len(values) + 1)

# One row per component: running mean of the rate on the left, of the weight on the right.
for k in range(NUM_CLUSTERS):
    columns = [
        (lambdas[:, k], f"rav λ{k+1}"),
        (pis[:, k], f"rav π{k+1}"),
    ]
    for col, (chain, label) in enumerate(columns, start=1):
        rav = running_average(chain)
        fig.add_trace(
            go.Scatter(
                x=list(range(len(rav))),
                y=list(rav),
                mode="lines",
                name=label,
            ),
            row=k+1, col=col,
        )

fig.update_annotations(font_size=24)
_layout = layout.copy()
_layout["height"] = 600
# Apply the shared axis style to axes 1..12.
for axis_no in range(1, 13):
    _layout[f"xaxis{axis_no}"] = xaxis_desc
    _layout[f"yaxis{axis_no}"] = yaxis_desc
fig.update_layout(go.Layout(_layout))
fig.show()
# from IPython.display import Image
# Image(filename='q5_FCGR3A.png')

In [85]:
# Scratch check: np.arange with a fractional step; the recorded output stops at 1.5,
# i.e. the stop value itself is not included.
np.arange(0, 2, 0.5)

array([0. , 0.5, 1. , 1.5])

In [104]:
# Scratch check: Poisson pmf evaluated at a non-integer point; the recorded output is 0.0.
scipy.stats.poisson.pmf(0.2, 0.5)

0.0

In [127]:
# Log-likelihood surface of a two-component Poisson mixture on the Q3 data,
# evaluated on a square grid of (lambda_1, lambda_2) with fixed weights `pi`.
# NOTE: this cell rebinds `pi`, `xs` and `NUM_CLUSTERS` used by the Q1 cells above.
pi = np.array((0.5, 0.5))
given_lambdas = np.array((0.6, 1))
xs = pd.read_csv('./pmm_q3.tsv', header=None).to_numpy()
NUM_CLUSTERS = len(pi)

min_v = 0.5
max_v = 6
step = 0.25
points = np.arange(min_v, max_v+step, step)

# log_pmf[g, n] = log Poisson(x_n | points[g]); computed once and reused for both
# components instead of calling scipy for every (grid cell, component, point).
log_pmf = scipy.stats.poisson.logpmf(xs[:, 0][None, :], points[:, None])
log_w = np.log(pi)

# plotly reads z[row][col] as the value at (y[row], x[col]), so lambda_1 (x-axis)
# must index the COLUMNS and lambda_2 (y-axis) the ROWS. Filling z[lambda_1, lambda_2]
# would transpose the surface, which only goes unnoticed while pi is symmetric.
z = np.empty(shape=(len(points), len(points)))
for i in range(len(points)):
    # Column i: lambda_1 = points[i], all lambda_2 values at once.
    # logaddexp gives log(pi_1 p_1 + pi_2 p_2) without underflow.
    z[:, i] = np.logaddexp(log_w[0] + log_pmf[i], log_w[1] + log_pmf).sum(axis=1)

fig = go.Figure()
# The surface is added exactly once (it does not depend on `given_lambdas`).
fig.add_trace(
    go.Contour(
        z = z,
        showlegend=False,
        x=points,
        y=points,
        colorscale='Hot',
        contours=dict(
            coloring ='heatmap',
            showlabels = True,
            labelfont = dict( 
                size = 14,
                color = 'blue',
            )
        ),
        showscale=False,
    ),
)

fig.update_annotations(font_size=24)
_layout = layout.copy()
_layout["width"] = 600
_layout["height"] = 600
fig.update_layout(go.Layout(_layout))

fig.show()
# from IPython.display import Image
# Image(filename='q5_FCGR3A.png')

In [128]:
# Maximum-likelihood EM (no priors) for the two-component mixture on the Q3 data,
# started from `given_lambdas`; every iterate is kept in `lambdas` for the path plot.
previous = False  # not consulted in this cell (nothing is cached here)
retry=False
# retry = True
iterations = 500

#initialization
xvec = xs[:, 0]
K, N = NUM_CLUSTERS, xs.shape[0]
# Work on a copy so the starting weights in `pi` stay intact and re-running this
# cell restarts from the same point; the final estimate is exposed as `pi_hat`.
weights = pi.copy()
lambdas = np.zeros(shape=(iterations+1, NUM_CLUSTERS)); lambdas[0] = given_lambdas.copy()
log_lls = []
for it in range(iterations):
    # Show the rates of THIS run (not `current_lambdas`, which belongs to the Q1 model).
    print(f"iteration: {it+1} | {lambdas[it]}"+" "*25, end='\r' if it<iterations-1 else '\n')

    # e-step: ll_last_component[i, k] = weight_k * Poisson(x_i | lambda_k), vectorised
    ll_last_component = weights * scipy.stats.poisson.pmf(xvec[:, None], lambdas[it][None, :])
    point_likelihood = ll_last_component.sum(axis=1)
    assignments = ll_last_component / point_likelihood[:, None]
    # Data log-likelihood under the parameters entering this iteration.
    log_lls.append(np.log(point_likelihood).sum())

    # m-step: closed-form ML updates for all components at once
    assignments_sum = assignments.sum(axis=0)
    weights = assignments_sum / N
    lambdas[it+1] = (assignments * xvec[:, None]).sum(axis=0) / assignments_sum

pi_hat = weights

iteration: 500 | [1.23139037 3.88884367]                         


In [133]:
# Likelihood surface with the EM trajectory drawn on top and the final iterate marked.
likelihood_surface = go.Contour(
    x=points,
    y=points,
    z=z,
    colorscale='Hot',
    contours=dict(
        coloring='heatmap',
        showlabels=True,
        labelfont=dict(size=14, color='blue'),
    ),
    showlegend=False,
    showscale=False,
)

# Path of the iterates: first rate on the x-axis, second rate on the y-axis.
em_path = go.Scatter(
    x=list(lambdas[:, 0]),
    y=list(lambdas[:, 1]),
    mode="lines",
)

fig = go.Figure()
fig.add_trace(likelihood_surface)
fig.add_trace(em_path)

# Red disc of radius r (in data units) centred on the last iterate.
r = 0.05
last_pt = lambdas[-1]
end_x, end_y = last_pt[0], last_pt[1]
fig.add_shape(
    type="circle",
    xref="x",
    yref="y",
    x0=end_x-r,
    y0=end_y-r,
    x1=end_x+r,
    y1=end_y+r,
    fillcolor="red",
    line_color="red",
)

fig.update_annotations(font_size=24)
_layout = layout.copy()
_layout["width"] = 600
_layout["height"] = 600
fig.update_layout(go.Layout(_layout))

fig.show()
# from IPython.display import Image
# Image(filename='q5_FCGR3A.png')